Initial Version of HEVC decoder

Compliant with reference software HM11.0 onwards

Bug: 14571712
Change-Id: I8af25c1221cc6ab70440141c4d9b48c1ac69696a
diff --git a/Android.mk b/Android.mk
new file mode 100644
index 0000000..4668c52
--- /dev/null
+++ b/Android.mk
@@ -0,0 +1,6 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+# decoder
+include $(LOCAL_PATH)/decoder.mk
+
diff --git a/MODULE_LICENSE_APACHE2 b/MODULE_LICENSE_APACHE2
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_APACHE2
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..e960962
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,14 @@
+Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at:
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
diff --git a/common/arm/ihevc_deblk_chroma_horz.s b/common/arm/ihevc_deblk_chroma_horz.s
new file mode 100644
index 0000000..34422ff
--- /dev/null
+++ b/common/arm/ihevc_deblk_chroma_horz.s
@@ -0,0 +1,148 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/*******************************************************************************
+@* @file
+@*  ihevc_deblk_chroma_horz.s
+@*
+@* @brief
+@*  contains function definitions for deblocking of chroma horizontal
+@* edges. functions are coded in neon assembly and can be compiled
+@* using rvct
+@*
+@* @author
+@*  anand s
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************/
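+
+@ rough c reference for the filter implemented below (a sketch of the hevc
+@ chroma deblocking equations; p0/p1 and q0/q1 are the samples on either
+@ side of the edge, spec-style names rather than this file's registers):
+@     delta = clip3(-tc, tc, ((((q0 - p0) << 2) + p1 - q1 + 4) >> 3));
+@     if(filter_flag_p) p0 = clip_u8(p0 + delta);
+@     if(filter_flag_q) q0 = clip_u8(q0 - delta);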
+
+.text
+.align 4
+
+
+
+
+.extern gai4_ihevc_qp_table
+.extern gai4_ihevc_tc_table
+.globl ihevc_deblk_chroma_horz_a9q
+
+gai4_ihevc_qp_table_addr:
+.long gai4_ihevc_qp_table - ulbl1 - 8
+
+gai4_ihevc_tc_table_addr:
+.long gai4_ihevc_tc_table - ulbl2 - 8
+
+.type ihevc_deblk_chroma_horz_a9q, %function
+
+ihevc_deblk_chroma_horz_a9q:
+    push        {r4-r12,lr}
+    sub         r12,r0,r1
+    vld1.8      {d0},[r0]
+    sub         r5,r12,r1
+    add         r6,r0,r1
+    add         r1,r2,r3
+    vmovl.u8    q0,d0
+    ldr         r10,[sp,#0x28]
+    vld1.8      {d2},[r12]
+    add         r2,r1,#1
+    ldr         r4,[sp,#0x30]
+    vld1.8      {d4},[r5]
+    ldr         r8,[sp,#0x34]
+    vld1.8      {d16},[r6]
+    ldr         r9,[sp,#0x38]
+    adds        r1,r10,r2,asr #1
+    vmovl.u8    q1,d2
+    ldr         r7,[sp,#0x2c]
+    ldr         r3,gai4_ihevc_qp_table_addr
+ulbl1:
+    add         r3, r3, pc
+    bmi         l1.3312
+    cmp         r1,#0x39
+    ldrle       r1,[r3,r1,lsl #2]
+    subgt       r1,r1,#6
+l1.3312:
+    adds        r2,r7,r2,asr #1
+    vmovl.u8    q2,d4
+    bmi         l1.3332
+    cmp         r2,#0x39
+    ldrle       r2,[r3,r2,lsl #2]
+    subgt       r2,r2,#6
+l1.3332:
+    add         r1,r1,r4,lsl #1
+    vsub.i16    q3,q0,q1
+    add         r3,r1,#2
+    cmp         r3,#0x35
+    movgt       r1,#0x35
+    vshl.i16    q3,q3,#2
+    vmovl.u8    q8,d16
+    bgt         l1.3368
+    adds        r3,r1,#2
+    addpl       r1,r1,#2
+    movmi       r1,#0
+l1.3368:
+    ldr         r3,gai4_ihevc_tc_table_addr
+ulbl2:
+    add         r3, r3, pc
+    vadd.i16    q2,q3,q2
+    add         r2,r2,r4,lsl #1
+    vsub.i16    q3,q2,q8
+    add         r4,r2,#2
+    ldr         r1,[r3,r1,lsl #2]
+    cmp         r4,#0x35
+    movgt       r2,#0x35
+    bgt         l1.3412
+    adds        r4,r2,#2
+    addpl       r2,r2,#2
+    movmi       r2,#0
+l1.3412:
+
+
+    ldr         r2,[r3,r2,lsl #2]
+    cmp         r8,#0
+    vdup.16     q8,r2
+    vdup.16     q2,r1
+    rsb         r1,r1,#0
+    vrshr.s16   q3,q3,#3
+    vdup.16     q9,r1
+    rsb         r1,r2,#0
+    vzip.16     q2,q8
+    vdup.16     q10,r1
+
+    vzip.16     q9,q10
+
+    vmin.s16    q8,q3,q2
+    vmax.s16    q2,q9,q8
+    vadd.i16    q1,q1,q2
+    vsub.i16    q0,q0,q2
+    vqmovun.s16 d2,q1
+    vqmovun.s16 d0,q0
+    beq         l1.3528
+    vst1.8      {d2},[r12]
+l1.3528:
+    cmp         r9,#0
+    beq         l1.3540
+    vst1.8      {d0},[r0]
+l1.3540:
+    pop         {r4-r12,pc}
+
+
diff --git a/common/arm/ihevc_deblk_chroma_vert.s b/common/arm/ihevc_deblk_chroma_vert.s
new file mode 100644
index 0000000..4cb305f
--- /dev/null
+++ b/common/arm/ihevc_deblk_chroma_vert.s
@@ -0,0 +1,163 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/*******************************************************************************
+@* @file
+@*  ihevc_deblk_chroma_vert.s
+@*
+@* @brief
+@*  contains function definitions for deblocking of chroma vertical
+@* edges. functions are coded in neon assembly and can be compiled
+@* using rvct
+@*
+@* @author
+@*  anand s
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************/
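+
+@ this applies the same chroma filter as the horizontal case, but across a
+@ vertical edge: four rows of 8 bytes (p1..q1 as interleaved cb/cr pairs)
+@ are loaded with vld1 and rearranged with vtrn so the filter operands line
+@ up in registers, and the filtered pixels are transposed back before the
+@ 16-bit partial stores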
+
+.text
+.align 4
+
+
+
+
+
+.extern gai4_ihevc_qp_table
+.extern gai4_ihevc_tc_table
+.globl ihevc_deblk_chroma_vert_a9q
+
+gai4_ihevc_qp_table_addr:
+.long gai4_ihevc_qp_table - ulbl1 - 8
+
+gai4_ihevc_tc_table_addr:
+.long gai4_ihevc_tc_table  - ulbl2 - 8
+
+.type ihevc_deblk_chroma_vert_a9q, %function
+
+ihevc_deblk_chroma_vert_a9q:
+    push        {r4-r12,lr}
+    sub         r8,r0,#4
+    add         r2,r2,r3
+    vld1.8      {d5},[r8],r1
+    add         r2,r2,#1
+    vld1.8      {d17},[r8],r1
+    ldr         r7,[sp,#0x28]
+    vld1.8      {d16},[r8],r1
+    ldr         r4,[sp,#0x38]
+    vld1.8      {d4},[r8]
+    ldr         r5,[sp,#0x30]
+    vtrn.8      d5,d17
+    adds        r3,r7,r2,asr #1
+    vtrn.8      d16,d4
+    ldr         r7,gai4_ihevc_qp_table_addr
+ulbl1:
+    add         r7,r7,pc
+    ldr         r12,[sp,#0x34]
+    ldr         r6,[sp,#0x2c]
+    bmi         l1.2944
+    cmp         r3,#0x39
+    ldrle       r3,[r7,r3,lsl #2]
+    subgt       r3,r3,#6
+l1.2944:
+    vtrn.16     d5,d16
+    adds        r2,r6,r2,asr #1
+    vtrn.16     d17,d4
+    bmi         l1.2964
+    cmp         r2,#0x39
+    ldrle       r2,[r7,r2,lsl #2]
+    subgt       r2,r2,#6
+l1.2964:
+    vtrn.32     d5,d17
+    add         r3,r3,r5,lsl #1
+    vtrn.32     d16,d4
+    add         r6,r3,#2
+    vmovl.u8    q9,d17
+    cmp         r6,#0x35
+    movgt       r3,#0x35
+    bgt         l1.2996
+    adds        r6,r3,#2
+    addpl       r3,r3,#2
+    movmi       r3,#0
+l1.2996:
+    vsubl.u8    q0,d17,d16
+    ldr         r6,gai4_ihevc_tc_table_addr
+ulbl2:
+    add         r6,r6,pc
+    vshl.i16    q0,q0,#2
+    add         r2,r2,r5,lsl #1
+    add         r5,r2,#2
+    vaddw.u8    q0,q0,d5
+    cmp         r5,#0x35
+    ldr         r3,[r6,r3,lsl #2]
+    vsubw.u8    q2,q0,d4
+    movgt       r2,#0x35
+    bgt         l1.3036
+    adds        r5,r2,#2
+    addpl       r2,r2,#2
+    movmi       r2,#0
+l1.3036:
+
+
+    vrshr.s16   q3,q2,#3
+    vdup.16     d2,r3
+    ldr         r2,[r6,r2,lsl #2]
+    rsb         r3,r3,#0
+    cmp         r12,#0
+    vdup.16     d3,r2
+    rsb         r2,r2,#0
+    vdup.16     d30,r3
+    vdup.16     d31,r2
+
+
+    vmin.s16    q2,q3,q1
+    vmax.s16    q1,q15,q2
+
+    vmovl.u8    q3,d16
+
+    vadd.i16    q0,q3,q1
+    vsub.i16    q1,q9,q1
+    vqmovun.s16 d0,q0
+    sub         r2,r0,#2
+    vqmovun.s16 d1,q1
+    vtrn.32     d0,d1
+    vtrn.8      d0,d1
+    beq         l1.3204
+
+    vst1.16     {d0[0]},[r2],r1
+    vst1.16     {d1[0]},[r2],r1
+    vst1.16     {d0[1]},[r2],r1
+    vst1.16     {d1[1]},[r2]
+l1.3204:
+    cmp         r4,#0
+    beq         l1.3228
+    vst1.16     {d0[2]},[r0],r1
+    vst1.16     {d1[2]},[r0],r1
+    vst1.16     {d0[3]},[r0],r1
+    vst1.16     {d1[3]},[r0]
+l1.3228:
+    pop         {r4-r12,pc}
+
+
+
diff --git a/common/arm/ihevc_deblk_luma_horz.s b/common/arm/ihevc_deblk_luma_horz.s
new file mode 100644
index 0000000..b12ceb9
--- /dev/null
+++ b/common/arm/ihevc_deblk_luma_horz.s
@@ -0,0 +1,543 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/*******************************************************************************
+@* @file
+@*  ihevc_deblk_luma_horz.s
+@*
+@* @brief
+@*  contains function definitions for deblocking of luma horizontal
+@* edges. functions are coded in neon assembly and can be compiled
+@* using rvct
+@*
+@* @author
+@*  anand s
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************/
+
+.text
+.align 4
+
+
+
+
+
+.extern gai4_ihevc_tc_table
+.extern gai4_ihevc_beta_table
+.globl ihevc_deblk_luma_horz_a9q
+
+gai4_ihevc_tc_table_addr:
+.long gai4_ihevc_tc_table  - ulbl1 - 8
+
+gai4_ihevc_beta_table_addr:
+.long gai4_ihevc_beta_table  - ulbl2 - 8
+
+.type ihevc_deblk_luma_horz_a9q, %function
+
+ihevc_deblk_luma_horz_a9q:
+    stmfd       sp!, {r3-r12,lr}
+    ldr         r4,[sp,#0x2c]
+    ldr         r5,[sp,#0x30]
+
+    add         r3,r3,r4
+    add         r3,r3,#1
+    ldr         r6, [sp,#0x34]
+    asr         r3,r3,#1
+    add         r7,r3,r5,lsl #1
+    add         r3,r3,r6,lsl #1
+    cmp         r7,#0x33
+    movgt       r7,#0x33
+    bgt         l1.1532
+    cmp         r7,#0x0
+    movlt       r7,#0x0                     @ r7 has the beta_index value
+l1.1532:
+    @     bic      r2,r2,#1
+    asr         r2,r2,#1
+
+    add         r3,r3,r2,lsl #1
+    cmp         r3,#0x35
+    movgt       r3,#0x35
+    bgt         l1.1564
+    cmp         r3,#0x0
+    movlt       r3,#0x0                     @ r3 has the tc_index value
+
+    @    qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
+    @    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
+    @    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
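+
+    @ in c terms the lookups that follow are (sketch):
+    @     beta = gai4_ihevc_beta_table[beta_indx];
+    @     tc   = gai4_ihevc_tc_table[tc_indx];
+    @ no filtering is done when tc is 0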
+
+l1.1564:
+    ldr         r2,gai4_ihevc_beta_table_addr
+ulbl2:
+    add         r2,r2,pc
+    ldr         r4,gai4_ihevc_tc_table_addr
+ulbl1:
+    add         r4,r4,pc
+
+    ldr         r5,[r2,r7,lsl #2]           @ beta
+    ldr         r6,[r4,r3,lsl #2]           @ tc
+
+
+
+    cmp         r6,#0
+    beq         l1.2404
+    vmov.i16    d0,#0x2
+    lsl         r7,r6,#1
+    add         r14,r1,r1,lsl #1
+    ldr         r8,[r0,-r14]                @ -3 value
+    vdup.8      d1,r7
+    ldr         r10,[r0,-r1,lsl #1]         @-2 value
+    vdup.32     d23,r8                      @ -3 value
+    ldr         r11,[r0,-r1]                @-1 value
+    vdup.32     d24,r10                     @ -2 value
+    and         r8,#0xff
+    ldr         r12,[r0,#0]                 @ 0 value
+    vdup.32     d25, r11                    @-1 value
+    and         r10,#0xff
+    ldr         r9,[r0,r1]                  @ 1 value
+    vdup.32     d26,r12                     @ 0 value
+    and         r11,#0xff
+    ldr         r2,[r0,r1,lsl #1]           @ 2 value
+    vdup.32     d27,r9                      @ 1value
+    and         r12,#0xff
+    vdup.32     d28,r2                      @ 2 value
+    and         r9,#0xff
+    and         r2,#0xff
+
+    add         r12,r12,r2
+    subs        r9,r12,r9,lsl #1            @ dq0 value is stored in r9
+    rsbmi       r9,r9,#0
+    @dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
+
+    add         r8,r8,r11
+    subs        r8,r8,r10,lsl #1
+    rsbmi       r8,r8,#0                    @ dp0 value is stored in r8
+    @  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
+
+
+
+    add         r3,r1,r1,lsl #1
+    add         r14,r0,#3
+
+
+    ldrb        r2,[r14,-r3]                @ -3 value
+    ldrb        r10,[r14,-r1,lsl #1]        @ -2 value
+    ldrb        r11,[r14,-r1]               @ -1 value
+    ldrb        r12,[r14,#0]                @ 0 value
+    ldrb        r3,[r14,r1]                 @ 1 value
+    ldrb        r4,[r14,r1,lsl #1]          @ 2 value
+
+
+    add         r12,r12,r4
+    subs        r12,r12,r3,lsl #1           @ dq3value is stored in r12
+    rsbmi       r12,r12,#0
+    @    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
+
+
+    add         r2,r2,r11
+    subs        r11,r2,r10,lsl #1
+    rsbmi       r11,r11,#0                  @ dp3 value is stored in r11
+    @    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )@
+
+
+
+    add         r3,r8,r9                    @ r3 has the d0 value
+    add         r4,r11,r12                  @ r4 has the d3 value
+
+
+    @    d0 = dp0 + dq0@
+    @    d3 = dp3 + dq3@
+
+    add         r14,r8,r11                  @ r14 has the value dp
+    add         r12,r12,r9                  @ r12 has the value  dq
+    @    dp = dp0 + dp3@
+    @   dq = dq0 + dq3@
+
+    add         r11, r3, r4                 @ r11 has the value d
+
+    @   d = d0 + d3@
+
+
+    cmp         r11,r5
+    bge         l1.2404
+
+    @    if(d < beta)
+
+
+    @ registers which cannot be altered : r3,r4,r5,r6,r12,r13,r0,r1,r11
+
+    @ registers for use: r2,r7,r8,r9,r10,
+
+    asr         r10,r5,#2
+    vqadd.u8    d30,d26,d1
+    cmp         r10,r3,lsl #1
+    vqsub.u8    d31,d26,d1
+    ble         l1.1840
+    add         r10,r1,r1,lsl #1
+    vaddl.u8    q3,d25,d26
+    ldr         r2,[r0,-r1,lsl #2]          @ has the -4 value
+    ldrb        r7,[r0,-r1]                 @ has the -1 value
+    vdup.32     d22,r2                      @ -4 value
+    vaddw.u8    q4,q3,d27
+    ldrb        r3,[r0,#0]                  @ r3 has the 0 value
+    vqadd.u8    d16,d27,d1
+    and         r2,#0xff
+    vmul.i16    q6,q4,d0[0]
+    ldr         r8,[r0,r10]                 @ has the 3 value
+    vaddl.u8    q5,d24,d28
+    subs        r2,r2,r7
+    vqsub.u8    d17,d27,d1
+    vdup.32     d29,r8                      @ 3 value
+    and         r8,#0xff
+    vadd.i16    q6,q6,q5
+    rsbmi       r2,r2,#0
+    vrshrn.i16  d20,q6,#3
+    subs        r8,r8,r3
+    rsbmi       r8,r8,#0
+    vmin.u8     d18,d20,d30
+    add         r8,r8,r2
+
+    cmp         r8,r5,asr #3
+    bge         l1.1840
+    vaddw.u8    q7,q4,d28
+    subs        r7,r3,r7
+    vmax.u8     d4,d18,d31
+    rsbmi       r7,r7,#0
+    vqadd.u8    d30,d28,d1
+    mov         r10,#5
+    vrshrn.i16  d21,q7,#2
+    mul         r10,r10,r6
+    vqsub.u8    d31,d28,d1
+    add         r10,#1
+    cmp         r7,r10,asr #1
+    vmin.u8     d18,d21,d16
+    bge         l1.1840
+
+
+    @        if( (2 * d3 < (beta >> 2)) && ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
+    @            && ( abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) ) )
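+    @
+    @ when these conditions hold the strong filter is chosen; the interleaved
+    @ neon arithmetic in this region builds the hevc strong filter outputs
+    @ (a sketch in spec-style names; each output is also clipped to
+    @ +/- 2 * tc via the vqadd/vqsub pairs):
+    @     q0' = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
+    @     q1' = (p0 + q0 + q1 + q2 + 2) >> 2
+    @     q2' = (p0 + q0 + q1 + 3*q2 + 2*q3 + 4) >> 3
+    @ and symmetrically on the p side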
+
+    vmax.u8     d5,d18,d17
+    asr         r10,r5,#2
+    vaddl.u8    q8,d29,d28
+    cmp         r10,r4,lsl #1
+    ble         l1.1840
+
+    add         r10,r1,r1,lsl #1
+    vmul.i16    q8,q8,d0[0]
+    add         r4,r0,#3
+
+
+    ldrb        r2,[r4,-r1,lsl #2]
+    vadd.i16    q8,q8,q7
+    ldrb        r7,[r4,-r1]
+    vrshrn.i16  d19,q8,#3
+    ldrb        r3,[r4,#0]
+    ldrb        r8,[r4,r10]
+    @   ubfx   r7,r2,#24,#8           @ has the -1 value
+    @  and    r2,#0xff               @ has the -4 value
+    @  ubfx   r8,r3,#24,#8           @ has the 3 value
+    @  and    r3,#0xff               @ r4 has the 0 value
+
+
+
+    subs        r8,r8,r3
+    vmin.u8     d18,d19,d30
+    rsbmi       r8,r8,#0
+    vaddl.u8    q3,d25,d24
+    subs        r2,r2,r7
+    vmax.u8     d3,d18,d31
+    rsbmi       r2,r2,#0
+    vaddw.u8    q4,q3,d26
+    add         r8,r8,r2
+    vqadd.u8    d30,d25,d1
+    cmp         r8,r5,asr #3
+    vqsub.u8    d31,d25,d1
+    bge         l1.1840
+    vmul.i16    q6,q4,d0[0]
+    subs        r7,r3,r7
+    vqadd.u8    d16,d24,d1
+    rsbmi       r7,r7,#0
+    vaddl.u8    q5,d23,d27
+    mov         r10,#5
+    vqsub.u8    d17,d24,d1
+    mul         r10,r10,r6
+    vadd.i16    q6,q6,q5
+    add         r10,#1
+    vrshrn.i16  d20,q6,#3
+    cmp         r7,r10,asr #1
+    vaddw.u8    q7,q4,d23
+    bge         l1.1840
+    vmin.u8     d18,d20,d30
+    mov         r2,#2
+    vqadd.u8    d30,d23,d1
+    ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
+    vmax.u8     d2,d18,d31
+    ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
+    vrshrn.i16  d21,q7,#2
+    b           end_dep_deq_decision_horz
+    @ r2 has the value of de
+    @ r6 has the value of tc
+    @ r5 has the value of beta
+    @ r14 has the value of dp
+    @ r12 has the value of dq
+    @ r0 has the value of source address
+    @ r1 has the src stride
+
+l1.1840:
+    mov         r2,#1
+
+    mov         r11,r5
+    ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
+    ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
+
+    cmp         r6,#1
+    moveq       r9,#0
+    moveq       r10,#0
+    beq         end_dep_deq_decision_horz
+
+    and         r7,r4,r5
+    cmp         r7,#1
+    beq         both_flags_set_horz
+    cmp         r4,#0
+    beq         set_flag_dep_zero_horz
+
+
+    add         r8,r11,r11,asr #1
+    mov         r10,#0
+    asr         r8,#3
+    cmp         r8,r14
+    movgt       r9,#1
+    movle       r9,#0
+    b           end_dep_deq_decision_horz
+set_flag_dep_zero_horz:
+
+    add         r8,r11,r11,asr #1
+    mov         r9,#0
+    asr         r8,#3
+    cmp         r8,r12
+    movgt       r10,#1
+    movle       r10,#0
+    b           end_dep_deq_decision_horz
+
+both_flags_set_horz:
+    add         r8,r11,r11,asr #1
+    asr         r8,#3
+    cmp         r8,r14
+    movgt       r9,#1
+    movle       r9,#0
+    cmp         r8,r12
+    movgt       r10,#1
+    movle       r10,#0
+end_dep_deq_decision_horz:
+
+    @r0=source address
+    @r1=stride
+    @ r2 =de
+    @ r4=flag p
+    @r5= flag q
+    @r6 =tc
+    @ r9 =dep
+    @ r10=deq
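+
+    @ in c terms the dep/deq decision above is (sketch; threshold name is
+    @ illustrative):
+    @     side_thresh = (beta + (beta >> 1)) >> 3;
+    @     dep = filter_flag_p ? (dp < side_thresh) : 0;
+    @     deq = filter_flag_q ? (dq < side_thresh) : 0;
+    @ the code also forces both to 0 when tc == 1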
+
+
+
+    @   add     r14,r1,r1,lsl #1
+    @   lsl     r7,r6,#1
+    @   vdup.8  d1,r7
+    @   vmov.i16  d0,#0x2
+    vmin.u8     d18,d21,d16
+    cmp         r2,#1
+    vqsub.u8    d31,d23,d1
+    beq         l1.2408
+    vaddl.u8    q4,d23,d22
+    cmp         r5,#1
+
+    bne         strong_filtering_p
+
+strong_filtering_q:
+    mov         r12,r0
+    vst1.32     d4[0],[r12],r1
+    vst1.32     d5[0],[r12],r1
+    vst1.32     d3[0],[r12]
+    cmp         r4,#1
+    bne         l1.2404
+strong_filtering_p:
+    vmax.u8     d5,d18,d17
+    mov         r12,r0
+    vmul.i16    q4,q4,d0[0]
+    rsb         r11,r1,#0
+    vadd.i16    q8,q4,q7
+    add         r12,r12,r11
+    vrshrn.i16  d19,q8,#3
+    vst1.32     d2[0],[r12],r11
+    vmin.u8     d18,d19,d30
+    vst1.32     d5[0],[r12],r11
+    vmax.u8     d3,d18,d31
+    vst1.32     d3[0],[r12]
+
+l1.2404:
+    ldmfd       sp!, {r3-r12,pc}
+
+    @ r4=flag p
+    @r5= flag q
+    @r6 =tc
+    @ r9 =dep
+    @ r10=deq
+
+
+    @       d22          -4 value
+
+    @d23        @ -3 value
+
+    @   vdup.32 d24,r11         @ -2 value
+
+    @   vdup.32 d25, r11        @-1 value
+
+    @   vdup.32 d26,r11         @ 0 value
+
+    @   vdup.32 d27,r11         @ 1value
+
+    @   vdup.32 d28,r11         @ 2 value
+
+    @   vdup.32 d29,r11         @ 3 value
+
+l1.2408:
+
+    vmov.i16    d0,#0x9
+
+    vsubl.u8    q5,d26,d25
+
+    vmul.i16    q5,q5,d0[0]
+
+    vmov.i16    d0,#0x3
+
+    vsubl.u8    q6,d27,d24
+    vmul.i16    q6,q6,d0[0]
+
+
+    vdup.8      d30,r6                      @ duplicating the +tc value
+
+    rsb         r12,r6,#0
+    vdup.8      d31,r12                     @ duplicating the -tc value
+
+
+
+    vsub.i16    q5,q5,q6
+
+
+
+    vrshr.s16   q5,q5,#4
+    @   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
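+
+    @ in c terms the weak filter continues as (sketch):
+    @     if(abs(delta) < 10 * tc)
+    @     {
+    @         delta = clip3(delta, -tc, tc);
+    @         p0 = clip_u8(p0 + delta);
+    @         q0 = clip_u8(q0 - delta);
+    @     }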
+
+    vabs.s16    q4,q5
+    vmovn.i16   d9,q4
+    @ storing the absolute values of delta in d9
+
+    vqmovn.s16  d10,q5
+    @ storing the clipped values of delta in d10
+
+
+    vmin.s8     d11,d10,d30
+    vmax.s8     d8,d31,d11                  @ d8 has the value  delta = clip3(delta, -tc, tc)@
+
+
+    vmovl.u8    q3,d25
+
+    vaddw.s8    q2,q3,d8
+
+    vqmovun.s16 d12,q2
+    vmovl.u8    q3,d26
+    vsubw.s8    q2,q3,d8
+    vqmovun.s16 d13,q2
+
+
+    mov         r11,#0xa
+    mul         r12,r11,r6
+    vdup.8      d2,r12                      @ d2 has the 10*tc value
+    vmov        d18,d24
+    vdup.8      d0,r6
+    vshr.s8     d0,#1
+    vneg.s8     d1,d0
+
+    cmp         r4,#1
+    bne         l1.2724
+    cmp         r9,#1
+    bne         l1.2700
+
+    @ d12 and d13 have the value temp_p0 and temp_q0
+    vaddl.u8    q7,d23,d25
+    vrshrn.u16  d14,q7,#1
+    vsubl.u8    q7,d14,d24
+    vaddw.s8    q7,q7,d8
+    vqshrn.s16  d14,q7,#1
+    vmin.s8     d15,d14,d0
+    vmax.s8     d14,d1,d15
+
+    @ d14 has the delta p value
+    vmovl.u8    q8,d24
+    vaddw.s8    q8,q8,d14
+    vqmovun.s16 d14,q8
+
+    @  d14 =tmp_p1 = clip_u8(pu1_src[-2 * src_strd] + delta_p)@
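+    @ in c terms (sketch):
+    @     delta_p = clip3(-(tc >> 1), tc >> 1,
+    @                     (((p2 + p0 + 1) >> 1) - p1 + delta) >> 1);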
+    vcge.u8     d18,d9,d2
+    vbsl        d18,d24,d14
+
+l1.2700:
+    mov         r12,r0
+    rsb         r11,r1,#0
+    add         r12,r11
+    vcge.u8     d19,d9,d2
+    vbsl        d19,d25,d12
+    vst1.32     {d19[0]},[r12],r11
+    vst1.32     {d18[0]},[r12]
+l1.2724:
+    cmp         r5,#1
+    bne         l1.2404
+    cmp         r10,#1
+    vmov        d18, d27
+    bne         l1.2852
+
+    vaddl.u8    q7,d26,d28
+    vrshrn.u16  d14,q7,#1
+    vsubl.u8    q7,d14,d27
+    vsubw.s8    q7,q7,d8
+    vqshrn.s16  d14,q7,#1
+    vmin.s8     d15,d14,d0
+    vmax.s8     d14,d1,d15
+@ d14 has the delta q value
+    vmovl.u8    q8,d27
+    vaddw.s8    q8,q8,d14
+    vqmovun.s16 d14,q8
+    vcge.u8     d18,d9,d2
+    vbsl        d18,d27,d14
+l1.2852:
+    mov         r12,r0
+    vcge.u8     d19,d9,d2
+    vbsl        d19,d26,d13
+    vst1.32     {d19[0]},[r12],r1
+    vst1.32     {d18[0]},[r12]
+    ldmfd       sp!, {r3-r12,r15}
+
+
+
diff --git a/common/arm/ihevc_deblk_luma_vert.s b/common/arm/ihevc_deblk_luma_vert.s
new file mode 100644
index 0000000..ee247cc
--- /dev/null
+++ b/common/arm/ihevc_deblk_luma_vert.s
@@ -0,0 +1,593 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/*******************************************************************************
+@* @file
+@*  ihevc_deblk_luma_vert.s
+@*
+@* @brief
+@*  contains function definitions for deblocking of luma vertical
+@* edges. functions are coded in neon assembly and can be compiled
+@* using rvct
+@*
+@* @author
+@*  anand s
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************/
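+
+@ note on data layout: the edge here is vertical, so each vld1 in the main
+@ path reads 8 bytes across the edge (p3..q3 of one row); vtrn steps then
+@ transpose four such rows so the arithmetic can work on one pixel position
+@ per register, and the results are transposed back before the narrow stores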
+
+.text
+.align 4
+
+
+
+
+
+.extern gai4_ihevc_tc_table
+.extern gai4_ihevc_beta_table
+
+.globl ihevc_deblk_luma_vert_a9q
+
+gai4_ihevc_tc_table_addr:
+.long gai4_ihevc_tc_table   - ulbl1 - 8
+
+gai4_ihevc_beta_table_addr:
+.long gai4_ihevc_beta_table   - ulbl2 - 8
+
+.type ihevc_deblk_luma_vert_a9q, %function
+
+ihevc_deblk_luma_vert_a9q:
+
+    push        {r3-r12,lr}
+    ldr         r4,[sp,#0x2c]
+    ldr         r5,[sp,#0x30]
+
+    add         r3,r3,r4
+    add         r3,r3,#1
+    ldr         r6, [sp,#0x34]
+    asr         r3,r3,#1
+    add         r7,r3,r5,lsl #1
+    add         r3,r3,r6,lsl #1
+    cmp         r7,#0x33
+    movgt       r7,#0x33
+    bgt         l1.56
+    cmp         r7,#0x0
+    movlt       r7,#0x0                     @ r7 has the beta_index value
+l1.56:
+
+@     bic      r2,r2,#1
+    asr         r2,r2,#1
+
+    add         r3,r3,r2,lsl #1
+    cmp         r3,#0x35
+    movgt       r3,#0x35
+    bgt         l1.88
+    cmp         r3,#0x0
+    movlt       r3,#0x0                     @ r3 has the tc_index value
+
+@    qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
+@    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
+@    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
+
+l1.88:
+    ldr         r2,gai4_ihevc_beta_table_addr
+ulbl2:
+    add         r2,r2,pc
+    vmov.i8     d18,#0x2
+    ldr         r4,gai4_ihevc_tc_table_addr
+ulbl1:
+    add         r4,r4,pc
+
+    ldr         r5,[r2,r7,lsl #2]           @ beta
+    vmov.i16    q8,#0x2
+    ldr         r6,[r4,r3,lsl #2]           @ tc
+    lsl         r8,r6,#1
+    cmp         r6,#0
+    vdup.8      d19,r8
+    sub         r7,r0,#4
+    vmov.i8     d23,#0x3
+    beq         l1.964
+
+
+    vld1.8      {d24},[r7],r1
+    ldrb        r8,[r0,#-3]                 @ -3 value
+    vld1.8      {d1},[r7],r1
+    ldrb        r10,[r0,#-2]                @-2 value
+    vld1.8      {d2},[r7],r1
+    ldrb        r11,[r0,#-1]                @-1 value
+    vld1.8      {d0},[r7]
+    ldrb        r12,[r0,#0]                 @ 0 value
+    ldrb        r9,[r0,#1]                  @ 1 value
+    vtrn.8      d24,d1
+    ldrb        r2,[r0,#2]                  @ 2 value
+    vtrn.8      d2,d0
+    add         r12,r12,r2
+    subs        r9,r12,r9,lsl #1            @ dq0 value is stored in r9
+    rsbmi       r9,r9,#0
+@dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
+    vtrn.16     d24,d2
+    add         r8,r8,r11
+    vtrn.16     d1,d0
+    subs        r8,r8,r10,lsl #1
+    rsbmi       r8,r8,#0                    @ dp0 value is stored in r8
+@  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
+
+
+
+    add         r14,r1,r1,lsl #1
+    add         r14,r0,r14
+
+    vdup.32     d4,d24[1]
+    ldrb        r2,[r14,#-3]                @ -3 value
+    vdup.32     d7,d2[1]
+    ldrb        r10,[r14,#-2]               @ -2 value
+    vdup.32     d3,d2[0]
+    ldrb        r11,[r14,#-1]               @ -1 value
+    vdup.32     d5,d1[1]
+    ldrb        r12,[r14,#0]                @ 0 value
+    vdup.32     d6,d1[0]
+    ldrb        r3,[r14,#1]                 @ 1 value
+    vdup.32     d2,d0[0]
+    ldrb        r4,[r14,#2]                 @ 2 value
+
+
+    add         r12,r12,r4
+    subs        r12,r12,r3,lsl #1           @ dq3value is stored in r12
+    rsbmi       r12,r12,#0
+@    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
+
+
+    add         r2,r2,r11
+    subs        r11,r2,r10,lsl #1
+    rsbmi       r11,r11,#0                  @ dp3 value is stored in r11
+@    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )@
+
+
+
+    add         r3,r8,r9                    @ r3 has the d0 value
+    add         r4,r11,r12                  @ r4 has the d3 value
+
+
+@    d0 = dp0 + dq0@
+@    d3 = dp3 + dq3@
+
+    add         r14,r8,r11                  @ r14 has the value dp
+    add         r12,r12,r9                  @ r12 has the value  dq
+@    dp = dp0 + dp3@
+@   dq = dq0 + dq3@
+
+    add         r11, r3, r4                 @ r11 has the value d
+
+@   d = d0 + d3@
+
+
+    cmp         r11,r5
+    vdup.32     d22,d0[1]
+    bge         l1.964
+
+@    if(d < beta)
+
+
+    @ registers which cannot be altered : r3,r4,r5,r6,r12,r13,r0,r1,r11
+
+    @ registers for use: r2,r7,r8,r9,r10,
+    vqsub.u8    d30,d7,d19
+    asr         r10,r5,#2
+    vqadd.u8    d31,d7,d19
+    cmp         r10,r3,lsl #1
+    vaddl.u8    q0,d5,d4
+    ble         l1.336
+
+    ldrb        r2,[r0,#-4]
+    vaddw.u8    q0,q0,d2
+    ldrb        r7,[r0,#-1]
+    vmull.u8    q10,d7,d23
+    ldrb        r3,[r0,#0]
+    vmlal.u8    q10,d22,d18
+    ldrb        r8,[r0,#3]
+@   ubfx   r7,r2,#24,#8           @ has the -1 value
+@  and    r2,#0xff               @ has the -4 value
+@  ubfx   r8,r3,#24,#8           @ has the 3 value
+@  and    r3,#0xff               @ r4 has the 0 value
+
+    vadd.i16    q10,q10,q0
+    subs        r8,r8,r3
+    vrshrn.i16  d22,q10,#3
+    rsbmi       r8,r8,#0
+    subs        r2,r2,r7
+    vmin.u8     d21,d22,d31
+    rsbmi       r2,r2,#0
+    vmax.u8     d22,d21,d30
+    add         r8,r8,r2
+    vaddl.u8    q10,d7,d3
+    cmp         r8,r5,asr #3
+    vmla.i16    q10,q0,q8
+    bge         l1.336
+    vaddw.u8    q0,q0,d7
+    subs        r7,r3,r7
+    vrshrn.i16  d20,q10,#3
+    rsbmi       r7,r7,#0
+    vrshrn.i16  d0,q0,#2
+    mov         r10,#5
+    vqadd.u8    d30,d5,d19
+    mul         r10,r10,r6
+    vqsub.u8    d31,d5,d19
+    add         r10,#1
+    cmp         r7,r10,asr #1
+    bge         l1.336
+
+
+@        if( (2 * d3 < (beta >> 2)) && ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
+@            && ( abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) ) )
+
+
+    asr         r10,r5,#2
+    vqsub.u8    d25,d4,d19
+    cmp         r10,r4,lsl #1
+    vqadd.u8    d21,d4,d19
+    ble         l1.336
+    vmin.u8     d26,d20,d21
+    add         r4,r1,r1,lsl #1
+    add         r4,r4,r0
+    vmax.u8     d20,d26,d25
+    ldrb        r2,[r4,#-4]
+    vmin.u8     d19,d0,d30
+    ldrb        r7,[r4,#-1]
+    vmax.u8     d21,d19,d31
+    ldrb        r3,[r4,#0]
+    lsl         r10,r6,#1
+    ldrb        r8,[r4,#3]
+@   ubfx   r7,r2,#24,#8           @ has the -1 value
+@  and    r2,#0xff               @ has the -4 value
+@  ubfx   r8,r3,#24,#8           @ has the 3 value
+@  and    r3,#0xff               @ r4 has the 0 value
+    vaddl.u8    q0,d2,d3
+    vdup.8      d19,r10
+    subs        r8,r8,r3
+    vaddw.u8    q0,q0,d4
+    rsbmi       r8,r8,#0
+    vqadd.u8    d30,d2,d19
+    subs        r2,r2,r7
+    vqsub.u8    d31,d2,d19
+    rsbmi       r2,r2,#0
+    vaddl.u8    q13,d5,d6
+    add         r8,r8,r2
+    vmla.i16    q13,q0,q8
+    cmp         r8,r5,asr #3
+    bge         l1.336
+    vrshrn.i16  d26,q13,#3
+    subs        r7,r3,r7
+    vqadd.u8    d27,d3,d19
+    rsbmi       r7,r7,#0
+    vqsub.u8    d28,d3,d19
+    mov         r10,#5
+    vmin.u8     d16,d26,d30
+    mul         r10,r10,r6
+    add         r10,#1
+    cmp         r7,r10,asr #1
+    vmax.u8     d26,d16,d31
+    bge         l1.336
+    vqadd.u8    d30,d6,d19
+
+    mov         r2,#2
+    ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
+    vqsub.u8    d31,d6,d19
+    ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
+    b           end_dep_deq_decision
+@ r2 has the value of de
+@ r6 has the value of tc
+@ r5 has the value of beta
+@ r14 has the value of dp
+@ r12 has the value of dq
+@ r0 has the value of source address
+@ r1 has the src stride
+
+l1.336:
+    mov         r2,#1
+l1.424:
+    mov         r11,r5
+    ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
+    ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
+
+    cmp         r6,#1
+    moveq       r9,#0
+    moveq       r10,#0
+    beq         end_dep_deq_decision
+
+    and         r7,r4,r5
+
+    cmp         r7,#1
+    beq         both_flags_set
+    cmp         r4,#0
+    beq         set_flag_dep_zero
+
+
+    add         r8,r11,r11,asr #1
+    mov         r10,#0
+    asr         r8,#3
+    cmp         r8,r14
+    movgt       r9,#1
+    movle       r9,#0
+    b           end_dep_deq_decision
+set_flag_dep_zero:
+
+    add         r8,r11,r11,asr #1
+    mov         r9,#0
+    asr         r8,#3
+    cmp         r8,r12
+    movgt       r10,#1
+    movle       r10,#0
+    b           end_dep_deq_decision
+
+both_flags_set:
+    add         r8,r11,r11,asr #1
+    asr         r8,#3
+    cmp         r8,r14
+    movgt       r9,#1
+    movle       r9,#0
+    cmp         r8,r12
+    movgt       r10,#1
+    movle       r10,#0
+end_dep_deq_decision:
+
+@r0=source address
+@r1=stride
+@ r2 =de
+@ r4=flag p
+@r5= flag q
+@r6 =tc
+@ r9 =dep
+@ r10=deq
+@   b   l1.964
+
+
+    cmp         r2,#2
+@ r4 has the value of de
+    bne         l1.968
+
+    cmp         r5,#0
+    beq         l1.780
+@ r5 has the flag of q
+
+    add         r3,r0,#2
+    vst1.8      {d22[0]},[r3],r1
+
+    vst1.8      {d22[1]},[r3],r1
+
+    vst1.8      {d22[2]},[r3],r1
+
+    vst1.8      {d22[3]},[r3]
+    add         r3,r0,r1
+    vtrn.8      d20,d21
+
+    vst1.16     {d20[0]},[r0]
+    vst1.16     {d21[0]},[r3],r1
+    vst1.16     {d20[1]},[r3],r1
+    vst1.16     {d21[1]},[r3]
+
+
+l1.780:
+    cmp         r4,#0
+    beq         l1.964
+    @ r5 has the flag p
+
+
+    vdup.32     d7,d24[0]
+    sub         r3,r0,#1
+    vaddw.u8    q8,q0,d6
+    add         r7,r3,r1
+    vrshrn.i16  d2,q8,#2
+    vst1.8      {d26[0]},[r3]
+    sub         r0,r0,#3
+    vmin.u8     d16,d2,d27
+    vst1.8      {d26[1]},[r7],r1
+    vmull.u8    q1,d6,d23
+    vmlal.u8    q1,d7,d18
+    vst1.8      {d26[2]},[r7],r1
+    vmax.u8     d5,d16,d28
+    vst1.8      {d26[3]},[r7]
+    vadd.i16    q0,q1,q0
+    vrshrn.i16  d0,q0,#3
+
+
+    vmin.u8     d1,d0,d30
+    vmax.u8     d0,d1,d31
+
+    vtrn.8      d0,d5
+    vst1.16     {d0[0]},[r0],r1
+    vst1.16     {d5[0]},[r0],r1
+    vst1.16     {d0[1]},[r0],r1
+    vst1.16     {d5[1]},[r0]
+l1.964:
+    pop         {r3-r12,pc}
+l1.968:
+
+
+    vmov.i16    q0,#0x9
+    rsb         r11,r6,#0
+    cmp         r4,#0
+    @ checks for the flag p
+    vmov.i16    q8,#0x3
+    vmov.i8     d24,#0x1
+
+
+    vdup.8      d30,r11
+    and         r11,r6,#0xff
+    vdup.8      d31,r11
+
+    vsubl.u8    q9,d4,d2
+    vmul.i16    q9,q9,q0
+    vsubl.u8    q0,d5,d3
+
+
+
+    vmul.i16    q8,q0,q8
+    vsub.i16    q8,q9,q8
+    vrshr.s16   q8,q8,#4
+@   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
+
+    vabs.s16    q0,q8
+    vmovn.i16   d0,q0
+    @ storing the absolute values of delta in d0
+
+    vqmovn.s16  d16,q8
+    @ storing the clipped values of delta in d16
+
+    vmov.i8     d1,#0xa
+    vdup.8      d21,r11
+    vmul.i8     d1,d1,d21
+    @ d1 stores the value (10 * tc)
+
+@if(abs(delta) < 10 * tc)
+
+    vmin.s8     d18,d16,d31
+    vmax.s8     d20,d18,d30
+
+@ delta = clip3(delta, -tc, tc)@
+    vmovl.s8    q8,d20
+    vmovl.u8    q9,d2
+    vadd.i16    q9,q9,q8
+
+    vqmovun.s16 d22,q9
+    vmovl.u8    q9,d4
+    vsub.i16    q8,q9,q8
+    vqmovun.s16 d23,q8
+@ tmp_p0 = clip_u8(pu1_src[-1] + delta)@
+@  tmp_q0 = clip_u8(pu1_src[0] - delta)@
+    beq         l1.1272
+
+
+
+    cmp         r9,#1
+    bne         l1.1212
+@ checks for the flag dep
+
+    asr         r3,r6,#1
+
+
+    vaddl.u8    q8,d6,d2
+    vaddw.u8    q8,q8,d24
+    vdup.8      d18,r3
+    rsb         r3,r3,#0
+    vdup.8      d19,r3
+    vshr.u16    q8,q8,#1
+    vmovn.i16   d16,q8
+
+    vsubl.u8    q8,d16,d3
+    vaddw.s8    q8,q8,d20
+    vshr.s16    q8,q8,#1
+    vqmovn.s16  d16,q8
+
+    vmin.s8     d17,d16,d18
+    vmax.s8     d16,d19,d17
+
+
+
+
+    vmovl.u8    q9,d3
+    vmovl.s8    q8,d16
+    vadd.i16    q8,q9,q8
+
+    vqmovun.s16 d16,q8
+    vmov        d30,d3
+    vcge.u8     d3,d0,d1
+
+
+    vbsl        d3,d30,d16
+l1.1212:
+    vdup.8      d16,r11
+    sub         r12,r0,#3
+    sub         r3,r0,#1
+@     vmul.i8  d16,d16,d1
+    vtrn.8      d6,d3
+    vst1.16     {d6[0]},[r12],r1
+    vcge.u8     d16,d0,d1
+    vst1.16     {d3[0]},[r12],r1
+    vbsl        d16,d2,d22
+    vst1.8      {d16[0]},[r3],r1
+    vst1.8      {d16[1]},[r3],r1
+    vst1.16     {d6[1]},[r12],r1
+    vst1.8      {d16[2]},[r3],r1
+    vst1.16     {d3[1]},[r12]
+    vst1.8      {d16[3]},[r3]
+l1.1272:
+    @   ldr      r3,[sp,#0x38]
+    cmp         r5,#0
+    beq         l1.964
+    @ checks for the flag q
+    cmp         r10,#1
+    bne         l1.1412
+    @ checks for the flag deq
+    vmov        d2,d7
+    asr         r3,r6,#1
+
+    vdup.8      d6,r3
+    rsb         r3,r3,#0
+    vdup.8      d16,r3
+    vaddl.u8    q1,d2,d4
+    vaddw.u8    q1,q1,d24
+    vshr.u16    q1,q1,#1
+    vmovn.i16   d2,q1
+
+    vsubl.u8    q1,d2,d5
+    vsubw.s8    q1,q1,d20
+    vshr.s16    q1,q1,#1
+    vqmovn.s16  d3,q1
+
+    vmin.s8     d2,d3,d6
+    vmax.s8     d3,d16,d2
+    @  vdup.8   d6,r2
+    @   vmul.i8  d6,d6,d1
+
+
+
+    vmovl.u8    q8,d5
+    vmovl.s8    q1,d3
+    vadd.i16    q1,q8,q1
+    vqmovun.s16 d3,q1
+    vmov        d30,d5
+    vcge.u8     d5,d0,d1
+
+
+    vbsl        d5,d30,d3
+l1.1412:
+    @  vdup.8   d2,r2
+    add         r3,r0,#2
+    add         r11,r3,r1
+    @   vmul.i8  d1,d2,d1
+    vst1.8      {d7[0]},[r3]
+    vst1.8      {d7[1]},[r11],r1
+    vst1.8      {d7[2]},[r11],r1
+    vcge.u8     d0,d0,d1
+    vst1.8      {d7[3]},[r11]
+    vbsl        d0,d4,d23
+    vtrn.8      d0,d5
+    vst1.16     {d0[0]},[r0],r1
+    vst1.16     {d5[0]},[r0],r1
+    vst1.16     {d0[1]},[r0],r1
+    vst1.16     {d5[1]},[r0]
+    pop         {r3-r12,pc}
+
+
+
diff --git a/common/arm/ihevc_func_selector.h b/common/arm/ihevc_func_selector.h
new file mode 100644
index 0000000..8188178
--- /dev/null
+++ b/common/arm/ihevc_func_selector.h
@@ -0,0 +1,227 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_func_selector.h
+*
+* @brief
+*  For each function, decide whether to use the C function, Neon intrinsics,
+* Cortex A8 intrinsics, Neon assembly or Cortex A8 assembly
+*
+* @author
+*  Harish
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef __IHEVC_FUNC_SELECTOR_H__
+#define __IHEVC_FUNC_SELECTOR_H__
+
+#include "ihevc_func_types.h"
+
+#define    INTER_PRED_LUMA_COPY                     C
+#define    INTER_PRED_LUMA_HORZ                     C
+#define    INTER_PRED_LUMA_VERT                     C
+#define    INTER_PRED_LUMA_COPY_W16OUT              C
+#define    INTER_PRED_LUMA_HORZ_W16OUT              C
+
+#define    INTER_PRED_LUMA_VERT_W16OUT              C
+#define    INTER_PRED_LUMA_VERT_W16INP              C
+#define    INTER_PRED_LUMA_VERT_W16INP_W16OUT       C
+
+#define    INTER_PRED_CHROMA_COPY                   C
+#define    INTER_PRED_CHROMA_HORZ                   C
+#define    INTER_PRED_CHROMA_VERT                   C
+#define    INTER_PRED_CHROMA_COPY_W16OUT            C
+#define    INTER_PRED_CHROMA_HORZ_W16OUT            C
+#define    INTER_PRED_CHROMA_VERT_W16OUT            C
+#define    INTER_PRED_CHROMA_VERT_W16INP            C
+#define    INTER_PRED_CHROMA_VERT_W16INP_W16OUT     C
+
+#define    WEIGHTED_PRED_UNI                        C
+#define    WEIGHTED_PRED_BI                         C
+#define    WEIGHTED_PRED_BI_DEFAULT                 C
+#define    WEIGHTED_PRED_CHROMA_UNI                 C
+#define    WEIGHTED_PRED_CHROMA_BI                  C
+#define    WEIGHTED_PRED_CHROMA_BI_DEFAULT          C
+
+#define    PAD_VERT                                 C
+#define    PAD_HORZ                                 C
+#define    PAD_LEFT_LUMA                            C
+#define    PAD_LEFT_CHROMA                          C
+#define    PAD_RIGHT_LUMA                           C
+#define    PAD_RIGHT_CHROMA                         C
+
+#define     DEBLOCKING_ASM                          C
+#define     DEBLK_LUMA_HORZ                         C
+#define     DEBLK_LUMA_VERT                         C
+#define     DEBLK_CHROMA_HORZ                       C
+#define     DEBLK_CHROMA_VERT                       C
+
+#define     SAO_BAND_OFFSET_LUMA                    C
+#define     SAO_BAND_OFFSET_CHROMA                  C
+#define     SAO_EDGE_OFFSET_CLASS0_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS1_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS2_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS3_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS0_CHROMA           C
+#define     SAO_EDGE_OFFSET_CLASS1_CHROMA           C
+#define     SAO_EDGE_OFFSET_CLASS2_CHROMA           C
+#define     SAO_EDGE_OFFSET_CLASS3_CHROMA           C
+
+#define     INTRA_PRED_LUMA_REF_SUBSTITUTION        C
+#define     INTRA_PRED_REF_FILTERING                C
+#define     INTRA_PRED_LUMA_PLANAR                  C
+#define     INTRA_PRED_LUMA_DC                      C
+#define     INTRA_PRED_LUMA_HORZ                    C
+#define     INTRA_PRED_LUMA_VER                     C
+#define     INTRA_PRED_LUMA_MODE_2                  C
+#define     INTRA_PRED_LUMA_MODE_18_34              C
+#define     INTRA_PRED_LUMA_MODE_3_T0_9             C
+#define     INTRA_PRED_LUMA_MODE_11_T0_17           C
+#define     INTRA_PRED_LUMA_MODE_19_T0_25           C
+#define     INTRA_PRED_LUMA_MODE_27_T0_33           C
+
+#define     INTRA_PRED_CHROMA_PLANAR                C
+#define     INTRA_PRED_CHROMA_DC                    C
+#define     INTRA_PRED_CHROMA_HOR                   C
+#define     INTRA_PRED_CHROMA_VER                   C
+#define     INTRA_PRED_CHROMA_MODE_2                C
+#define     INTRA_PRED_CHROMA_18_34                 C
+#define     INTRA_PRED_CHROMA_3_T0_9                C
+#define     INTRA_PRED_CHROMA_11_T0_17              C
+#define     INTRA_PRED_CHROMA_19_T0_25              C
+#define     INTRA_PRED_CHROMA_27_T0_33              C
+#define     INTRA_PRED_CHROMA_REF_SUBSTITUTION      C
+
+/* Forward transform functions */
+/* Luma */
+#define RESI_TRANS_QUANT_4X4_TTYPE1                 C
+#define RESI_TRANS_QUANT_4X4                        C
+#define RESI_TRANS_QUANT_8X8                        C
+#define RESI_TRANS_QUANT_16X16                      C
+#define RESI_TRANS_QUANT_32X32                      C
+
+#define RESI_QUANT_4X4_TTYPE1                       C
+#define RESI_QUANT_4X4                              C
+#define RESI_QUANT_8X8                              C
+#define RESI_QUANT_16X16                            C
+#define RESI_QUANT_32X32                            C
+
+#define RESI_TRANS_4X4_TTYPE1                       C
+#define RESI_TRANS_4X4                              C
+#define RESI_TRANS_8X8                              C
+#define RESI_TRANS_16X16                            C
+#define RESI_TRANS_32X32                            C
+
+#define RESI_4X4_TTYPE1                             C
+#define RESI_4X4                                    C
+#define RESI_8X8                                    C
+#define RESI_16X16                                  C
+#define RESI_32X32                                  C
+
+#define TRANS_4X4_TTYPE1                            C
+#define TRANS_4X4                                   C
+#define TRANS_8X8                                   C
+#define TRANS_16X16                                 C
+#define TRANS_32X32                                 C
+
+#define QUANT_4X4_TTYPE1                            C
+#define QUANT_4X4                                   C
+#define QUANT_8X8                                   C
+#define QUANT_16X16                                 C
+#define QUANT_32X32                                 C
+
+/* Chroma interleaved*/
+#define CHROMA_RESI_TRANS_QUANT_4X4                        C
+#define CHROMA_RESI_TRANS_QUANT_8X8                        C
+#define CHROMA_RESI_TRANS_QUANT_16X16                      C
+
+#define CHROMA_RESI_QUANT_4X4                              C
+#define CHROMA_RESI_QUANT_8X8                              C
+#define CHROMA_RESI_QUANT_16X16                            C
+
+#define CHROMA_RESI_TRANS_4X4                              C
+#define CHROMA_RESI_TRANS_8X8                              C
+#define CHROMA_RESI_TRANS_16X16                            C
+
+#define CHROMA_RESI_4X4                                    C
+#define CHROMA_RESI_8X8                                    C
+#define CHROMA_RESI_16X16                                  C
+
+/* Inverse transform functions */
+/* Luma */
+#define IQUANT_ITRANS_RECON_4X4_TTYPE1              C
+#define IQUANT_ITRANS_RECON_4X4                     C
+#define IQUANT_ITRANS_RECON_8X8                     C
+#define IQUANT_ITRANS_RECON_16X16                   C
+#define IQUANT_ITRANS_RECON_32X32                   C
+
+#define IQUANT_RECON_4X4_TTYPE1                     C
+#define IQUANT_RECON_4X4                            C
+#define IQUANT_RECON_8X8                            C
+#define IQUANT_RECON_16X16                          C
+#define IQUANT_RECON_32X32                          C
+
+#define ITRANS_RECON_4X4_TTYPE1                     C
+#define ITRANS_RECON_4X4                            C
+#define ITRANS_RECON_8X8                            C
+#define ITRANS_RECON_16X16                          C
+#define ITRANS_RECON_32X32                          C
+
+#define RECON_4X4_TTYPE1                            C
+#define RECON_4X4                                   C
+#define RECON_8X8                                   C
+#define RECON_16X16                                 C
+#define RECON_32X32                                 C
+
+#define ITRANS_4X4_TTYPE1                           C
+#define ITRANS_4X4                                  C
+#define ITRANS_8X8                                  C
+#define ITRANS_16X16                                C
+#define ITRANS_32X32                                C
+
+/* Chroma interleaved */
+#define CHROMA_IQUANT_ITRANS_RECON_4X4                     C
+#define CHROMA_IQUANT_ITRANS_RECON_8X8                     C
+#define CHROMA_IQUANT_ITRANS_RECON_16X16                   C
+
+#define CHROMA_IQUANT_RECON_4X4                            C
+#define CHROMA_IQUANT_RECON_8X8                            C
+#define CHROMA_IQUANT_RECON_16X16                          C
+
+#define CHROMA_ITRANS_RECON_4X4                            C
+#define CHROMA_ITRANS_RECON_8X8                            C
+#define CHROMA_ITRANS_RECON_16X16                          C
+
+#define CHROMA_RECON_4X4                                   C
+#define CHROMA_RECON_8X8                                   C
+#define CHROMA_RECON_16X16                                 C
+
+#define IHEVC_MEMCPY                                C
+#define IHEVC_MEMSET                                C
+#define IHEVC_MEMSET_16BIT                          C
+#define IHEVC_MEMCPY_MUL_8                          C
+#define IHEVC_MEMSET_MUL_8                          C
+#define IHEVC_MEMSET_16BIT_MUL_8                    C
+
+#endif  /* __IHEVC_FUNC_SELECTOR_H__ */
diff --git a/common/arm/ihevc_inter_pred_chroma_copy.s b/common/arm/ihevc_inter_pred_chroma_copy.s
new file mode 100644
index 0000000..0da34cc
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_copy.s
@@ -0,0 +1,270 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_inter_pred_chroma_copy.s
+@*
+@* @brief
+@*  contains function definitions for the inter prediction chroma copy.
+@* functions are coded in neon assembly and can be compiled using
+@* rvct
+@*
+@* @author
+@*  yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*   chroma inter prediction filter for copy
+@*
+@* @par description:
+@*    copies the array of width 'wd' and height 'ht' from the location
+@*    pointed to by 'pu1_src' to the location pointed to by 'pu1_dst'
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_chroma_copy( uword8 *pu1_src,
+@                                   uword8 *pu1_dst,
+@                                   word32 src_strd,
+@                                   word32 dst_strd,
+@                                   word8 *pi1_coeff,
+@                                   word32 ht,
+@                                   word32 wd)
+@**************variables vs registers*****************************************
+@               r0 => *pu1_src
+@               r1 => *pu1_dst
+@               r2 =>  src_strd
+@               r3 =>  dst_strd
+@               r4 => *pi1_coeff
+@               r5 =>  ht
+@               r6 =>  wd
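+
+@ in c terms the whole function reduces to (a sketch; wd counts chroma
+@ samples, so each row copies 2 * wd bytes of interleaved cb/cr):
+@     for(row = 0; row < ht; row++)
+@     {
+@         memcpy(pu1_dst, pu1_src, 2 * wd);
+@         pu1_src += src_strd;
+@         pu1_dst += dst_strd;
+@     }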
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_copy_a9q
+
+.type ihevc_inter_pred_chroma_copy_a9q, %function
+
+ihevc_inter_pred_chroma_copy_a9q:
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    ldr         r12,[sp,#48]                @loads wd
+    lsl         r12,r12,#1
+    ldr         r7,[sp,#44]                 @loads ht
+    cmp         r7,#0                       @checks ht == 0
+    ble         end_loops
+    and         r8,r7,#3                    @r8 = ht % 4, remainder rows
+    sub         r7,r7,r8                    @r7 = ht rounded down to mul of 4
+    tst         r12,#15                     @checks wd for multiple of 16
+    beq         core_loop_wd_16
+    tst         r12,#7                      @checks wd for multiple of 8
+    beq         core_loop_wd_8
+
+    sub         r11,r12,#4
+    cmp         r7,#0
+    beq         outer_loop_wd_4_ht_2
+
+outer_loop_wd_4:
+    subs        r4,r12,#0                   @checks wd == 0
+    ble         end_inner_loop_wd_4
+
+inner_loop_wd_4:
+    vld1.32     {d0[0]},[r0]                @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add         r5,r0,r2                    @pu1_src_tmp += src_strd
+    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
+    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add         r0,r0,#4                    @pu1_src += 4
+    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    subs        r4,r4,#4                    @(wd -4)
+    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add         r1,r1,#4                    @pu1_dst += 4
+    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    bgt         inner_loop_wd_4
+
+end_inner_loop_wd_4:
+    subs        r7,r7,#4                    @ht - 4
+    sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
+    sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
+    bgt         outer_loop_wd_4
+    cmp         r8,#0
+    bgt         outer_loop_wd_4_ht_2
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+outer_loop_wd_4_ht_2:
+    subs        r4,r12,#0                   @checks wd == 0
+    ble         end_loops
+
+inner_loop_wd_4_ht_2:
+    vld1.32     {d0[0]},[r0]                @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add         r5,r0,r2                    @pu1_src_tmp += src_strd
+    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
+    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add         r0,r0,#4                    @pu1_src += 4
+    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    subs        r4,r4,#4                    @(wd -4)
+    add         r1,r1,#4                    @pu1_dst += 4
+    bgt         inner_loop_wd_4_ht_2
+    b           end_loops
+
+core_loop_wd_8:
+    sub         r11,r12,#8
+    cmp         r7,#0
+    beq         outer_loop_wd_8_ht_2
+
+outer_loop_wd_8:
+    subs        r4,r12,#0                   @checks wd
+    ble         end_inner_loop_wd_8
+
+inner_loop_wd_8:
+    add         r5,r0,r2                    @pu1_src_tmp += src_strd
+    vld1.8      {d0},[r0]!                  @vld1_u8(pu1_src_tmp)
+    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
+    vst1.8      {d0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8      {d1},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {d1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    subs        r4,r4,#8                    @wd - 8(loop condition)
+    vld1.8      {d2},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {d2},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8      {d3},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {d3},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    bgt         inner_loop_wd_8
+
+end_inner_loop_wd_8:
+    subs        r7,r7,#4                    @ht -= 4
+    sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
+    sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
+    bgt         outer_loop_wd_8
+    cmp         r8,#0
+    bgt         outer_loop_wd_8_ht_2
+    b           end_loops
+
+outer_loop_wd_8_ht_2:
+    subs        r4,r12,#0                   @checks wd
+    ble         end_loops
+
+inner_loop_wd_8_ht_2:
+    add         r5,r0,r2                    @pu1_src_tmp += src_strd
+    vld1.8      {d0},[r0]!                  @vld1_u8(pu1_src_tmp)
+    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
+    vst1.8      {d0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8      {d1},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {d1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    @subs     r4,r4,#8                      @wd - 8(loop condition)
+    @bgt      inner_loop_wd_8_ht_2
+    b           end_loops
+
+core_loop_wd_16:
+    sub         r11,r12,#16
+    cmp         r7,#0
+    beq         outer_loop_wd_16_ht_2
+
+outer_loop_wd_16:
+    subs        r4,r12,#0                   @checks wd
+    ble         end_inner_loop_wd_16
+
+inner_loop_wd_16:
+    add         r5,r0,r2                    @pu1_src_tmp += src_strd
+    vld1.8      {q0},[r0]!                  @vld1_u8(pu1_src_tmp)
+    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
+    vst1.8      {q0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8      {q1},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {q1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    subs        r4,r4,#16                   @wd - 16(loop condition)
+    vld1.8      {q2},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {q2},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8      {q3},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {q3},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    bgt         inner_loop_wd_16
+
+end_inner_loop_wd_16:
+    subs        r7,r7,#4                    @ht -= 4
+    sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
+    sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
+    bgt         outer_loop_wd_16
+    cmp         r8,#0
+    bgt         outer_loop_wd_16_ht_2
+    b           end_loops
+
+outer_loop_wd_16_ht_2:
+    subs        r4,r12,#0                   @checks wd
+    ble         end_loops
+
+inner_loop_wd_16_ht_2:
+    add         r5,r0,r2                    @pu1_src_tmp += src_strd
+    vld1.8      {q0},[r0]!                  @vld1_u8(pu1_src_tmp)
+    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
+    vst1.8      {q0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8      {q1},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {q1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    @subs     r4,r4,#16                     @wd - 16(loop condition)
+    @bgt      inner_loop_wd_16_ht_2
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm/ihevc_inter_pred_chroma_copy_w16out.s
new file mode 100644
index 0000000..a927fa7
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_copy_w16out.s
@@ -0,0 +1,325 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_inter_pred_chroma_copy_w16out.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction interpolation.
+@*  functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*   chroma inter prediction filter for copy with 16-bit output
+@*
+@* @par description:
+@*    copies the array of width 'wd' and height 'ht' from the location pointed
+@*    to by 'pu1_src' to the location pointed to by 'pi2_dst', upshifting each
+@*    pixel by 6 to produce the 16-bit intermediate output
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@*  word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
+@                                           word16 *pi2_dst,
+@                                           word32 src_strd,
+@                                           word32 dst_strd,
+@                                           word8 *pi1_coeff,
+@                                           word32 ht,
+@                                           word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pu1_src
+@r1 => *pi2_dst
+@r2 =>  src_strd
+@r3 =>  dst_strd
+@r4 => *pi1_coeff
+@r5 =>  ht
+@r6 =>  wd
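+
+@ a minimal scalar sketch of what this routine computes (illustrative
+@ only, not the library reference code; 'row', 'col' are local names):
+@ each 8-bit source pixel is widened and upshifted by 6 into the 16-bit
+@ destination, matching the vmovl + vshl #6 pairs below:
+@
+@   for(row = 0; row < ht; row++)
+@       for(col = 0; col < 2 * wd; col++)        /* interleaved cb/cr */
+@           pi2_dst[row * dst_strd + col] =
+@               (word16)(pu1_src[row * src_strd + col] << 6);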
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_copy_w16out_a9q
+
+.type ihevc_inter_pred_chroma_copy_w16out_a9q, %function
+
+ihevc_inter_pred_chroma_copy_w16out_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    ldr         r12,[sp,#48]                @loads wd
+    lsl         r12,r12,#1                  @2*wd
+    ldr         r7,[sp,#44]                 @loads ht
+    cmp         r7,#0                       @checks if ht == 0
+    ble         end_loops                   @exit if ht <= 0
+    and         r8,r7,#3                    @r8 = ht & 3 (residual rows beyond a multiple of 4)
+    sub         r9,r7,r8                    @r9 = ht rounded down to a multiple of 4
+    and         r11,r7,#6                   @r11 = ht & 6
+    cmp         r11,#6                      @heights of the form 8k+6 take the 4-wide path
+    beq         loop_ht_6
+    tst         r12,#7                      @checks if 2*wd is a multiple of 8
+    beq         core_loop_wd_8
+
+loop_ht_6:
+    sub         r11,r12,#4
+    lsls        r6,r3,#1                    @r6 = dst_strd in bytes (16-bit output)
+    cmp         r9,#0
+    beq         outer_loop_wd_4_ht_2
+
+outer_loop_wd_4:
+    subs        r4,r12,#0                   @wd conditional subtract
+    ble         end_inner_loop_wd_4
+
+inner_loop_wd_4:
+    vld1.8      {d0},[r0]                   @vld1_u8(pu1_src_tmp)
+    add         r5,r0,r2                    @pu1_src +src_strd
+    vmovl.u8    q0,d0                       @vmovl_u8(vld1_u8(pu1_src_tmp))
+    add         r10,r1,r6
+    subs        r4,r4,#4                    @wd - 4
+    vshl.i64    q0,q0,#6                    @vshlq_n_s64(temp, 6)
+    vld1.8      {d22},[r5],r2               @vld1_u8(pu1_src_tmp)
+    add         r0,r0,#4                    @pu1_src += 4
+    vst1.64     {d0},[r1]                   @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    add         r1,r1,#8
+    vmovl.u8    q11,d22                     @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vld1.8      {d24},[r5],r2               @vld1_u8(pu1_src_tmp)
+    vshl.i64    q11,q11,#6                  @vshlq_n_s64(temp, 6)
+    vmovl.u8    q12,d24                     @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vst1.64     {d22},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    vshl.i64    q12,q12,#6                  @vshlq_n_s64(temp, 6)
+    vld1.8      {d26},[r5],r2               @vld1_u8(pu1_src_tmp)
+    vst1.64     {d24},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    vmovl.u8    q13,d26                     @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vshl.i64    q13,q13,#6                  @vshlq_n_s64(temp, 6)
+    vst1.64     {d26},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    bgt         inner_loop_wd_4
+
+end_inner_loop_wd_4:
+    subs        r9,r9,#4                    @ht - 4
+    sub         r0,r5,r11
+    sub         r1,r10,r11,lsl #1
+    bgt         outer_loop_wd_4
+    cmp         r8,#0
+    bgt         outer_loop_wd_4_ht_2
+
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+outer_loop_wd_4_ht_2:
+    subs        r4,r12,#0                   @wd conditional subtract
+    ble         end_inner_loop_wd_4
+
+inner_loop_wd_4_ht_2:
+    vld1.8      {d0},[r0]                   @vld1_u8(pu1_src_tmp)
+    add         r5,r0,r2                    @pu1_src +src_strd
+    vmovl.u8    q0,d0                       @vmovl_u8(vld1_u8(pu1_src_tmp))
+    add         r10,r1,r6
+    subs        r4,r4,#4                    @wd - 4
+    vshl.i64    q0,q0,#6                    @vshlq_n_s64(temp, 6)
+    vld1.8      {d22},[r5],r2               @vld1_u8(pu1_src_tmp)
+    add         r0,r0,#4                    @pu1_src += 4
+    vst1.64     {d0},[r1]                   @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    add         r1,r1,#8
+    vmovl.u8    q11,d22                     @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vld1.8      {d24},[r5],r2               @vld1_u8(pu1_src_tmp)
+    vshl.i64    q11,q11,#6                  @vshlq_n_s64(temp, 6)
+    vmovl.u8    q12,d24                     @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vst1.64     {d22},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    bgt         inner_loop_wd_4_ht_2
+    b           end_loops
+
+
+core_loop_wd_8:
+    @sub            r11,r12,#8
+    lsls        r5,r3,#1                    @r5 = dst_strd in bytes (16-bit output)
+    rsb         r11,r12,r3, lsl #2          @r11 = (dst_strd * 4) - 2*wd
+    rsb         r8,r12,r2,lsl #2            @r8 = (src_strd * 4) - 2*wd
+    mov         r4,r12, lsr #3              @r4 = 2*wd / 8 (8-pixel blocks per row)
+    mov         r7,r9
+    mul         r7, r4                      @r7 = rounded ht * blocks per row
+    sub         r4,r12,#0                   @wd conditional check
+    sub         r7,r7,#4                    @subtract 4 for the epilog
+    cmp         r9,#0
+    beq         core_loop_wd_8_ht_2
+
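+@ the 8-wide path below is software pipelined: 'prolog' issues the first
+@ block's loads and widening shifts, 'outer_loop_wd_8' overlaps each
+@ block's stores with the next block's loads, and 'epilog'/'epilog_end'
+@ drain the final block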
+prolog:
+    add         r6,r0,r2                    @pu1_src_tmp += src_strd
+    add         r10,r1,r5
+    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
+    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp))
+    subs        r4,r4,#8                    @wd decrements by 8
+    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
+    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
+    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
+    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)
+    addle       r0,r0,r8
+    add         r6,r0,r2                    @pu1_src_tmp += src_strd
+    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
+    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
+
+    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
+    addle       r1,r1,r11,lsl #1
+    suble       r4,r12,#0                   @wd conditional check
+
+    subs        r7,r7,#4                    @ht - 4
+
+    blt         epilog_end                  @jumps to epilog_end
+    beq         epilog                      @jumps to epilog
+
+
+
+outer_loop_wd_8:
+
+    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    subs        r4,r4,#8                    @wd decrements by 8
+    addle       r0,r0,r8
+
+    add         r6,r0,r2                    @pu1_src_tmp += src_strd
+
+    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
+    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
+
+    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
+
+    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
+
+    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
+    add         r10,r1,r5
+
+    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)
+
+    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
+
+    addle       r1,r1,r11,lsl #1
+    suble       r4,r12,#0                   @wd conditional check
+
+    subs        r7,r7,#4                    @ht - 4
+    bgt         outer_loop_wd_8
+
+epilog:
+    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp))
+    @add        r6,r0,r2                @pu1_src_tmp += src_strd
+
+    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
+    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
+    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
+    add         r10,r1,r5
+    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)
+
+    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
+epilog_end:
+    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    b           end_loops
+
+core_loop_wd_8_ht_2:
+    add         r6,r0,r2                    @pu1_src_tmp += src_strd
+    add         r10,r1,r5
+    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
+    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp))
+    subs        r12,r12,#8                  @wd decrements by 8
+    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
+    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
+    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
+    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    bgt         core_loop_wd_8_ht_2
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_horz.s b/common/arm/ihevc_inter_pred_chroma_horz.s
new file mode 100644
index 0000000..fbd1be1
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_horz.s
@@ -0,0 +1,684 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_inter_pred_chroma_horz.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction interpolation.
+@*  functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  yogeswaran rs / akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    chroma inter prediction filter for horizontal input
+@*
+@* @par description:
+@*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+@*    to the elements pointed to by 'pu1_src' and writes to the location
+@*    pointed to by 'pu1_dst'. the output is downshifted by 6 and clipped
+@*    to 8 bits.
+@*    assumptions: the function is optimized assuming the width is a
+@*    multiple of 2, 4 or 8. if the width is 2, the height should be a
+@*    multiple of 2. widths of 4 and 8 are optimized further
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
+@                                   uword8 *pu1_dst,
+@                                   word32 src_strd,
+@                                   word32 dst_strd,
+@                                   word8 *pi1_coeff,
+@                                   word32 ht,
+@                                   word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pu1_src
+@r1 => *pu1_dst
+@r2 =>  src_strd
+@r3 =>  dst_strd
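+
+@ a rough scalar model of this routine (illustrative only, not the
+@ library reference code): a 4-tap filter is applied per interleaved
+@ cb/cr component (same-component samples sit 2 bytes apart), and
+@ CLIP_U8 is a stand-in name for the saturate-to-[0,255] performed by
+@ vqrshrun #6 below:
+@
+@   for(row = 0; row < ht; row++)
+@       for(col = 0; col < 2 * wd; col++)
+@       {
+@           word32 sum = 0;
+@           for(i = 0; i < 4; i++)
+@               sum += pi1_coeff[i] * pu1_src[row * src_strd + col + (i - 1) * 2];
+@           pu1_dst[row * dst_strd + col] = CLIP_U8((sum + 32) >> 6);
+@       }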
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_horz_a9q
+
+.type ihevc_inter_pred_chroma_horz_a9q, %function
+
+ihevc_inter_pred_chroma_horz_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads pi1_coeff
+    ldr         r7,[sp,#44]                 @loads ht
+    ldr         r10,[sp,#48]                @loads wd
+
+    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
+    subs        r14,r7,#0                   @checks for ht == 0
+    vabs.s8     d2,d0                       @vabs_s8(coeff)
+    mov         r11,#2
+    ble         end_loops
+
+    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+    sub         r12,r0,#2                   @pu1_src - 2
+    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+
+    tst         r10,#3                      @checks if wd is a multiple of 4
+    mov         r5,r10,lsl #1               @r5 = 2*wd
+
+    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
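+
+@ the chroma taps can be negative, so |coeff| is splatted (vabs above)
+@ and the sign is folded into the accumulate choice below: vmlsl for the
+@ negative outer taps, vmull/vmlal for the positive inner ones (hevc
+@ chroma filters, e.g. {-2, 58, 10, -2}, follow a -,+,+,- sign pattern)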
+
+    bne         outer_loop_4
+    cmp         r10,#12
+    beq         skip_16
+
+    cmp         r10,#8
+    bge         outer_loop_16
+skip_16:
+    tst         r7,#3
+
+    sub         r9,r0,#2
+    beq         outer_loop_ht_4             @ht is a multiple of 4: take the pipelined 4-row path
+
+    b           outer_loop_8
+
+
+outer_loop_16:
+    mov         r10,r5                      @2wd
+    mul         r14,r14,r10                 @r14 = ht * 2*wd
+
+    rsb         r6,r3,#16
+
+    add         r4,r12,r2
+    mov         r9,#10
+    and         r0, r12, #31
+    rsb         r8,r5,r3,lsl #1
+    pld         [r12, r2, lsl #1]
+
+
+
+
+    vld1.u32    {q0},[r12],r11              @vector load pu1_src
+    pld         [r4, r2, lsl #1]
+    vld1.u32    {q1},[r12],r11              @vector load pu1_src
+
+    vld1.u32    {q2},[r12],r11              @vector load pu1_src
+
+    vld1.u32    {q3},[r12],r9               @vector load pu1_src
+
+
+    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vld1.u32    {q4},[r4],r11               @vector load pu1_src
+    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vld1.u32    {q5},[r4],r11               @vector load pu1_src
+    vmlal.u8    q15,d4,d26                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vld1.u32    {q6},[r4],r11               @vector load pu1_src
+    vmlsl.u8    q15,d6,d27                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+    vld1.u32    {q7},[r4],r9                @vector load pu1_src
+    vmull.u8    q14,d3,d25
+
+    vmlsl.u8    q14,d1,d24
+
+
+    vmlal.u8    q14,d5,d26
+
+    vmlsl.u8    q14,d7,d27
+
+
+    cmp         r14,#32
+    beq         epilog_end
+    sub         r14,#64
+
+inner_loop_16:
+
+
+
+
+@    bgt            l_2
+
+@   pld         [r12, r2, lsl #1]
+@   pld         [r4, r2, lsl #1]
+
+
+
+    subs        r10,r10,#16
+
+    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+
+    addeq       r12,r12,r8
+    addeq       r4,r12,r2
+    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+
+
+    pld         [r12, r2, lsl #2]
+    vqrshrun.s16 d30,q15,#6
+
+    vld1.u32    {q0},[r12],r11              @vector load pu1_src
+    vqrshrun.s16 d31,q14,#6
+
+
+    vld1.u32    {q1},[r12],r11              @vector load pu1_src
+    vmlal.u8    q11,d12,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+
+
+
+    vld1.u32    {q2},[r12],r11              @vector load pu1_src
+    vmlsl.u8    q11,d14,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+    vld1.u32    {q3},[r12],r9               @vector load pu1_src
+    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    pld         [r4, r2, lsl #2]
+    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vst1.16     {q15}, [r1],r3
+    vmlal.u8    q10,d13,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    vld1.u32    {q4},[r4],r11               @vector load pu1_src
+    vmlsl.u8    q10,d15,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+    vld1.u32    {q5},[r4],r11               @vector load pu1_src
+    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {q6},[r4],r11               @vector load pu1_src
+    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {q7},[r4],r9                @vector load pu1_src
+    vmlal.u8    q15,d4,d26                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    cmp         r10,#0
+    vqrshrun.s16 d22,q11,#6
+    vqrshrun.s16 d23,q10,#6
+
+
+
+    vmlsl.u8    q15,d6,d27                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    moveq       r10,r5                      @2wd
+    vmull.u8    q14,d3,d25
+
+
+    vst1.16     {q11},[r1],r6               @store the result pu1_dst
+    vmlsl.u8    q14,d1,d24
+
+
+    addeq       r1,r1,r8
+    vmlal.u8    q14,d5,d26
+
+    subs        r14,r14,#32                 @decrement the ht loop
+    vmlsl.u8    q14,d7,d27
+
+@     mov           r0, r7
+
+    bgt         inner_loop_16
+
+
+
+    add         r14,r14,#64
+    cmp         r14,#32
+    beq         epilog_end
+
+epilog:
+    vqrshrun.s16 d30,q15,#6
+    vqrshrun.s16 d31,q14,#6
+
+
+
+    vst1.16     {q15}, [r1],r3
+    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+
+
+
+    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    subs        r10,r10,#16                 @decrement the wd loop
+    vmlal.u8    q11,d12,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    addeq       r12,r12,r8
+    vmlsl.u8    q11,d14,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+    moveq       r10,r5                      @2wd
+
+
+    addeq       r4,r12,r2
+    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vld1.u32    {q0},[r12],r11              @vector load pu1_src
+    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vld1.u32    {q1},[r12],r11              @vector load pu1_src
+    vmlal.u8    q10,d13,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vld1.u32    {q2},[r12],r11              @vector load pu1_src
+    vmlsl.u8    q10,d15,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+    vld1.u32    {q3},[r12],r9               @vector load pu1_src
+    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+
+    vld1.u32    {q4},[r4],r11               @vector load pu1_src
+    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vld1.u32    {q5},[r4],r11               @vector load pu1_src
+    vmlal.u8    q15,d4,d26                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    vmlsl.u8    q15,d6,d27                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    vld1.u32    {q6},[r4],r11               @vector load pu1_src
+    vmull.u8    q14,d3,d25
+    vld1.u32    {q7},[r4],r9                @vector load pu1_src
+    vmlsl.u8    q14,d1,d24
+    vqrshrun.s16 d22,q11,#6
+    vqrshrun.s16 d23,q10,#6
+
+    vst1.16     {q11},[r1],r6               @store the result pu1_dst
+    vmlal.u8    q14,d5,d26
+
+    vmlsl.u8    q14,d7,d27
+    addeq       r1,r1,r8
+
+
+
+epilog_end:
+    vqrshrun.s16 d30,q15,#6
+    vqrshrun.s16 d31,q14,#6
+
+
+    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vmlal.u8    q11,d12,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vmlsl.u8    q11,d14,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vmlal.u8    q10,d13,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vmlsl.u8    q10,d15,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+    vqrshrun.s16 d22,q11,#6
+    vqrshrun.s16 d23,q10,#6
+
+
+    vst1.16     {q15}, [r1],r3
+
+    vst1.16     {q11},[r1]                  @store the result pu1_dst
+
+
+
+    b           end_loops
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+outer_loop_8:
+
+
+    add         r6,r1,r3                    @pu1_dst + dst_strd
+    mov         r7,r5
+    add         r4,r12,r2                   @pu1_src + src_strd
+
+
+inner_loop_8:
+    @vld1.u32  {d0,d1},[r12],r11               @vector load pu1_src
+    vld1.u32    {d0},[r12],r11              @vector load pu1_src
+    vld1.u32    {d1},[r12],r11              @vector load pu1_src
+    vld1.u32    {d2},[r12],r11              @vector load pu1_src
+    vld1.u32    {d3},[r12],r11              @vector load pu1_src
+
+    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
+    vmull.u8    q4,d1,d25                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
+    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
+    vmlal.u8    q4,d2,d26                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vmlsl.u8    q4,d3,d27                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    vld1.u32    {d4},[r4],r11               @vector load pu1_src
+    vld1.u32    {d5},[r4],r11               @vector load pu1_src
+    vld1.u32    {d6},[r4],r11               @vector load pu1_src
+    vld1.u32    {d7},[r4],r11               @vector load pu1_src
+    @vld1.u32  {d12,d13},[r4],r11              @vector load pu1_src + src_strd
+    @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
+    vmull.u8    q5,d5,d25                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vmlsl.u8    q5,d4,d24                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
+    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
+    vqrshrun.s16 d8,q4,#6                   @right shift and saturating narrow result 1
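+    @ vqrshrun.s16 #6 computes (x + 32) >> 6, saturates to [0, 255] and
+    @ narrows 16 -> 8 bits: the downshift-and-clip noted in the header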
+    vmlal.u8    q5,d6,d26                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vmlsl.u8    q5,d7,d27                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    vst1.8      {d8},[r1]!                  @store the result pu1_dst
+
+    vqrshrun.s16 d10,q5,#6                  @right shift and saturating narrow result 2
+    subs        r7,r7,#8                    @decrement the wd loop
+    vst1.8      {d10},[r6]!                 @store the result pu1_dst
+    bgt         inner_loop_8
+
+    sub         r12,r12,r5
+    subs        r14,r14,#2                  @decrement the ht loop
+    sub         r1,r1,r5
+    add         r12,r12,r2,lsl #1
+    add         r1,r1,r3,lsl #1
+    bgt         outer_loop_8
+    b           end_loops
+
+@path taken when ht is a multiple of 4
+outer_loop_ht_4:
+
+    mov         r7,r5
+
+prologue_ht_4:
+
+inner_loop_ht_4:
+
+    mov         r12,r9
+    mov         r4,r1
+
+    sub         r8, r2, #6                  @r8 = src_strd - 6: advance to the next row after three +2 loads
+
+    vld1.u32    {d0},[r12],r11              @(1)vector load pu1_src
+    vld1.u32    {d1},[r12],r11              @(1)vector load pu1_src
+    vld1.u32    {d2},[r12],r11              @(1)vector load pu1_src
+    @vld1.u32  {d3},[r12],r2               @(1)vector load pu1_src
+    vld1.u32    {d3},[r12],r8               @(1)vector load pu1_src
+
+    @sub       r12, r12, #6                @(1)
+
+    vld1.u32    {d4},[r12],r11              @(2)vector load pu1_src
+    vld1.u32    {d5},[r12],r11              @(2)vector load pu1_src
+    vld1.u32    {d6},[r12],r11              @(2)vector load pu1_src
+    @vld1.u32  {d7},[r12],r2               @(2)vector load pu1_src
+    vld1.u32    {d7},[r12],r8               @(2)vector load pu1_src
+
+    @sub       r12, r12, #6                @(2)
+
+    vld1.u32    {d14},[r12],r11             @(3)vector load pu1_src
+    vmull.u8    q4,d1,d25                   @(1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {d15},[r12],r11             @(3)vector load pu1_src
+    vmlsl.u8    q4,d0,d24                   @(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d16},[r12],r11             @(3)vector load pu1_src
+    vmlal.u8    q4,d2,d26                   @(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    @vld1.u32  {d17},[r12],r2              @(3)vector load pu1_src
+    vld1.u32    {d17},[r12],r8              @(3)vector load pu1_src
+    vmlsl.u8    q4,d3,d27                   @(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    @sub       r12, r12, #6                @(3)
+    vmull.u8    q5,d5,d25                   @(2)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {d18},[r12],r11             @(4)vector load pu1_src
+    vmlsl.u8    q5,d4,d24                   @(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d19},[r12],r11             @(4)vector load pu1_src
+    vmlal.u8    q5,d6,d26                   @(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    vld1.u32    {d20},[r12],r11             @(4)vector load pu1_src
+    vmlsl.u8    q5,d7,d27                   @(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    vld1.u32    {d21},[r12],r2              @(4)vector load pu1_src
+    vqrshrun.s16 d8,q4,#6                   @(1)right shift and saturating narrow result 1
+
+    add         r9,r9,#8                    @(core loop)
+
+    subs        r7,r7,#8                    @(prologue)decrement the wd loop
+    beq         epilogue
+
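+@ the (1)..(4) tags below track four source rows in flight: while one
+@ row's result is narrowed and stored, the following rows' loads and
+@ multiply-accumulates proceed, forming a 4-deep software pipeline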
+core_loop:
+    mov         r12,r9
+
+    vld1.u32    {d0},[r12],r11              @(1_1)vector load pu1_src
+    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {d1},[r12],r11              @(1_1)vector load pu1_src
+    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d2},[r12],r11              @(1_1)vector load pu1_src
+    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    @vld1.u32  {d3},[r12],r2               @(1_1)vector load pu1_src
+    vld1.u32    {d3},[r12],r8               @(1_1)vector load pu1_src
+    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    @sub       r12, r12, #6                @(1_1)
+
+    vst1.8      {d8},[r4],r3                @(1)store the result pu1_dst
+    vqrshrun.s16 d10,q5,#6                  @(2)right shift and saturating narrow result 2
+
+    vld1.u32    {d4},[r12],r11              @(2_1)vector load pu1_src
+    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {d5},[r12],r11              @(2_1)vector load pu1_src
+    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d6},[r12],r11              @(2_1)vector load pu1_src
+    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    @vld1.u32  {d7},[r12],r2               @(2_1)vector load pu1_src
+    vld1.u32    {d7},[r12],r8               @(2_1)vector load pu1_src
+    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    @sub       r12, r12, #6                @(2_1)
+
+    vst1.8      {d10},[r4],r3               @(2)store the result pu1_dst
+    vqrshrun.s16 d12,q6,#6                  @(3)right shift and saturating narrow result 1
+
+    vld1.u32    {d14},[r12],r11             @(3_1)vector load pu1_src
+    vmull.u8    q4,d1,d25                   @(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {d15},[r12],r11             @(3_1)vector load pu1_src
+    vmlsl.u8    q4,d0,d24                   @(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d16},[r12],r11             @(3_1)vector load pu1_src
+    vmlal.u8    q4,d2,d26                   @(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    @vld1.u32  {d17},[r12],r2              @(3_1)vector load pu1_src
+    vld1.u32    {d17},[r12],r8              @(3_1)vector load pu1_src
+    vmlsl.u8    q4,d3,d27                   @(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    @sub       r12, r12, #6                @(3_1)
+
+    vst1.8      {d12},[r4],r3               @(3)store the result pu1_dst
+    vqrshrun.s16 d22,q11,#6                 @(4)right shift and saturating narrow result 2
+
+    add         r9,r9,#8                    @(core loop)
+
+    vmull.u8    q5,d5,d25                   @(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vld1.u32    {d18},[r12],r11             @(4_1)vector load pu1_src
+
+    vld1.u32    {d19},[r12],r11             @(4_1)vector load pu1_src
+    vmlsl.u8    q5,d4,d24                   @(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d20},[r12],r11             @(4_1)vector load pu1_src
+    vmlal.u8    q5,d6,d26                   @(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    vld1.u32    {d21},[r12],r2              @(4_1)vector load pu1_src
+    vmlsl.u8    q5,d7,d27                   @(2_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    add         r1,r1,#8                    @(core loop)
+
+    subs        r7,r7,#8                    @(core loop)
+
+    vst1.8      {d22}, [r4], r3             @(4)store the result pu1_dst
+    vqrshrun.s16 d8,q4,#6                   @(1_1)right shift and saturating narrow result 1
+
+    mov         r4, r1                      @(core loop)
+
+    bgt         core_loop                   @loopback
+
+epilogue:
+    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    vst1.8      {d8},[r4],r3                @(1)store the result pu1_dst
+    vqrshrun.s16 d10,q5,#6                  @(2)right shift and saturating narrow result 2
+
+    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    vst1.8      {d10},[r4],r3               @(2)store the result pu1_dst
+    vqrshrun.s16 d12,q6,#6                  @(3)right shift and saturating narrow result 1
+
+    vst1.8      {d12},[r4],r3               @(3)store the result pu1_dst
+
+    add         r1,r1,#8                    @(core loop)
+
+    vqrshrun.s16 d22,q11,#6                 @(4)right shift and saturating narrow result 2
+
+
+    vst1.8      {d22}, [r4], r3             @(4)store the result pu1_dst
+
+    sub         r9,r9,r5
+    subs        r14,r14,#4                  @decrement the ht loop
+    sub         r1,r1,r5
+    add         r9,r9,r2,lsl #2
+    add         r1,r1,r3,lsl #2
+    bgt         outer_loop_ht_4
+    b           end_loops
+
+outer_loop_4:
+    add         r6,r1,r3                    @pu1_dst + dst_strd
+    mov         r7,r5
+    add         r4,r12,r2                   @pu1_src + src_strd
+
+inner_loop_4:
+    @vld1.u32  {d0,d1},[r12]                   @vector load pu1_src
+
+    vld1.u32    {d0},[r12],r11              @vector load pu1_src
+    vld1.u32    {d1},[r12],r11              @vector load pu1_src
+    vld1.u32    {d2},[r12],r11              @vector load pu1_src
+    vld1.u32    {d3},[r12]                  @vector load pu1_src
+
+    sub         r12,r12,#2                  @adjust the source pointer back by 2
+    vld1.u32    {d4},[r4],r11               @vector load pu1_src
+    vld1.u32    {d5},[r4],r11               @vector load pu1_src
+    vld1.u32    {d6},[r4],r11               @vector load pu1_src
+    vld1.u32    {d7},[r4]                   @vector load pu1_src
+    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
+    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
+    @vld1.u32  {d12,d13},[r4]                  @vector load pu1_src + src_strd
+    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
+
+    sub         r4,r4,#2                    @adjust the source pointer back by 2
+    @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
+    @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
+    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
+
+    vzip.32     d0,d4                       @zip the i and ii iteration rows into a single register
+    vzip.32     d1,d5
+    vzip.32     d2,d6
+    vzip.32     d3,d7
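+    @ e.g. vzip.32 d0,d4: before, d0 = {a0, a1} and d4 = {b0, b1} in
+    @ 32-bit lanes; after, d0 = {a0, b0} and d4 = {a1, b1}, so one
+    @ multiply-accumulate sequence filters both output rows at once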
+
+    vmull.u8    q4,d1,d25                   @filter the i and ii iterations at the same time
+    vmlsl.u8    q4,d0,d24
+    vmlal.u8    q4,d2,d26
+    vmlsl.u8    q4,d3,d27
+
+    vqrshrun.s16 d8,q4,#6                   @rounding right shift and saturating narrow of the result
+    vst1.32     {d8[0]},[r1]!               @store the i iteration result from the lower half of the register
+    subs        r7,r7,#4                    @decrement the wd by 4
+
+    vst1.32     {d8[1]},[r6]!               @store the ii iteration result from the upper half of the register
+
+    bgt         inner_loop_4
+
+    sub         r12,r12,r5
+    subs        r14,r14,#2                  @decrement the ht by 2
+    sub         r1,r1,r5
+    add         r12,r12,r2,lsl #1
+    add         r1,r1,r3,lsl #1
+    bgt         outer_loop_4
+
+end_loops:
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm/ihevc_inter_pred_chroma_horz_w16out.s
new file mode 100644
index 0000000..f95937c
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_horz_w16out.s
@@ -0,0 +1,719 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_inter_pred_chroma_horz_w16out.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction interpolation.
+@*  functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  yogeswaran rs / akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*       chroma inter prediction filter storing horizontal 16-bit output
+@*
+@* @par description:
+@*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+@*    to the elements pointed to by 'pu1_src' and writes to the location
+@*    pointed to by 'pi2_dst'. no downshifting or clipping is done and the
+@*    output is used as an input for vertical filtering or weighted
+@*    prediction
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@*  word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
+@                                          word16 *pi2_dst,
+@                                          word32 src_strd,
+@                                          word32 dst_strd,
+@                                          word8 *pi1_coeff,
+@                                          word32 ht,
+@                                          word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pu1_src
+@r1 => *pi2_dst
+@r2 =>  src_strd
+@r3 =>  dst_strd
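+
+@ a rough scalar model of the w16out variant (illustrative only): the
+@ same assumed 4-tap filtering as the 8-bit version, but the raw sum is
+@ stored as word16 with no rounding, downshift or clip:
+@
+@   for(row = 0; row < ht; row++)
+@       for(col = 0; col < 2 * wd; col++)
+@       {
+@           word32 sum = 0;
+@           for(i = 0; i < 4; i++)
+@               sum += pi1_coeff[i] * pu1_src[row * src_strd + col + (i - 1) * 2];
+@           pi2_dst[row * dst_strd + col] = (word16)sum;
+@       }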
+
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_horz_w16out_a9q
+
+
+.type ihevc_inter_pred_chroma_horz_w16out_a9q, %function
+
+ihevc_inter_pred_chroma_horz_w16out_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads pi1_coeff
+    ldr         r6,[sp,#44]                 @loads ht
+    ldr         r10,[sp,#48]                @loads wd
+
+    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
+    subs        r14,r6,#0                   @checks for ht == 0
+    vabs.s8     d2,d0                       @vabs_s8(coeff)
+
+    mov         r11, #2                     @src pointer increment between the staggered vector loads
+
+    ble         end_loops
+
+    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+    sub         r12,r0,#2                   @pu1_src - 2
+    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+
+    tst         r10,#3                      @checks wd for multiples of 4
+    mov         r5,r10,lsl #1               @2wd
+
+    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+
+    and         r7,r14,#1                   @ht_residue = ht & 1
+    sub         r14,r14,r7                  @ht -= ht_residue (the residual row is handled at the end)
+
+    bne         outer_loop_4                @taken when the width is not a multiple of 4 (e.g. 2 or 6)
+
+    cmp         r10,#12
+    beq         skip_16
+
+    cmp         r10,#8
+    bge         outer_loop_16
+
+skip_16:
+    tst         r6,#3
+
+
+    sub         r9,r0,#2
+    beq         outer_loop_ht_4             @this branching happens when the height is a multiple of 4
+
+
+
+@    cmp        r10,#12
+@    beq    outer_loop_8
+@    cmp        r10,#16
+@    bge    outer_loop_16
+    b           outer_loop_8
+
+
+
+outer_loop_16:
+    add         r4,r12,r2
+
+
+    and         r0, r12, #31
+    pld         [r12, r2, lsl #1]
+
+
+
+
+
+
+
+    vld1.u32    {q0},[r12],r11              @vector load pu1_src
+    mov         r10,r5                      @2wd
+    mul         r14,r14,r10
+    vld1.u32    {q1},[r12],r11              @vector load pu1_src
+    pld         [r4, r2, lsl #1]
+    mov         r9,#10
+    vld1.u32    {q2},[r12],r11              @vector load pu1_src
+    rsb         r6,r3,#8
+    sub         r8,r3,#8
+    vld1.u32    {q3},[r12],r9               @vector load pu1_src
+
+
+    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vld1.u32    {q4},[r4],r11               @vector load pu1_src
+    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vld1.u32    {q5},[r4],r11               @vector load pu1_src
+    vmlal.u8    q15,d4,d26                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vld1.u32    {q6},[r4],r11               @vector load pu1_src
+    vmlsl.u8    q15,d6,d27                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+    vld1.u32    {q7},[r4],r9                @vector load pu1_src
+    vmull.u8    q14,d3,d25
+    lsl         r6,#1
+    rsb         r3,r5,r3,lsl #1
+    vmlsl.u8    q14,d1,d24
+    lsl         r8,#1
+    rsb         r7,r5,r2,lsl #1
+    vmlal.u8    q14,d5,d26
+
+    vmlsl.u8    q14,d7,d27
+    cmp         r14,#32
+    beq         epilog_end
+    sub         r14,#64
+
+inner_loop_16:
+
+    @ and           r7, r12, #31                    @decrement the wd loop
+    @ cmp           r7, r0
+    pld         [r12, r2, lsl #2]
+    pld         [r4, r2, lsl #2]
+
+
+    subs        r10,r10,#16
+
+    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+
+
+@    addeq      r12,r12,r2,lsl #1
+@    subeq      r12,r12,r5
+    addeq       r12,r12,r7
+    addeq       r4,r12,r2
+
+
+    vst1.16     {q15}, [r1]!
+    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+
+
+
+
+    vld1.u32    {q0},[r12],r11              @vector load pu1_src
+    vmlal.u8    q11,d12,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+
+
+
+    vld1.u32    {q1},[r12],r11              @vector load pu1_src
+    vmlsl.u8    q11,d14,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+    vld1.u32    {q2},[r12],r11              @vector load pu1_src
+    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vst1.16     {q14}, [r1],r8
+    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {q3},[r12],r9               @vector load pu1_src
+    vmlal.u8    q10,d13,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    vld1.u32    {q4},[r4],r11               @vector load pu1_src
+    vmlsl.u8    q10,d15,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+    vld1.u32    {q5},[r4],r11               @vector load pu1_src
+    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {q6},[r4],r11               @vector load pu1_src
+    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {q7},[r4],r9                @vector load pu1_src
+    vmlal.u8    q15,d4,d26                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    vst1.16     {q11},[r1]!                 @store the result pu1_dst
+    vmlsl.u8    q15,d6,d27                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    moveq       r10,r5                      @2wd
+    vmull.u8    q14,d3,d25
+
+
+
+    vmlsl.u8    q14,d1,d24
+    vst1.16     {q10},[r1],r6               @store the result pu1_dst
+
+
+    addeq       r1,r1,r3,lsl #1
+    vmlal.u8    q14,d5,d26
+
+    subs        r14,r14,#32                 @decrement the ht loop
+    vmlsl.u8    q14,d7,d27
+
+
+
+@    mov            r0, r7
+    bgt         inner_loop_16
+
+
+
+    add         r14,r14,#64
+    cmp         r14,#32
+    beq         epilog_end
+
+epilog:
+
+    vst1.16     {q15}, [r1]!
+    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vst1.16     {q14}, [r1],r8
+
+
+
+    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    subs        r10,r10,#16                 @decrement the wd loop
+    vmlal.u8    q11,d12,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+@    addeq      r12,r12,r2,lsl #1
+    addeq       r12,r12,r7
+    vmlsl.u8    q11,d14,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+    @ subeq     r12,r12,r5
+    moveq       r10,r5                      @2wd
+    addeq       r4,r12,r2
+    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vld1.u32    {q0},[r12],r11              @vector load pu1_src
+    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vld1.u32    {q1},[r12],r11              @vector load pu1_src
+    vmlal.u8    q10,d13,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vld1.u32    {q2},[r12],r11              @vector load pu1_src
+    vmlsl.u8    q10,d15,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+    vld1.u32    {q3},[r12],r9               @vector load pu1_src
+    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+
+    vld1.u32    {q4},[r4],r11               @vector load pu1_src
+    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vld1.u32    {q5},[r4],r11               @vector load pu1_src
+    vmlal.u8    q15,d4,d26                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    vmlsl.u8    q15,d6,d27                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    vld1.u32    {q6},[r4],r11               @vector load pu1_src
+    vmull.u8    q14,d3,d25
+    vld1.u32    {q7},[r4],r9                @vector load pu1_src
+    vmlsl.u8    q14,d1,d24
+    vst1.16     {q11},[r1]!                 @store the result pu1_dst
+    vmlal.u8    q14,d5,d26
+    vst1.16     {q10},[r1],r6               @store the result pu1_dst
+    vmlsl.u8    q14,d7,d27
+    addeq       r1,r1,r3,lsl #1
+
+
+epilog_end:
+
+    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vmlal.u8    q11,d12,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vmlsl.u8    q11,d14,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vmlal.u8    q10,d13,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vmlsl.u8    q10,d15,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+    vst1.16     {q15}, [r1]!
+    vst1.16     {q14}, [r1],r8
+    vst1.16     {q11},[r1]!                 @store the result pu1_dst
+    vst1.16     {q10},[r1],r6               @store the result pu1_dst
+
+
+    ldr         r6,[sp,#44]                 @loads ht
+
+    and         r7,r6,#1
+
+    cmp         r7,#0
+    mov         r10,r5
+    addne       r12,r12,r2,lsl #1
+    subne       r12,r12,r5
+    addne       r1,r1,r3,lsl #1
+
+
+    bgt         loop_residue_4
+
+    b           end_loops
+
+
+
+
+outer_loop_8:
+
+    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
+    mov         r10,r5                      @2wd
+    add         r4,r12,r2                   @pu1_src + src_strd
+
+inner_loop_8:
+    @vld1.u32  {d0,d1},[r12],r11               @vector load pu1_src
+    vld1.u32    {d0},[r12],r11              @vector load pu1_src
+    vld1.u32    {d1},[r12],r11              @vector load pu1_src
+    vld1.u32    {d2},[r12],r11              @vector load pu1_src
+    vld1.u32    {d3},[r12],r11              @vector load pu1_src
+
+
+    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
+    vmull.u8    q4,d1,d25                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
+    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
+    vmlal.u8    q4,d2,d26                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vmlsl.u8    q4,d3,d27                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    @vld1.u32  {d12,d13},[r4],r11              @vector load pu1_src + src_strd
+    vld1.u32    {d4},[r4],r11               @vector load pu1_src
+    vld1.u32    {d5},[r4],r11               @vector load pu1_src
+    vld1.u32    {d6},[r4],r11               @vector load pu1_src
+    vld1.u32    {d7},[r4],r11               @vector load pu1_src
+    @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
+    vmull.u8    q5,d5,d25                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vmlsl.u8    q5,d4,d24                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
+    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
+    vmlal.u8    q5,d6,d26                   @mul_res = vmlal_u8(src[0_0], coeffabs_0)@
+    vmlsl.u8    q5,d7,d27                   @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
+
+    vst1.16     {d8, d9}, [r1]!
+
+    subs        r10,r10,#8                  @decrement the wd loop
+    vst1.16     {d10, d11},[r6]!            @store the result pu1_dst
+    bgt         inner_loop_8
+
+    sub         r12,r12,r5
+    subs        r14,r14,#2                  @decrement the ht loop
+    sub         r1,r1,r5,lsl #1
+    add         r12,r12,r2,lsl #1
+    add         r1,r1,r3,lsl #2
+    bgt         outer_loop_8
+
+    cmp         r7,#0
+    mov         r10,r5
+    bgt         loop_residue_4
+
+    b           end_loops
+
+
+
+@handles the case when ht == 4
+outer_loop_ht_4:
+
+    mov         r10,r5
+
+prologue_ht_4:
+    mov         r8,r3,lsl #1
+
+inner_loop_ht_4:
+
+    mov         r12,r9
+    mov         r4,r1
+
+    sub         r0, r2, #6                  @ not sure if r0 needs to be preserved
+
+    vld1.u32    {d0},[r12],r11              @(1)vector load pu1_src
+    vld1.u32    {d1},[r12],r11              @(1)vector load pu1_src
+    vld1.u32    {d2},[r12],r11              @(1)vector load pu1_src
+    vld1.u32    {d3},[r12],r0               @(1)vector load pu1_src
+
+    vld1.u32    {d4},[r12],r11              @(2)vector load pu1_src
+    vld1.u32    {d5},[r12],r11              @(2)vector load pu1_src
+    vld1.u32    {d6},[r12],r11              @(2)vector load pu1_src
+    vld1.u32    {d7},[r12],r0               @(2)vector load pu1_src
+
+    vld1.u32    {d14},[r12],r11             @(3)vector load pu1_src
+    vmull.u8    q4,d1,d25                   @(1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {d15},[r12],r11             @(3)vector load pu1_src
+    vmlsl.u8    q4,d0,d24                   @(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d16},[r12],r11             @(3)vector load pu1_src
+    vmlal.u8    q4,d2,d26                   @(1)mul_res = vmlal_u8(src[0_0], coeffabs_0)@
+
+    vld1.u32    {d17},[r12],r0              @(3)vector load pu1_src
+    vmlsl.u8    q4,d3,d27                   @(1)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
+
+    vld1.u32    {d18},[r12],r11             @(4)vector load pu1_src
+    vmull.u8    q5,d5,d25                   @(2)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {d19},[r12],r11             @(4)vector load pu1_src
+    vmlsl.u8    q5,d4,d24                   @(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d20},[r12],r11             @(4)vector load pu1_src
+    vmlal.u8    q5,d6,d26                   @(2)mul_res = vmlal_u8(src[0_0], coeffabs_0)@
+
+    vld1.u32    {d21},[r12],r2              @(4)vector load pu1_src
+    vmlsl.u8    q5,d7,d27                   @(2)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
+
+    add         r9,r9,#8                    @(core loop)
+
+    subs        r10,r10,#8                  @(prologue)decrement the wd loop
+    beq         epilogue
+
+core_loop:
+    vst1.16     {d8, d9},[r4],r8            @(1)store the result pu1_dst
+    mov         r12,r9
+
+    vld1.u32    {d0},[r12],r11              @(1_1)vector load pu1_src
+    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {d1},[r12],r11              @(1_1)vector load pu1_src
+    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d2},[r12],r11              @(1_1)vector load pu1_src
+    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[0_0], coeffabs_0)@
+
+    vld1.u32    {d3},[r12],r0               @(1_1)vector load pu1_src
+    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
+
+    vst1.16     {d10, d11},[r4],r8          @(2)store the result pu1_dst
+    add         r9,r9,#8                    @(core loop)
+
+    vld1.u32    {d4},[r12],r11              @(2_1)vector load pu1_src
+    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {d5},[r12],r11              @(2_1)vector load pu1_src
+    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d6},[r12],r11              @(2_1)vector load pu1_src
+    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[0_0], coeffabs_0)@
+
+    vld1.u32    {d7},[r12],r0               @(2_1)vector load pu1_src
+    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
+
+    vst1.16     {d12, d13},[r4],r8          @(3)store the result pu1_dst
+    add         r1,r1,#16                   @(core loop)
+
+    vld1.u32    {d14},[r12],r11             @(3_1)vector load pu1_src
+    vmull.u8    q4,d1,d25                   @(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vld1.u32    {d15},[r12],r11             @(3_1)vector load pu1_src
+    vmlsl.u8    q4,d0,d24                   @(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d16},[r12],r11             @(3_1)vector load pu1_src
+    vmlal.u8    q4,d2,d26                   @(1_1)mul_res = vmlal_u8(src[0_0], coeffabs_0)@
+
+    vld1.u32    {d17},[r12],r0              @(3_1)vector load pu1_src
+    vmlsl.u8    q4,d3,d27                   @(1_1)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
+
+    vst1.16     {d22, d23}, [r4], r8        @(4)store the result pu1_dst
+    subs        r10,r10,#8                  @(core loop)
+
+    vmull.u8    q5,d5,d25                   @(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vld1.u32    {d18},[r12],r11             @(4_1)vector load pu1_src
+
+    vld1.u32    {d19},[r12],r11             @(4_1)vector load pu1_src
+    vmlsl.u8    q5,d4,d24                   @(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vld1.u32    {d20},[r12],r11             @(4_1)vector load pu1_src
+    vmlal.u8    q5,d6,d26                   @(2_1)mul_res = vmlal_u8(src[0_0], coeffabs_0)@
+
+    mov         r4, r1                      @(core loop)
+
+    vld1.u32    {d21},[r12],r0              @(4_1)vector load pu1_src
+    vmlsl.u8    q5,d7,d27                   @(2_1)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
+
+
+
+    bgt         core_loop                   @loopback
+
+epilogue:
+    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[0_0], coeffabs_0)@
+
+    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
+
+    vst1.16     {d8, d9},[r4], r8           @(1)store the result pu1_dst
+
+    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[0_0], coeffabs_0)@
+
+    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
+
+    vst1.16     {d10, d11},[r4], r8         @(2)store the result pu1_dst
+
+    vst1.16     {d12, d13},[r4], r8         @(3)store the result pu1_dst
+
+    add         r1,r1,#16                   @(core loop)
+
+    vst1.16     {d22, d23},[r4], r8         @(4)store the result pu1_dst
+
+    sub         r9,r9,r5
+    subs        r14,r14,#4                  @decrement the ht loop
+    sub         r1,r1,r5,lsl #1
+    add         r9,r9,r2,lsl #2
+    add         r1,r1,r3,lsl #3
+    bgt         outer_loop_ht_4
+
+    cmp         r7,#0
+    mov         r10,r5
+    movgt       r12,r9
+    movgt       r4,r1
+    bgt         loop_residue_4
+
+    b           end_loops
+
+outer_loop_4:
+    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
+    mov         r10,r5
+    add         r4,r12,r2                   @pu1_src + src_strd
+
+inner_loop_4:
+    @vld1.u32  {d0,d1},[r12]                   @vector load pu1_src
+    vld1.u32    {d0},[r12],r11              @vector load pu1_src
+    vld1.u32    {d1},[r12],r11              @vector load pu1_src
+    vld1.u32    {d2},[r12],r11              @vector load pu1_src
+    vld1.u32    {d3},[r12]                  @vector load pu1_src
+
+@**** removal
+    @add       r12,r12,#4                      @increment the input pointer
+@**** removal ends
+@**** addn
+    sub         r12,r12,#2                  @decrement the input pointer
+@**** addn ends
+    vld1.u32    {d4},[r4],r11               @vector load pu1_src
+    vld1.u32    {d5},[r4],r11               @vector load pu1_src
+    vld1.u32    {d6},[r4],r11               @vector load pu1_src
+    vld1.u32    {d7},[r4]                   @vector load pu1_src
+    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
+    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
+    @vld1.u32  {d12,d13},[r4]                  @vector load pu1_src + src_strd
+    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
+
+    @add       r4,r4,#4                        @increment the input pointer
+    sub         r4,r4,#2                    @decrement the input pointer
+    @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
+    @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
+    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
+
+@**** removal
+    @vzip.32   d0,d12                          @vector zip the i iteration and ii iteration in single register
+    @vzip.32   d2,d14
+    @vzip.32   d4,d16
+    @vzip.32   d6,d18
+@**** removal ends
+@**** addn
+    vzip.32     d0,d4                       @vector zip the i iteration and ii iteration in single register
+    vzip.32     d1,d5
+    vzip.32     d2,d6
+    vzip.32     d3,d7
+@**** addn ends
+
+    vmull.u8    q4,d1,d25                   @arithmetic operations for the i and ii iterations at the same time
+    vmlsl.u8    q4,d0,d24
+    vmlal.u8    q4,d2,d26
+    vmlsl.u8    q4,d3,d27
+
+    vst1.32     {d8},[r1]!                  @store the i iteration result which is in the lower part of the register
+    subs        r10,r10,#4                  @decrement the wd by 4
+
+    vst1.32     {d9},[r6]!                  @store the ii iteration result which is in the upper part of the register
+
+    bgt         inner_loop_4
+
+    sub         r12,r12,r5
+    subs        r14,r14,#2                  @decrement the ht by 2
+    sub         r1,r1,r5,lsl #1
+    add         r12,r12,r2,lsl #1
+    add         r1,r1,r3,lsl #2
+    bgt         outer_loop_4
+
+    cmp         r7,#0
+    mov         r10,r5
+    beq         end_loops
+
+loop_residue_4:
+
+    mov         r10,r5                      @2wd
+
+loop_residue:
+
+    @vld1.u32  {d0,d1},[r12]                   @vector load pu1_src
+    vld1.u32    {d0},[r12],r11              @vector load pu1_src
+    vld1.u32    {d1},[r12],r11              @vector load pu1_src
+    vld1.u32    {d2},[r12],r11              @vector load pu1_src
+    vld1.u32    {d3},[r12]                  @vector load pu1_src
+    @vext.u8       d2,d0,d1,#2             @vector extract of src[0_2]
+    @vmull.u8      q4,d2,d25               @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    @vmlsl.u8      q4,d0,d24               @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    @vext.u8       d4,d0,d1,#4             @vector extract of src[0_4]
+    @add           r12,r12,#4              @pu1_src + 4
+    sub         r12, r12, #2
+    @vext.u8       d6,d0,d1,#6             @vector extract of src[0_6]
+    @vmlal.u8      q4,d4,d26               @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    @vmlsl.u8      q4,d6,d27               @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+    vmull.u8    q4,d1,d25
+    vmlsl.u8    q4,d0,d24
+    vmlal.u8    q4,d2,d26
+    vmlsl.u8    q4,d3,d27
+
+    vst1.64     {d8},[r1]                   @store the result pu1_dst
+    subs        r10,r10,#4                  @decrement the wd loop
+    add         r1,r1,#8                    @pi2_dst + 8
+
+    bgt         loop_residue                @loop again
+
+    @inner loop ends
+    @add           r8,r3,lsl #1            @2*dst_strd
+    @sub           r8,r8,r5,lsl #1         @2*dst_strd - 2wd
+    @sub           r9,r2,r5                @src_strd - 2wd
+    @subs          r7,r7,#1                @decrement the ht loop
+    @add           r12,r12,r9              @pu1_src + src_strd
+    @add           r1,r1,r8                @pu1_dst + 2*dst_strd
+    @bgt           outer_loop_residue_4    @loop again
+    @b                 end_loops               @jumps to end
+
+end_loops:
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_vert.s b/common/arm/ihevc_inter_pred_chroma_vert.s
new file mode 100644
index 0000000..e786497
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_vert.s
@@ -0,0 +1,383 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_inter_pred_chroma_vert.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction  interpolation.
+@* functions are coded in neon assembly and can be compiled using
+@* rvct
+@*
+@*
+@* @author
+@*  yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*   chroma interprediction filter for vertical input
+@*
+@* @par description:
+@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@*    the elements pointed to by 'pu1_src' and writes to the location pointed
+@*    to by 'pu1_dst'. the output is down shifted by 6 and clipped to 8 bits.
+@*    assumptions: the function is optimized assuming width is a multiple of
+@*    2, 4 or 8 and height is a multiple of 2. widths of 4 and 8 are
+@*    optimized further
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
+@                                   uword8 *pu1_dst,
+@                                   word32 src_strd,
+@                                   word32 dst_strd,
+@                                   word8 *pi1_coeff,
+@                                   word32 ht,
+@                                   word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pu1_src
+@r1 => *pu1_dst
+@r2 =>  src_strd
+@r3 =>  dst_strd
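+@
+@a minimal scalar c sketch of the arithmetic this routine implements,
+@assuming the standard hevc 4-tap chroma filter with signed coefficients
+@(illustrative only; not part of the build):
+@
+@   for(row = 0; row < ht; row++)
+@       for(col = 0; col < 2 * wd; col++)   /* interleaved cb/cr samples */
+@       {
+@           int sum = 0, t;
+@           /* taps span rows -1..2; pu1_src is pre-decremented by src_strd */
+@           for(t = 0; t < 4; t++)
+@               sum += pi1_coeff[t] * pu1_src[(row + t - 1) * src_strd + col];
+@           sum = (sum + 32) >> 6;          /* round and downshift by 6 */
+@           pu1_dst[row * dst_strd + col] = sum < 0 ? 0 : (sum > 255 ? 255 : sum);
+@       }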
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_vert_a9q
+
+.type ihevc_inter_pred_chroma_vert_a9q, %function
+
+ihevc_inter_pred_chroma_vert_a9q:
+
+    stmfd       sp!,{r4-r12,r14}            @stack stores the values of the arguments
+
+    ldr         r4,[sp,#44]                 @loads ht
+    ldr         r12,[sp,#40]                @loads pi1_coeff
+    cmp         r4,#0                       @checks ht == 0
+    ldr         r6,[sp,#48]                 @loads wd
+    sub         r0,r0,r2                    @pu1_src - src_strd
+    vld1.8      {d0},[r12]                  @loads pi1_coeff
+
+    ble         end_loops                   @jumps to end
+
+    tst         r6,#3                       @checks (wd & 3)
+    vabs.s8     d3,d0                       @vabs_s8(coeff)
+    lsl         r10,r6,#1                   @2*wd
+    vdup.8      d0,d3[0]                    @coeffabs_0
+    vdup.8      d1,d3[1]                    @coeffabs_1
+    vdup.8      d2,d3[2]                    @coeffabs_2
+    vdup.8      d3,d3[3]                    @coeffabs_3
+
+    bgt         outer_loop_wd_2             @jumps to loop handling wd ==2
+
+    tst         r4,#7                       @checks ht for mul of 8
+    beq         core_loop_ht_8              @when height is multiple of 8
+
+    lsl         r7,r3,#1                    @2*dst_strd
+    sub         r9,r7,r10                   @2*dst_strd - 2wd
+    lsl         r12,r2,#1                   @2*src_strd
+    sub         r8,r12,r10                  @2*src_strd - 2wd
+    mov         r5,r10                      @2wd
+
+inner_loop_ht_2:                            @called when wd is multiple of 4 and ht is 4,2
+
+    add         r6,r0,r2                    @pu1_src +src_strd
+    vld1.8      {d9},[r6],r2                @loads pu1_src
+    subs        r5,r5,#8                    @2wd - 8
+    vld1.8      {d5},[r0]!                  @loads src
+    vmull.u8    q3,d9,d1                    @vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+    vld1.8      {d4},[r6],r2                @loads incremented src
+    vmlsl.u8    q3,d5,d0                    @vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
+    vld1.8      {d8},[r6],r2                @loads incremented src
+    vmlal.u8    q3,d4,d2                    @vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
+    vmull.u8    q2,d4,d1
+    vmlsl.u8    q3,d8,d3
+    vmlsl.u8    q2,d9,d0
+    vld1.8      {d10},[r6]                  @loads the incremented src
+    vmlal.u8    q2,d8,d2
+    vqrshrun.s16 d6,q3,#6                   @shifts right
+    vmlsl.u8    q2,d10,d3
+    add         r6,r1,r3                    @pu1_dst + dst_strd
+    vqrshrun.s16 d4,q2,#6                   @shifts right
+    vst1.8      {d6},[r1]!                  @stores the loaded value
+
+    vst1.8      {d4},[r6]                   @stores the loaded value
+
+    bgt         inner_loop_ht_2             @inner loop again
+
+    subs        r4,r4,#2                    @ht - 2
+    add         r1,r1,r9                    @pu1_dst += (2*dst_strd - 2wd)
+    mov         r5,r10                      @2wd
+    add         r0,r0,r8                    @pu1_src += (2*src_strd - 2wd)
+
+    bgt         inner_loop_ht_2             @loop again
+
+    b           end_loops                   @jumps to end
+
+outer_loop_wd_2:                            @called when width is multiple of 2
+    lsl         r5,r3,#1                    @2*dst_strd
+    mov         r12,r10                     @2wd
+    sub         r9,r5,r10                   @2*dst_strd - 2wd
+    lsl         r7,r2,#1                    @2*src_strd
+    sub         r8,r7,r10                   @2*src_strd - 2wd
+
+inner_loop_wd_2:
+
+    add         r6,r0,r2                    @pu1_src + src_strd
+    vld1.32     {d6[0]},[r0]                @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)
+    subs        r12,r12,#4                  @2wd - 4
+    add         r0,r0,#4                    @pu1_src + 4
+    vld1.32     {d6[1]},[r6],r2             @loads pu1_src_tmp
+    vdup.32     d7,d6[1]
+    vld1.32     {d7[1]},[r6],r2             @loads pu1_src_tmp
+    vmull.u8    q2,d7,d1                    @vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+    vdup.32     d7,d7[1]
+    vld1.32     {d7[1]},[r6],r2
+    vmlsl.u8    q2,d6,d0
+    vmlal.u8    q2,d7,d2
+    vdup.32     d7,d7[1]
+    vld1.32     {d7[1]},[r6]
+    add         r6,r1,r3                    @pu1_dst + dst_strd
+    vmlsl.u8    q2,d7,d3
+    vqrshrun.s16 d4,q2,#6                   @vrshrq_n_s16(vreinterpretq_s16_u16(mul_res1),6)
+    vst1.32     {d4[0]},[r1]                @stores the loaded value
+    add         r1,r1,#4                    @pu1_dst += 4
+    vst1.32     {d4[1]},[r6]                @stores the loaded value
+
+    bgt         inner_loop_wd_2             @inner loop again
+
+    @inner loop ends
+    subs        r4,r4,#2                    @ht - 2
+    add         r1,r1,r9                    @pu1_dst += 2*dst_strd - 2*wd
+    mov         r12,r10                     @2wd
+    add         r0,r0,r8                    @pu1_src += 2*src_strd - 2*wd
+
+    bgt         inner_loop_wd_2             @loop again
+
+    b           end_loops                   @jumps to end
+
+core_loop_ht_8:                             @when wd & ht is multiple of 8
+
+    lsl         r12,r3,#2                   @4*dst_strd
+    sub         r8,r12,r10                  @4*dst_strd - 2wd
+    lsl         r12,r2,#2                   @4*src_strd
+    sub         r9,r12,r10                  @4*src_strd - 2wd
+
+    bic         r5,r10,#7                   @r5 ->wd
+    mov         r14,r10,lsr #3              @divide by 8
+    mul         r12,r4,r14                  @multiply ht by (2wd / 8), the number of 8-sample blocks
+    sub         r12,#4                      @subtract 4 (one block iteration) for the epilog
+
+prolog:
+    add         r6,r0,r2                    @pu1_src + src_strd
+    vld1.8      {d5},[r6],r2                @loads pu1_src
+    subs        r5,r5,#8                    @2wd - 8
+    vld1.8      {d4},[r0]!                  @loads the source
+    vld1.8      {d6},[r6],r2                @load and increment
+    vmull.u8    q15,d5,d1                   @mul with coeff 1
+    vld1.8      {d7},[r6],r2                @load and increment
+    vmlsl.u8    q15,d4,d0
+    add         r7,r1,r3                    @pu1_dst
+    vmlal.u8    q15,d6,d2
+    vmlsl.u8    q15,d7,d3
+    vld1.8      {d8},[r6],r2                @load and increment
+
+    vmull.u8    q14,d6,d1                   @mul_res 2
+    addle       r0,r0,r9                    @pu1_src += 4*src_strd - 2*wd
+    vmlsl.u8    q14,d5,d0
+    bicle       r5,r10,#7                   @r5 ->wd
+    vmlal.u8    q14,d7,d2
+    vld1.8      {d9},[r6],r2
+    vmlsl.u8    q14,d8,d3
+    vqrshrun.s16 d30,q15,#6
+
+    vld1.8      {d10},[r6],r2
+    vmull.u8    q13,d7,d1
+    add         r6,r0,r2                    @pu1_src + src_strd
+    vmlsl.u8    q13,d6,d0
+    vst1.8      {d30},[r1]!                 @stores the loaded value
+    vmlal.u8    q13,d8,d2
+    vld1.8      {d4},[r0]!                  @loads the source
+    vmlsl.u8    q13,d9,d3
+    vqrshrun.s16 d28,q14,#6
+
+    addle       r1,r1,r8                    @pu1_dst += 4*dst_strd - 2*wd
+    vmull.u8    q12,d8,d1
+    vld1.8      {d5},[r6],r2                @loads pu1_src
+    vmlsl.u8    q12,d7,d0
+    subs        r12,r12,#4
+    vld1.8      {d6},[r6],r2                @load and increment
+    vmlal.u8    q12,d9,d2
+    vld1.8      {d7},[r6],r2                @load and increment
+    vmlsl.u8    q12,d10,d3
+
+    lsl         r11,r2,#2
+    vst1.8      {d28},[r7],r3               @stores the loaded value
+    vqrshrun.s16 d26,q13,#6
+    rsb         r11,r2,r2,lsl #3
+    add         r14,r2,r2,lsl #1
+    add         r14,r14,r11
+    ble         epilog                      @jumps to epilog
+
+kernel_8:
+
+    vmull.u8    q15,d5,d1                   @mul with coeff 1
+    subs        r5,r5,#8                    @2wd - 8
+    vmlsl.u8    q15,d4,d0
+    addle       r0,r0,r9                    @pu1_src += 4*src_strd - 2*wd
+    vmlal.u8    q15,d6,d2
+    rsble       r11,r2,r2,lsl #3
+    vmlsl.u8    q15,d7,d3
+    vst1.8      {d26},[r7],r3               @stores the loaded value
+    vqrshrun.s16 d24,q12,#6
+
+    vld1.8      {d8},[r6],r2                @load and increment
+
+    vmull.u8    q14,d6,d1                   @mul_res 2
+    bicle       r5,r10,#7                   @r5 ->wd
+    vmlsl.u8    q14,d5,d0
+    vst1.8      {d24},[r7],r3               @stores the loaded value
+
+    vmlal.u8    q14,d7,d2
+
+    vld1.8      {d9},[r6],r2
+    vqrshrun.s16 d30,q15,#6
+
+    vmlsl.u8    q14,d8,d3
+    vld1.8      {d10},[r6],r2
+    add         r7,r1,r3                    @pu1_dst
+    vmull.u8    q13,d7,d1
+    add         r6,r0,r2                    @pu1_src + src_strd
+
+    pld         [r0,r11]
+
+
+    vmlsl.u8    q13,d6,d0
+    vld1.8      {d4},[r0]!                  @loads the source
+
+    vmlal.u8    q13,d8,d2
+    vst1.8      {d30},[r1]!                 @stores the loaded value
+
+    vmlsl.u8    q13,d9,d3
+    vld1.8      {d5},[r6],r2                @loads pu1_src
+
+    add         r11,r11,r2
+    vqrshrun.s16 d28,q14,#6
+
+    vmull.u8    q12,d8,d1
+    vld1.8      {d6},[r6],r2                @load and increment
+    addle       r1,r1,r8                    @pu1_dst += 4*dst_strd - 2*wd
+
+    cmp         r11,r14
+    rsbgt       r11,r2,r2,lsl #3
+
+    vmlsl.u8    q12,d7,d0
+    subs        r12,r12,#4
+
+    vmlal.u8    q12,d9,d2
+    vld1.8      {d7},[r6],r2                @load and increment
+
+    vmlsl.u8    q12,d10,d3
+    vst1.8      {d28},[r7],r3               @stores the loaded value
+    vqrshrun.s16 d26,q13,#6
+
+    bgt         kernel_8                    @jumps to kernel_8
+
+epilog:
+
+    vmull.u8    q15,d5,d1                   @mul with coeff 1
+    vmlsl.u8    q15,d4,d0
+    vmlal.u8    q15,d6,d2
+    vmlsl.u8    q15,d7,d3
+    vst1.8      {d26},[r7],r3               @stores the loaded value
+    vqrshrun.s16 d24,q12,#6
+
+    vld1.8      {d8},[r6],r2                @load and increment
+    vmull.u8    q14,d6,d1                   @mul_res 2
+    vmlsl.u8    q14,d5,d0
+    vmlal.u8    q14,d7,d2
+    vmlsl.u8    q14,d8,d3
+    vst1.8      {d24},[r7],r3               @stores the loaded value
+    vqrshrun.s16 d30,q15,#6
+
+    vld1.8      {d9},[r6],r2
+    vmull.u8    q13,d7,d1
+    add         r7,r1,r3                    @pu1_dst
+    vmlsl.u8    q13,d6,d0
+    vst1.8      {d30},[r1]!                 @stores the loaded value
+
+    vqrshrun.s16 d28,q14,#6
+    vmlal.u8    q13,d8,d2
+    vld1.8      {d10},[r6],r2
+    vmlsl.u8    q13,d9,d3
+
+    vmull.u8    q12,d8,d1
+    vqrshrun.s16 d26,q13,#6
+    vst1.8      {d28},[r7],r3               @stores the loaded value
+    vmlsl.u8    q12,d7,d0
+    vmlal.u8    q12,d9,d2
+    vst1.8      {d26},[r7],r3               @stores the loaded value
+    vmlsl.u8    q12,d10,d3
+
+    vqrshrun.s16 d24,q12,#6
+    vst1.8      {d24},[r7],r3               @stores the loaded value
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
new file mode 100644
index 0000000..ba2ea8e
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
@@ -0,0 +1,342 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_inter_pred_chroma_vert_w16inp.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction  interpolation.
+@* functions are coded in neon assembly and can be compiled using
+@* rvct
+@*
+@*
+@* @author
+@*  yogeswaran rs / parthiban
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*       chroma interprediction filter for 16bit vertical input.
+@*
+@* @par description:
+@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@*    the elements pointed to by 'pi2_src' and writes to the location pointed
+@*    to by 'pu1_dst'. the input is 16 bits. the filter output is downshifted
+@*    by 12 and clipped to lie between 0 and 255.
+@*    assumptions: the function is optimized assuming width and height are
+@*    multiples of 2.
+@*
+@* @param[in] pi2_src
+@*  word16 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
+@                                          uword8 *pu1_dst,
+@                                          word32 src_strd,
+@                                          word32 dst_strd,
+@                                          word8 *pi1_coeff,
+@                                          word32 ht,
+@                                          word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_dst
+@r2 =>  src_strd
+@r3 =>  dst_strd
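+@
+@a minimal scalar c sketch of the two-stage shift this routine implements
+@(illustrative only, with the neon saturation elided for brevity; mirrors
+@the vqshrn.s32 #6 followed by vqrshrun.s16 #6 pattern used below):
+@
+@   for(row = 0; row < ht; row++)
+@       for(col = 0; col < 2 * wd; col++)   /* interleaved cb/cr samples */
+@       {
+@           int32_t sum = 0; int t;
+@           for(t = 0; t < 4; t++)
+@               sum += pi1_coeff[t] * pi2_src[(row + t - 1) * src_strd + col];
+@           int16_t tmp = (int16_t)(sum >> 6);  /* first stage: narrow to 16 bits */
+@           int val = (tmp + 32) >> 6;          /* second stage: round and shift  */
+@           pu1_dst[row * dst_strd + col] = val < 0 ? 0 : (val > 255 ? 255 : val);
+@       }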
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_vert_w16inp_a9q
+
+.type ihevc_inter_pred_chroma_vert_w16inp_a9q, %function
+
+ihevc_inter_pred_chroma_vert_w16inp_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4, [sp,#40]                @loads pi1_coeff
+    ldr         r6, [sp,#48]                @wd
+    lsl         r2,r2,#1                    @src_strd = 2* src_strd
+    ldr         r5,[sp,#44]                 @loads ht
+    vld1.8      {d0},[r4]                   @loads pi1_coeff
+    sub         r4,r0,r2                    @pu1_src - src_strd
+    vmovl.s8    q0,d0                       @sign extend the coefficients to 16 bits
+
+    tst         r6,#3                       @checks (wd & 3), i.e. wd == 2
+    vdup.16     d12,d0[0]                   @coeff_0
+    vdup.16     d13,d0[1]                   @coeff_1
+    vdup.16     d14,d0[2]                   @coeff_2
+    vdup.16     d15,d0[3]                   @coeff_3
+
+    bgt         core_loop_ht_2              @jumps to the loop that handles wd == 2
+
+    tst         r5,#3                       @checks ht == mul of 4
+    beq         core_loop_ht_4              @jumps to loop handles ht mul of 4
+
+core_loop_ht_2:
+    lsl         r7,r2,#1                    @2*src_strd
+    lsl         r12,r3,#1                   @2*dst_strd
+    lsl         r9,r6,#2                    @4*wd
+    sub         r6,r12,r6,lsl #1            @2*dst_strd - 2*wd
+    sub         r8,r7,r9                    @2*src_strd - 4*wd
+    mov         r12,r9                      @4wd
+
+inner_loop_ht_2:
+    add         r0,r4,r2                    @increments pi2_src
+    vld1.16     {d0},[r4]!                  @loads pu1_src
+    vmull.s16   q0,d0,d12                   @vmull_s16(src_tmp1, coeff_0)
+    subs        r12,r12,#8                  @4wd - 8
+    vld1.16     {d2},[r0],r2                @loads pi2_src
+    vmull.s16   q4,d2,d12                   @vmull_s16(src_tmp2, coeff_0)
+    vld1.16     {d3},[r0],r2                @loads pi2_src
+    vmlal.s16   q0,d2,d13
+    vld1.16     {d6},[r0],r2
+    vmlal.s16   q4,d3,d13
+    vld1.16     {d2},[r0]
+    add         r7,r1,r3                    @pu1_dst + dst_strd
+    vmlal.s16   q0,d3,d14
+    vmlal.s16   q4,d6,d14
+    vmlal.s16   q0,d6,d15
+    vmlal.s16   q4,d2,d15
+    vqshrn.s32  d0,q0,#6                    @right shift
+    vqshrn.s32  d30,q4,#6                   @right shift
+    vqrshrun.s16 d0,q0,#6                   @rounding shift
+    vqrshrun.s16 d30,q15,#6                 @rounding shift
+    vst1.32     {d0[0]},[r1]!               @stores the loaded value
+    vst1.32     {d30[0]},[r7]               @stores the loaded value
+    bgt         inner_loop_ht_2             @inner loop -again
+
+    @inner loop ends
+    subs        r5,r5,#2                    @ht - 2
+    add         r1,r1,r6                    @pu1_dst += 2*dst_strd - 2*wd
+    mov         r12,r9                      @4wd
+    add         r4,r4,r8                    @pi1_src_tmp1 += 2*src_strd - 4*wd
+    bgt         inner_loop_ht_2             @loop again
+
+    b           end_loops                   @jumps to end
+
+core_loop_ht_4:
+    lsl         r7,r2,#2                    @4*src_strd
+    lsl         r12,r3,#2                   @4*dst_strd
+    mov         r11,r6,lsr #1               @divide by 2
+    sub         lr,r12,r6,lsl #1            @4*dst_strd - 2*wd
+    sub         r8,r7,r6,lsl #2             @4*src_strd - 4*wd
+
+    mul         r12,r5,r11                  @multiply ht by (wd / 2)
+    sub         r12,#4                      @subtract 4 (one iteration) for the epilog
+    mov         r11,r6,lsl #1               @2*wd
+
+prolog:
+    add         r0,r4,r2                    @increments pi2_src
+    vld1.16     {d0},[r4]!                  @loads pu1_src
+    vld1.16     {d1},[r0],r2                @loads pi2_src
+    subs        r11,r11,#4
+    vld1.16     {d2},[r0],r2                @loads pi2_src
+    vmull.s16   q15,d0,d12                  @vmull_s16(src_tmp1, coeff_0)
+    vld1.16     {d3},[r0],r2
+    vmlal.s16   q15,d1,d13
+    vmlal.s16   q15,d2,d14
+    add         r9,r1,r3                    @pu1_dst + dst_strd
+    vmlal.s16   q15,d3,d15
+
+    vld1.16     {d4},[r0],r2
+    vmull.s16   q14,d1,d12                  @vmull_s16(src_tmp2, coeff_0)
+    addle       r4,r4,r8
+    vmlal.s16   q14,d2,d13
+    vld1.s16    {d5},[r0],r2
+    vmlal.s16   q14,d3,d14
+    vld1.s16    {d6},[r0],r2
+    vmlal.s16   q14,d4,d15
+    movle       r11,r6,lsl #1
+
+    vqshrn.s32  d30,q15,#6                  @right shift
+
+    vmull.s16   q13,d2,d12                  @vmull_s16(src_tmp2, coeff_0)
+    add         r0,r4,r2
+    vmlal.s16   q13,d3,d13
+    vmlal.s16   q13,d4,d14
+    vld1.16     {d0},[r4]!                  @loads pu1_src
+    vmlal.s16   q13,d5,d15
+
+    vqrshrun.s16 d30,q15,#6                 @rounding shift
+    vqshrn.s32  d28,q14,#6                  @right shift
+
+    vld1.16     {d1},[r0],r2                @loads pi2_src
+    vmull.s16   q12,d3,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vst1.32     {d30[0]},[r1]!              @stores the loaded value
+    vmlal.s16   q12,d4,d13
+    vld1.16     {d2},[r0],r2                @loads pi2_src
+    vmlal.s16   q12,d5,d14
+    vld1.16     {d3},[r0],r2
+    vmlal.s16   q12,d6,d15
+    addle       r1,r1,lr
+
+    vqshrn.s32  d26,q13,#6                  @right shift
+    subs        r12,r12,#4
+    vqrshrun.s16 d28,q14,#6                 @rounding shift
+
+    beq         epilog                      @jumps to epilog
+
+kernel_4:
+    vmull.s16   q15,d0,d12                  @vmull_s16(src_tmp1, coeff_0)
+    subs        r11,r11,#4
+    vmlal.s16   q15,d1,d13
+    vst1.32     {d28[0]},[r9],r3            @stores the loaded value
+    vmlal.s16   q15,d2,d14
+    vmlal.s16   q15,d3,d15
+
+    vqshrn.s32  d24,q12,#6                  @right shift
+    vqrshrun.s16 d26,q13,#6                 @rounding shift
+
+    vld1.16     {d4},[r0],r2
+    vmull.s16   q14,d1,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vmlal.s16   q14,d2,d13
+    vmlal.s16   q14,d3,d14
+    vmlal.s16   q14,d4,d15
+    vst1.32     {d26[0]},[r9],r3            @stores the loaded value
+    addle       r4,r4,r8
+    movle       r11,r6,lsl #1
+
+    vqshrn.s32  d30,q15,#6                  @right shift
+    vqrshrun.s16 d24,q12,#6                 @rounding shift
+
+    vld1.s16    {d5},[r0],r2
+    vmull.s16   q13,d2,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vld1.s16    {d6},[r0],r2
+    vmlal.s16   q13,d3,d13
+    vst1.32     {d24[0]},[r9]               @stores the loaded value
+    add         r0,r4,r2
+    vmlal.s16   q13,d4,d14
+    vld1.16     {d0},[r4]!                  @loads pu1_src
+    vmlal.s16   q13,d5,d15
+
+    vqshrn.s32  d28,q14,#6                  @right shift
+    vqrshrun.s16 d30,q15,#6                 @rounding shift
+
+    vld1.16     {d1},[r0],r2                @loads pi2_src
+    vmull.s16   q12,d3,d12                  @vmull_s16(src_tmp2, coeff_0)
+    add         r9,r1,r3                    @pu1_dst + dst_strd
+    vld1.16     {d2},[r0],r2                @loads pi2_src
+    vmlal.s16   q12,d4,d13
+    vld1.16     {d3},[r0],r2
+    vmlal.s16   q12,d5,d14
+
+    vst1.32     {d30[0]},[r1]!              @stores the loaded value
+    vmlal.s16   q12,d6,d15
+
+    vqshrn.s32  d26,q13,#6                  @right shift
+    vqrshrun.s16 d28,q14,#6                 @rounding shift
+    addle       r1,r1,lr
+
+    subs        r12,r12,#4
+
+    bgt         kernel_4                    @jumps to kernel_4
+
+epilog:
+    vmull.s16   q15,d0,d12                  @vmull_s16(src_tmp1, coeff_0)
+    vst1.32     {d28[0]},[r9],r3            @stores the loaded value
+    vmlal.s16   q15,d1,d13
+    vmlal.s16   q15,d2,d14
+    vmlal.s16   q15,d3,d15
+
+    vqshrn.s32  d24,q12,#6                  @right shift
+    vqrshrun.s16 d26,q13,#6                 @rounding shift
+
+    vmull.s16   q14,d1,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vld1.16     {d4},[r0],r2
+    vmlal.s16   q14,d2,d13
+    vst1.32     {d26[0]},[r9],r3            @stores the loaded value
+    vmlal.s16   q14,d3,d14
+    vmlal.s16   q14,d4,d15
+
+    vqshrn.s32  d30,q15,#6                  @right shift
+    vqrshrun.s16 d24,q12,#6                 @rounding shift
+
+    vmull.s16   q13,d2,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vld1.s16    {d5},[r0],r2
+    vmlal.s16   q13,d3,d13
+    vmlal.s16   q13,d4,d14
+    vmlal.s16   q13,d5,d15
+
+    vqshrn.s32  d28,q14,#6                  @right shift
+    vqrshrun.s16 d30,q15,#6                 @rounding shift
+
+    vst1.32     {d24[0]},[r9]               @stores the loaded value
+    vmull.s16   q12,d3,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vmlal.s16   q12,d4,d13
+    add         r9,r1,r3                    @pu1_dst + dst_strd
+    vld1.s16    {d6},[r0],r2
+    vmlal.s16   q12,d5,d14
+    vmlal.s16   q12,d6,d15
+    vst1.32     {d30[0]},[r1]!              @stores the loaded value
+
+    vqrshrun.s16 d28,q14,#6                 @rounding shift
+    vqshrn.s32  d26,q13,#6                  @right shift
+
+    vst1.32     {d28[0]},[r9],r3            @stores the loaded value
+    vqrshrun.s16 d26,q13,#6                 @rounding shift
+
+    vqshrn.s32  d24,q12,#6                  @right shift
+    vst1.32     {d26[0]},[r9],r3            @stores the loaded value
+    vqrshrun.s16 d24,q12,#6                 @rounding shift
+
+    vst1.32     {d24[0]},[r9]               @stores the loaded value
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
new file mode 100644
index 0000000..00b3011
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
@@ -0,0 +1,329 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction  interpolation.
+@* functions are coded in neon assembly and can be compiled using
+@* rvct
+@*
+@*
+@* @author
+@*  yogeswaran rs / parthiban
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    chroma interprediction filter for 16bit vertical input and output.
+@*
+@* @par description:
+@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@*    the elements pointed to by 'pi2_src' and writes to the location pointed
+@*    to by 'pi2_dst'. the input is 16 bits. the filter output is downshifted
+@*    by 6 and 8192 is subtracted to store it as a 16 bit number. the output
+@*    is used as an input to weighted prediction.
+@*    assumptions: the function is optimized assuming width and height are
+@*    multiples of 2.
+@*
+@* @param[in] pi2_src
+@*  word16 pointer to the source
+@*
+@* @param[out] pi2_dst
+@*  word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src,
+@                                                 word16 *pi2_dst,
+@                                                 word32 src_strd,
+@                                                 word32 dst_strd,
+@                                                 word8 *pi1_coeff,
+@                                                 word32 ht,
+@                                                 word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pi2_src
+@r1 => *pi2_dst
+@r2 =>  src_strd
+@r3 =>  dst_strd
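+@
+@a minimal scalar c sketch of the intended arithmetic (illustrative only;
+@the code below performs just the saturating downshift by 6, and the 8192
+@offset described above is assumed to be carried by the biased 16 bit input
+@rather than subtracted explicitly - an assumption, not verified here):
+@
+@   for(row = 0; row < ht; row++)
+@       for(col = 0; col < 2 * wd; col++)   /* interleaved cb/cr samples */
+@       {
+@           int32_t sum = 0; int t;
+@           for(t = 0; t < 4; t++)
+@               sum += pi1_coeff[t] * pi2_src[(row + t - 1) * src_strd + col];
+@           pi2_dst[row * dst_strd + col] = (int16_t)(sum >> 6); /* saturating in neon */
+@       }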
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q
+
+.type ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q, %function
+
+ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4, [sp,#40]                @loads pi1_coeff
+    ldr         r6, [sp,#48]                @wd
+    lsl         r2,r2,#1                    @src_strd = 2* src_strd
+    ldr         r5,[sp,#44]                 @loads ht
+    vld1.8      {d0},[r4]                   @loads pi1_coeff
+    sub         r4,r0,r2                    @pu1_src - src_strd
+    vmovl.s8    q0,d0                       @sign extend the coefficients to 16 bits
+
+    tst         r6,#3                       @checks (wd & 3), i.e. wd == 2
+    vdup.16     d12,d0[0]                   @coeff_0
+    vdup.16     d13,d0[1]                   @coeff_1
+    vdup.16     d14,d0[2]                   @coeff_2
+    vdup.16     d15,d0[3]                   @coeff_3
+
+    bgt         core_loop_ht_2              @jumps to the loop that handles wd == 2
+
+    tst         r5,#3                       @checks ht == mul of 4
+    beq         core_loop_ht_4              @jumps to loop handles ht mul of 4
+
+core_loop_ht_2:
+    lsl         r7,r2,#1                    @2*src_strd
+    lsl         r3,r3,#1                    @2*dst_strd
+    lsl         r9,r6,#2                    @4*wd
+    sub         r6,r3,r6,lsl #1             @2*dst_strd - 2*wd
+    sub         r8,r7,r9                    @2*src_strd - 4*wd
+    mov         r12,r9                      @4wd
+
+inner_loop_ht_2:
+    add         r0,r4,r2                    @increments pi2_src
+    vld1.16     {d0},[r4]!                  @loads pu1_src
+    vmull.s16   q0,d0,d12                   @vmull_s16(src_tmp1, coeff_0)
+    subs        r12,r12,#8                  @4wd - 8
+    vld1.16     {d2},[r0],r2                @loads pi2_src
+    vmull.s16   q4,d2,d12                   @vmull_s16(src_tmp2, coeff_0)
+    vld1.16     {d3},[r0],r2                @loads pi2_src
+    vmlal.s16   q0,d2,d13
+    vld1.16     {d6},[r0],r2
+    vmlal.s16   q4,d3,d13
+    vld1.16     {d2},[r0]
+    add         r7,r1,r3                    @pu1_dst + dst_strd
+    vmlal.s16   q0,d3,d14
+    vmlal.s16   q4,d6,d14
+    vmlal.s16   q0,d6,d15
+    vmlal.s16   q4,d2,d15
+    vqshrn.s32  d0,q0,#6                    @right shift
+    vqshrn.s32  d30,q4,#6                   @right shift
+    vst1.32     {d0},[r1]!                  @stores the loaded value
+    vst1.32     {d30},[r7]                  @stores the loaded value
+    bgt         inner_loop_ht_2             @inner loop -again
+
+    @inner loop ends
+    subs        r5,r5,#2                    @ht - 2
+    add         r1,r1,r6,lsl #1             @pu1_dst += 2*dst_strd - 2*wd
+    mov         r12,r9                      @4wd
+    add         r4,r4,r8                    @pi1_src_tmp1 += 2*src_strd - 4*wd
+    bgt         inner_loop_ht_2             @loop again
+
+    b           end_loops                   @jumps to end
+
+core_loop_ht_4:
+    lsl         r7,r2,#2                    @4*src_strd
+    lsl         r10,r3,#2                   @4*dst_strd
+    mov         r11,r6,lsr #1               @divide by 2
+    sub         lr,r10,r6,lsl #1            @4*dst_strd - 2*wd
+    sub         r8,r7,r6,lsl #2             @4*src_strd - 4*wd
+
+    mul         r12,r5,r11                  @multiply ht by (wd / 2)
+    sub         r12,#4                      @subtract 4 (one iteration) for the epilog
+    mov         r11,r6,lsl #1               @2*wd
+    lsl         r3,r3,#1                    @2*dst_strd
+
+prolog:
+    add         r0,r4,r2                    @increments pi2_src
+    vld1.16     {d0},[r4]!                  @loads pu1_src
+    vld1.16     {d1},[r0],r2                @loads pi2_src
+    subs        r11,r11,#4
+    vld1.16     {d2},[r0],r2                @loads pi2_src
+    vmull.s16   q15,d0,d12                  @vmull_s16(src_tmp1, coeff_0)
+    vld1.16     {d3},[r0],r2
+    vmlal.s16   q15,d1,d13
+    vmlal.s16   q15,d2,d14
+    add         r9,r1,r3                    @pu1_dst + dst_strd
+    vmlal.s16   q15,d3,d15
+
+    vld1.16     {d4},[r0],r2
+    vmull.s16   q14,d1,d12                  @vmull_s16(src_tmp2, coeff_0)
+    addle       r4,r4,r8
+    movle       r11,r6,lsl #1
+    vmlal.s16   q14,d2,d13
+    vmlal.s16   q14,d3,d14
+    vld1.s16    {d5},[r0],r2
+    vmlal.s16   q14,d4,d15
+
+    vqshrn.s32  d30,q15,#6                  @right shift
+
+    vld1.s16    {d6},[r0],r2
+    vmull.s16   q13,d2,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vmlal.s16   q13,d3,d13
+    vmlal.s16   q13,d4,d14
+    add         r0,r4,r2
+    vld1.16     {d0},[r4]!                  @loads pu1_src
+    vmlal.s16   q13,d5,d15
+
+    vqshrn.s32  d28,q14,#6                  @right shift
+
+    vld1.16     {d1},[r0],r2                @loads pi2_src
+    vmull.s16   q12,d3,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vst1.32     {d30},[r1]!                 @stores the loaded value
+    vmlal.s16   q12,d4,d13
+    vld1.16     {d2},[r0],r2                @loads pi2_src
+    vmlal.s16   q12,d5,d14
+    vld1.16     {d3},[r0],r2
+    vmlal.s16   q12,d6,d15
+    addle       r1,r1,lr,lsl #1
+
+    vqshrn.s32  d26,q13,#6                  @right shift
+    subs        r12,r12,#4
+
+    beq         epilog                      @jumps to epilog
+
+kernel_4:
+    vmull.s16   q15,d0,d12                  @vmull_s16(src_tmp1, coeff_0)
+    subs        r11,r11,#4
+    vmlal.s16   q15,d1,d13
+    vst1.32     {d28},[r9],r3               @stores the loaded value
+    vmlal.s16   q15,d2,d14
+    vmlal.s16   q15,d3,d15
+
+    vqshrn.s32  d24,q12,#6                  @right shift
+
+    vld1.16     {d4},[r0],r2
+    vmull.s16   q14,d1,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vmlal.s16   q14,d2,d13
+    vmlal.s16   q14,d3,d14
+    vmlal.s16   q14,d4,d15
+    vst1.32     {d26},[r9],r3               @stores the loaded value
+    addle       r4,r4,r8
+    movle       r11,r6,lsl #1
+
+    vqshrn.s32  d30,q15,#6                  @right shift
+
+    vld1.s16    {d5},[r0],r2
+    vmull.s16   q13,d2,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vld1.s16    {d6},[r0],r2
+    vmlal.s16   q13,d3,d13
+    vst1.32     {d24},[r9]                  @stores the loaded value
+    add         r0,r4,r2
+    vmlal.s16   q13,d4,d14
+    vld1.16     {d0},[r4]!                  @loads pu1_src
+    vmlal.s16   q13,d5,d15
+
+    vqshrn.s32  d28,q14,#6                  @right shift
+
+    vld1.16     {d1},[r0],r2                @loads pi2_src
+    vmull.s16   q12,d3,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vld1.16     {d2},[r0],r2                @loads pi2_src
+    vmlal.s16   q12,d4,d13
+    add         r9,r1,r3                    @pu1_dst + dst_strd
+    vld1.16     {d3},[r0],r2
+    vmlal.s16   q12,d5,d14
+
+    vst1.32     {d30},[r1]!                 @stores the loaded value
+    vmlal.s16   q12,d6,d15
+
+    vqshrn.s32  d26,q13,#6                  @right shift
+    addle       r1,r1,lr,lsl #1
+
+    subs        r12,r12,#4
+
+    bgt         kernel_4                    @jumps to kernel_4
+
+epilog:
+    vmull.s16   q15,d0,d12                  @vmull_s16(src_tmp1, coeff_0)
+    vst1.32     {d28},[r9],r3               @stores the loaded value
+    vmlal.s16   q15,d1,d13
+    vmlal.s16   q15,d2,d14
+    vmlal.s16   q15,d3,d15
+
+    vqshrn.s32  d24,q12,#6                  @right shift
+
+    vmull.s16   q14,d1,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vld1.16     {d4},[r0],r2
+    vmlal.s16   q14,d2,d13
+    vst1.32     {d26},[r9],r3               @stores the loaded value
+    vmlal.s16   q14,d3,d14
+    vmlal.s16   q14,d4,d15
+
+    vqshrn.s32  d30,q15,#6                  @right shift
+
+    vmull.s16   q13,d2,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vld1.s16    {d5},[r0],r2
+    vmlal.s16   q13,d3,d13
+    vmlal.s16   q13,d4,d14
+    vmlal.s16   q13,d5,d15
+
+    vqshrn.s32  d28,q14,#6                  @right shift
+
+    vst1.32     {d24},[r9]                  @stores the loaded value
+    vmull.s16   q12,d3,d12                  @vmull_s16(src_tmp2, coeff_0)
+    vmlal.s16   q12,d4,d13
+    add         r9,r1,r3                    @pu1_dst + dst_strd
+    vld1.s16    {d6},[r0],r2
+    vmlal.s16   q12,d5,d14
+    vmlal.s16   q12,d6,d15
+    vst1.32     {d30},[r1]!                 @stores the loaded value
+
+    vqshrn.s32  d26,q13,#6                  @right shift
+
+    vst1.32     {d28},[r9],r3               @stores the loaded value
+
+    vqshrn.s32  d24,q12,#6                  @right shift
+    vst1.32     {d26},[r9],r3               @stores the loaded value
+
+    vst1.32     {d24},[r9]                  @stores the loaded value
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm/ihevc_inter_pred_chroma_vert_w16out.s
new file mode 100644
index 0000000..6e6776c
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16out.s
@@ -0,0 +1,367 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_inter_pred_chroma_vert_w16out.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction  interpolation.
+@* functions are coded in neon assembly and can be compiled using
+@* rvct
+@*
+@*
+@* @author
+@*  yogeswaran rs / parthiban
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*   interprediction chroma filter to store vertical 16 bit output
+@*
+@* @par description:
+@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@*    the elements pointed to by 'pu1_src' and writes to the location pointed
+@*    to by 'pi2_dst'. no downshifting or clipping is done and the output is
+@*    used as an input for weighted prediction.
+@*    assumptions: the function is optimized assuming width is a multiple of
+@*    2, 4 or 8 and height is a multiple of 2. widths of 4 and 8 are
+@*    optimized further
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@*  word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*****************************************************************************
+@*/
+@void ihevc_inter_pred_chroma_vert_w16out(uword8 *pu1_src,
+@                                           word16 *pi2_dst,
+@                                           word32 src_strd,
+@                                           word32 dst_strd,
+@                                           word8 *pi1_coeff,
+@                                           word32 ht,
+@                                           word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pu1_src
+@r1 => *pi2_dst
+@r2 =>  src_strd
+@r3 =>  dst_strd
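+@
+@a minimal scalar c sketch of the intended arithmetic (illustrative only;
+@the 16 bit intermediate is stored with no rounding, shifting or clipping):
+@
+@   for(row = 0; row < ht; row++)
+@       for(col = 0; col < 2 * wd; col++)   /* interleaved cb/cr samples */
+@       {
+@           int16_t sum = 0; int t;
+@           for(t = 0; t < 4; t++)
+@               sum += pi1_coeff[t] * pu1_src[(row + t - 1) * src_strd + col];
+@           pi2_dst[row * dst_strd + col] = sum;
+@       }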
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_vert_w16out_a9q
+
+.type ihevc_inter_pred_chroma_vert_w16out_a9q, %function
+
+ihevc_inter_pred_chroma_vert_w16out_a9q:
+
+    stmfd       sp!,{r4-r12,r14}            @stack stores the values of the arguments
+
+    ldr         r4,[sp,#44]                 @loads ht
+    ldr         r12,[sp,#40]                @loads pi1_coeff
+    cmp         r4,#0                       @checks ht == 0
+    ldr         r6,[sp,#48]                 @loads wd
+    sub         r0,r0,r2                    @pu1_src - src_strd
+    vld1.8      {d0},[r12]                  @loads pi1_coeff
+
+    ble         end_loops                   @jumps to end
+
+    tst         r6,#3                       @checks (wd & 3)
+    vabs.s8     d3,d0                       @vabs_s8(coeff)
+    lsl         r10,r6,#1                   @2*wd
+    vdup.8      d0,d3[0]                    @coeffabs_0
+    vdup.8      d1,d3[1]                    @coeffabs_1
+    vdup.8      d2,d3[2]                    @coeffabs_2
+    vdup.8      d3,d3[3]                    @coeffabs_3
+
+    bgt         outer_loop_wd_2             @jumps to loop handling wd ==2
+
+    tst         r4,#7                       @checks ht for mul of 8
+    beq         core_loop_ht_8              @when height is multiple of 8
+
+    lsl         r7,r3,#2                    @2*dst_strd
+    sub         r9,r7,r10,lsl #1            @4*dst_strd - 4wd
+    lsl         r12,r2,#1                   @2*src_strd
+    sub         r8,r12,r10                  @2*src_strd - 2wd
+    mov         r3,r3,lsl #1
+    mov         r5,r10                      @2wd
+
+inner_loop_ht_2:                            @called when wd is multiple of 4 and ht is 4,2
+
+    add         r6,r0,r2                    @pu1_src +src_strd
+    vld1.8      {d9},[r6],r2                @loads pu1_src
+    subs        r5,r5,#8                    @2wd - 8
+    vld1.8      {d5},[r0]!                  @loads src
+    vmull.u8    q3,d9,d1                    @vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+    vld1.8      {d4},[r6],r2                @loads incremented src
+    vmlsl.u8    q3,d5,d0                    @vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
+    vld1.8      {d8},[r6],r2                @loads incremented src
+    vmlal.u8    q3,d4,d2                    @vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
+    vmull.u8    q2,d4,d1
+    vld1.8      {d10},[r6]                  @loads the incremented src
+    vmlsl.u8    q3,d8,d3
+    vmlsl.u8    q2,d9,d0
+    vmlal.u8    q2,d8,d2
+    vmlsl.u8    q2,d10,d3
+    add         r6,r1,r3                    @pu1_dst + dst_strd
+    vst1.8      {q3},[r1]!                  @stores the result
+
+    vst1.8      {q2},[r6]                   @stores the result
+
+    bgt         inner_loop_ht_2             @inner loop again
+
+    subs        r4,r4,#2                    @ht - 2
+    add         r1,r1,r9                    @pu1_dst += (2*dst_strd - 2wd)
+    mov         r5,r10                      @2wd
+    add         r0,r0,r8                    @pu1_src += (2*src_strd - 2wd)
+
+    bgt         inner_loop_ht_2             @loop again
+
+    b           end_loops                   @jumps to end
+
+outer_loop_wd_2:                            @called when width is multiple of 2
+    lsl         r5,r3,#2                    @2*dst_strd
+    mov         r12,r10                     @2wd
+    sub         r9,r5,r10,lsl #1            @4*dst_strd - 4wd
+    lsl         r7,r2,#1                    @2*src_strd
+    sub         r8,r7,r10                   @2*src_strd - 2wd
+
+inner_loop_wd_2:
+
+    add         r6,r0,r2                    @pu1_src + src_strd
+    vld1.32     {d6[0]},[r0]                @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)
+    subs        r12,r12,#4                  @2wd - 4
+    add         r0,r0,#4                    @pu1_src + 4
+    vld1.32     {d6[1]},[r6],r2             @loads pu1_src_tmp
+    vdup.32     d7,d6[1]
+    vld1.32     {d7[1]},[r6],r2             @loads pu1_src_tmp
+    vmull.u8    q2,d7,d1                    @vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+    vdup.32     d7,d7[1]
+    vld1.32     {d7[1]},[r6],r2
+    vmlsl.u8    q2,d6,d0
+    vmlal.u8    q2,d7,d2
+    vdup.32     d7,d7[1]
+    vld1.32     {d7[1]},[r6]
+    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
+    vmlsl.u8    q2,d7,d3
+    vst1.32     {d4},[r1]                   @stores the result
+    add         r1,r1,#8                    @pu1_dst += 4
+    vst1.32     {d5},[r6]                   @stores the result
+
+    bgt         inner_loop_wd_2             @inner loop again
+
+    @inner loop ends
+    subs        r4,r4,#2                    @ht - 2
+    add         r1,r1,r9                    @pu1_dst += 2*dst_strd - 2*wd
+    mov         r12,r10                     @2wd
+    add         r0,r0,r8                    @pu1_src += 2*src_strd - 2*wd
+
+    bgt         inner_loop_wd_2             @loop again
+
+    b           end_loops                   @jumps to end
+
+core_loop_ht_8:                             @when wd & ht is multiple of 8
+
+    lsl         r12,r3,#3                   @4*dst_strd
+    sub         r8,r12,r10,lsl #1           @4*dst_strd - 2wd
+    lsl         r12,r2,#2                   @4*src_strd
+    sub         r9,r12,r10                  @4*src_strd - 2wd
+
+    bic         r5,r10,#7                   @r5 ->wd
+    mov         r14,r10,lsr #3              @divide by 8
+    mul         r12,r4,r14                  @loop count = ht * (2wd / 8)
+    sub         r12,#4                      @reserve one kernel iteration for the epilog
+    mov         r3,r3,lsl #1
+
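+@ this multiple-of-8 path is software pipelined: 'prolog' issues the first
+@ round of loads and multiply-accumulates, 'kernel_8' overlaps the loads and
+@ macs of one block with the stores of the previous one, and 'epilog' drains
+@ the results still in flight.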
+prolog:
+    add         r6,r0,r2                    @pu1_src + src_strd
+    vld1.8      {d5},[r6],r2                @loads pu1_src
+    subs        r5,r5,#8                    @2wd - 8
+    vld1.8      {d4},[r0]!                  @loads the source
+    vld1.8      {d6},[r6],r2                @load and increment
+    vmull.u8    q15,d5,d1                   @mul with coeff 1
+    vld1.8      {d7},[r6],r2                @load and increment
+    vmlsl.u8    q15,d4,d0
+    add         r7,r1,r3                    @pu1_dst
+    vmlal.u8    q15,d6,d2
+    vmlsl.u8    q15,d7,d3
+    vld1.8      {d8},[r6],r2                @load and increment
+
+    vmull.u8    q14,d6,d1                   @mul_res 2
+    addle       r0,r0,r9                    @pu1_src += 4*src_strd - 2*wd
+    vmlsl.u8    q14,d5,d0
+    bicle       r5,r10,#7                   @r5 ->wd
+    vmlal.u8    q14,d7,d2
+    vld1.8      {d9},[r6],r2
+    vmlsl.u8    q14,d8,d3
+
+    vld1.8      {d10},[r6],r2
+    vmull.u8    q13,d7,d1
+    add         r6,r0,r2                    @pu1_src + src_strd
+    vmlsl.u8    q13,d6,d0
+    vst1.8      {q15},[r1]!                 @stores the result
+    vmlal.u8    q13,d8,d2
+    vld1.8      {d4},[r0]!                  @loads the source
+    vmlsl.u8    q13,d9,d3
+
+    addle       r1,r1,r8                    @pu1_dst += 4*dst_strd - 2*wd
+    vmull.u8    q12,d8,d1
+    vld1.8      {d5},[r6],r2                @loads pu1_src
+    vmlsl.u8    q12,d7,d0
+    subs        r12,r12,#4
+    vld1.8      {d6},[r6],r2                @load and increment
+    vmlal.u8    q12,d9,d2
+    vld1.8      {d7},[r6],r2                @load and increment
+    vmlsl.u8    q12,d10,d3
+    rsb         r11,r2,r2,lsl #3
+    add         r14,r2,r2,lsl #1
+    add         r14,r14,r11
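+    @ r11 and r14 manage the prefetch distance for the pld in kernel_8: r11
+    @ starts at 7*src_strd, advances by src_strd each iteration, and wraps
+    @ back to 7*src_strd once it passes r14 = 10*src_strd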
+    vst1.8      {q14},[r7],r3               @stores the result
+
+    ble         epilog                      @jumps to epilog
+
+kernel_8:
+
+    vmull.u8    q15,d5,d1                   @mul with coeff 1
+    subs        r5,r5,#8                    @2wd - 8
+    vmlsl.u8    q15,d4,d0
+    addle       r0,r0,r9                    @pu1_src += 4*src_strd - 2*wd
+    vmlal.u8    q15,d6,d2
+    rsble       r11,r2,r2,lsl #3
+    vmlsl.u8    q15,d7,d3
+    vst1.8      {q13},[r7],r3               @stores the result
+
+    vld1.8      {d8},[r6],r2                @load and increment
+
+    vmull.u8    q14,d6,d1                   @mul_res 2
+    bicle       r5,r10,#7                   @r5 ->wd
+    vmlsl.u8    q14,d5,d0
+    vst1.8      {q12},[r7],r3               @stores the result
+
+    vmlal.u8    q14,d7,d2
+    vld1.8      {d9},[r6],r2
+
+    vmlsl.u8    q14,d8,d3
+    vld1.8      {d10},[r6],r2
+    add         r7,r1,r3                    @pu1_dst
+    vmull.u8    q13,d7,d1
+    add         r6,r0,r2                    @pu1_src + src_strd
+    pld         [r0,r11]
+
+    vmlsl.u8    q13,d6,d0
+    vld1.8      {d4},[r0]!                  @loads the source
+
+    add         r11,r11,r2
+    vmlal.u8    q13,d8,d2
+    vst1.8      {q15},[r1]!                 @stores the result
+
+    vmlsl.u8    q13,d9,d3
+    vld1.8      {d5},[r6],r2                @loads pu1_src
+
+    vmull.u8    q12,d8,d1
+    vld1.8      {d6},[r6],r2                @load and increment
+    addle       r1,r1,r8                    @pu1_dst += 4*dst_strd - 2*wd
+
+    cmp         r11,r14
+    rsbgt       r11,r2,r2,lsl #3
+
+    vmlsl.u8    q12,d7,d0
+    subs        r12,r12,#4
+
+
+    vmlal.u8    q12,d9,d2
+    vld1.8      {d7},[r6],r2                @load and increment
+
+    vmlsl.u8    q12,d10,d3
+    vst1.8      {q14},[r7],r3               @stores the result
+
+    bgt         kernel_8                    @jumps to kernel_8
+
+epilog:
+
+    vmull.u8    q15,d5,d1                   @mul with coeff 1
+    vmlsl.u8    q15,d4,d0
+    vmlal.u8    q15,d6,d2
+    vmlsl.u8    q15,d7,d3
+    vst1.8      {q13},[r7],r3               @stores the result
+
+    vld1.8      {d8},[r6],r2                @load and increment
+    vmull.u8    q14,d6,d1                   @mul_res 2
+    vmlsl.u8    q14,d5,d0
+    vmlal.u8    q14,d7,d2
+    vmlsl.u8    q14,d8,d3
+    vst1.8      {q12},[r7],r3               @stores the result
+
+    vld1.8      {d9},[r6],r2
+    vmull.u8    q13,d7,d1
+    add         r7,r1,r3                    @pu1_dst
+    vmlsl.u8    q13,d6,d0
+    vst1.8      {q15},[r1]!                 @stores the result
+    vmlal.u8    q13,d8,d2
+    vld1.8      {d10},[r6],r2
+    vmlsl.u8    q13,d9,d3
+
+    vmull.u8    q12,d8,d1
+    vst1.8      {q14},[r7],r3               @stores the result
+    vmlsl.u8    q12,d7,d0
+    vmlal.u8    q12,d9,d2
+    vst1.8      {q13},[r7],r3               @stores the result
+    vmlsl.u8    q12,d10,d3
+
+    vst1.8      {q12},[r7],r3               @stores the result
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
diff --git a/common/arm/ihevc_inter_pred_filters_luma_horz.s b/common/arm/ihevc_inter_pred_filters_luma_horz.s
new file mode 100644
index 0000000..ee98923
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_filters_luma_horz.s
@@ -0,0 +1,536 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@******************************************************************************
+@* @file
+@*  ihevc_inter_pred_filters_luma_horz.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction interpolation.
+@*  functions are coded using neon intrinsics and can be compiled using
+@*  rvct
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*
+@*  - ihevc_inter_pred_luma_horz()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+@/* include reconstruction */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*     inter prediction luma filter for horizontal input
+@*
+@* @par description:
+@*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+@*    to the elements pointed to by 'pu1_src' and writes to the location
+@*    pointed to by 'pu1_dst'. the output is downshifted by 6 and clipped to
+@*    8 bits.
+@*    assumptions: the function is optimized assuming the width is a multiple
+@*    of 4 or 8 and the height is a multiple of 2.
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_horz (
+@                            uword8 *pu1_src,
+@                            uword8 *pu1_dst,
+@                            word32 src_strd,
+@                            word32 dst_strd,
+@                            word8 *pi1_coeff,
+@                            word32 ht,
+@                            word32 wd   )
+
+@**************variables vs registers*****************************************
+@   r0 => *pu1_src
+@   r1 => *pu1_dst
+@   r2 =>  src_strd
+@   r3 =>  dst_strd
+@   r4 => *pi1_coeff
+@   r5 =>  ht
+@   r6 =>  wd
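+
+@ reference: a plain-c sketch of the filter implemented below (this file is
+@ replicated from ihevc_inter_pred_filters.c; loop and variable names beyond
+@ the arguments above are illustrative). an 8-tap horizontal filter is
+@ applied and the result is rounded, downshifted by 6 and clipped to 8 bits:
+@
+@    for(row = 0; row < ht; row++)
+@    {
+@        for(col = 0; col < wd; col++)
+@        {
+@            word16 sum = 0;
+@            for(i = 0; i < 8; i++)             /* taps at cols -3..4 */
+@                sum += pi1_coeff[i] * pu1_src[col + (i - 3)];
+@            pu1_dst[col] = CLIP_U8((sum + 32) >> 6);   /* vqrshrun #6 */
+@        }
+@        pu1_src += src_strd;
+@        pu1_dst += dst_strd;
+@    }
+@
+@ CLIP_U8 stands for clipping to [0, 255]; the macro name is illustrative.
+@ the neon code folds the coefficient signs into vmlsl/vmlal after vabs.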
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_horz_a9q
+
+.type ihevc_inter_pred_luma_horz_a9q, %function
+
+ihevc_inter_pred_luma_horz_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    @str        r1,[sp,#-4]
+    @ mov       r7,#8192
+start_loop_count:
+    @ ldr       r1,[sp,#-4]
+
+
+    ldr         r4,[sp,#40]                 @loads pi1_coeff
+    ldr         r8,[sp,#44]                 @loads ht
+    ldr         r10,[sp,#48]                @loads wd
+
+    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
+    mov         r11,#1
+    subs        r14,r8,#0                   @checks for ht == 0
+
+    vabs.s8     d2,d0                       @vabs_s8(coeff)
+
+    @ble       end_loops
+
+
+    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+    sub         r12,r0,#3                   @pu1_src - 3
+    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+    rsb         r9,r10,r2,lsl #1            @2*src_strd - wd
+    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+    rsb         r8,r10,r3,lsl #1            @2*dst_strd - wd
+    vdup.8      d28,d2[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
+
+    vdup.8      d29,d2[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
+    @ tst       r10,#7                          @checks wd for multiples
+    vdup.8      d30,d2[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
+    vdup.8      d31,d2[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
+
+    mov         r7,r1
+
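+    @ width dispatch: wd <= 4 goes to the 4-wide loop; wd == 24 runs as a
+    @ 16-wide pass plus an 8-wide residual pass (outer_loop8_residual);
+    @ wd >= 16 uses the 16-wide loop; wd == 12 runs as an 8-wide pass plus
+    @ a 4-wide residual pass (outer_loop4_residual); everything else uses
+    @ the 8-wide loop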
+    cmp         r10,#4
+    ble         outer_loop_4
+
+    cmp         r10,#24
+    moveq       r10,#16
+    addeq       r8,#8
+    addeq       r9,#8
+
+    cmp         r10,#16
+    bge         outer_loop_16
+
+    cmp         r10,#12
+    addeq       r8,#4
+    addeq       r9,#4
+    b           outer_loop_8
+
+
+outer_loop8_residual:
+    sub         r12,r0,#3                   @pu1_src - 3
+    mov         r1,r7
+    mov         r14,#32
+    add         r1,#16
+    add         r12,#16
+    mov         r10,#8
+    add         r8,#8
+    add         r9,#8
+
+outer_loop_8:
+
+    add         r6,r1,r3                    @pu1_dst + dst_strd
+    add         r4,r12,r2                   @pu1_src + src_strd
+    subs        r5,r10,#0                   @checks wd
+
+    ble         end_inner_loop_8
+
+inner_loop_8:
+    vld1.u32    {d0},[r12],r11              @vector load pu1_src
+    vld1.u32    {d1},[r12],r11
+    vld1.u32    {d2},[r12],r11
+    vld1.u32    {d3},[r12],r11
+
+
+
+
+
+    @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
+    @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
+    @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
+    @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
+    @ vext.u8   d6,d0,d1,#6                     @vector extract of src [0_6]
+    @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
+    @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
+    @ vext.u8   d14,d12,d13,#2
+
+    @vext.u8    d15,d12,d13,#3                  @vector extract of src[0_3]
+    @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
+    @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
+    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
+    @vext.u8    d19,d12,d13,#7                  @vector extract of src[0_7]
+    @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
+    vld1.u32    {d4},[r12],r11
+    vmull.u8    q4,d1,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+    vld1.u32    {d5},[r12],r11
+    vmlal.u8    q4,d3,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vld1.u32    {d6},[r12],r11
+    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vld1.u32    {d7},[r12],r11
+    vmlsl.u8    q4,d2,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
+    vmlal.u8    q4,d4,d28                   @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+    vld1.u32    {d13},[r4],r11
+    vmlsl.u8    q4,d5,d29                   @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+    vld1.u32    {d14},[r4],r11
+    vmlal.u8    q4,d6,d30                   @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+    vld1.u32    {d15},[r4],r11
+    vmlsl.u8    q4,d7,d31                   @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+    vld1.u32    {d16},[r4],r11              @vector load pu1_src + src_strd
+
+    vmull.u8    q5,d15,d27                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vld1.u32    {d17},[r4],r11
+    vmlsl.u8    q5,d14,d26                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vld1.u32    {d18},[r4],r11
+    vmlal.u8    q5,d16,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+    vld1.u32    {d19},[r4],r11              @vector load pu1_src + src_strd
+    vmlsl.u8    q5,d17,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+    vqrshrun.s16 d20,q4,#6                  @right shift and saturating narrow result 1
+    vmlal.u8    q5,d18,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+    vmlsl.u8    q5,d19,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+    vst1.8      {d20},[r1]!                 @store the result pu1_dst
+    vmlsl.u8    q5,d12,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vmlal.u8    q5,d13,d25                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+
+    vqrshrun.s16 d8,q5,#6                   @right shift and saturating narrow result 2
+    subs        r5,r5,#8                    @decrement the wd loop
+    vst1.8      {d8},[r6]!                  @store the result pu1_dst
+    cmp         r5,#4
+    bgt         inner_loop_8
+
+end_inner_loop_8:
+    subs        r14,r14,#2                  @decrement the ht loop
+    add         r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
+    add         r1,r1,r8                    @increment the dst pointer by 2*dst_strd-wd
+    bgt         outer_loop_8
+
+
+
+
+
+    ldr         r10,[sp,#48]                @loads wd
+    cmp         r10,#12
+
+    beq         outer_loop4_residual
+
+
+end_loops:
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+outer_loop_16:
+    str         r0, [sp, #-4]!
+    str         r7, [sp, #-4]!
+
+    add         r6,r1,r3                    @pu1_dst + dst_strd
+    add         r4,r12,r2                   @pu1_src + src_strd
+    and         r0, r12, #31
+    sub         r5,r10,#0                   @checks wd
+    @ble       end_loops1
+    pld         [r12, r2, lsl #1]
+    vld1.u32    {q0},[r12],r11              @vector load pu1_src
+    pld         [r4, r2, lsl #1]
+    vld1.u32    {q1},[r12],r11
+    vld1.u32    {q2},[r12],r11
+    vld1.u32    {q3},[r12],r11
+    vld1.u32    {q6},[r12],r11
+    vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+    vld1.u32    {q7},[r12],r11
+    vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vld1.u32    {q8},[r12],r11
+    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vld1.u32    {q9},[r12],r11
+    vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+    vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+    vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+    vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+
+
+inner_loop_16:
+
+
+    subs        r5,r5,#16
+    vmull.u8    q10,d3,d25
+
+    add         r12,#8
+    vmlsl.u8    q10,d1,d24
+
+    subeq       r14,r14,#2
+    vmlal.u8    q10,d7,d27
+
+    vld1.u32    {q0},[r4],r11               @vector load pu1_src
+    vmlsl.u8    q10,d5,d26
+
+    vld1.u32    {q1},[r4],r11
+    vmlal.u8    q10,d13,d28
+
+    vld1.u32    {q2},[r4],r11
+    vmlal.u8    q10,d17,d30
+
+    vld1.u32    {q3},[r4],r11
+    vmlsl.u8    q10,d15,d29
+
+    vld1.u32    {q6},[r4],r11
+    vmlsl.u8    q10,d19,d31
+
+    vld1.u32    {q7},[r4],r11
+    vqrshrun.s16 d8,q4,#6                   @right shift and saturating narrow result 1
+
+    vld1.u32    {q8},[r4],r11
+    vmull.u8    q5,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+    vld1.u32    {q9},[r4],r11
+    vmlal.u8    q5,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+    add         r4,#8
+    vmlsl.u8    q5,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+    addeq       r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
+    vmlsl.u8    q5,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    addeq       r4,r12,r2                   @pu1_src + src_strd
+    vqrshrun.s16 d9,q10,#6
+
+    vmlal.u8    q5,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+
+@   and         r7, r12, #31
+    vmlsl.u8    q5,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+
+    vmlal.u8    q5,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+
+    vmlsl.u8    q5,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+
+    vmull.u8    q11,d3,d25
+
+    vmlsl.u8    q11,d1,d24
+
+    vst1.8      {q4},[r1]!                  @store the result pu1_dst
+    vmlal.u8    q11,d7,d27
+
+    addeq       r1,r1,r8
+    vqrshrun.s16 d10,q5,#6                  @right shift and saturating narrow result 2
+
+@   cmp         r7, r0
+    vmlsl.u8    q11,d5,d26
+
+    pld         [r12, r2, lsl #2]
+    vmlal.u8    q11,d13,d28
+
+    pld         [r4, r2, lsl #2]
+    vmlal.u8    q11,d17,d30
+
+@   mov         r0, r7
+    vmlsl.u8    q11,d15,d29
+
+    cmp         r14,#0
+    vmlsl.u8    q11,d19,d31
+
+    beq         epilog_16
+    vld1.u32    {q0},[r12],r11              @vector load pu1_src
+    vld1.u32    {q1},[r12],r11
+    vld1.u32    {q2},[r12],r11
+    vld1.u32    {q3},[r12],r11
+    vld1.u32    {q6},[r12],r11
+    vqrshrun.s16 d11,q11,#6
+    vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+    vld1.u32    {q7},[r12],r11
+    vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vld1.u32    {q8},[r12],r11
+    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vld1.u32    {q9},[r12],r11
+    vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+    cmp         r5,#0
+    vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+    moveq       r5,r10
+    vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+    vst1.8      {q5},[r6]!                  @store the result pu1_dst
+    vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+    addeq       r6,r1,r3                    @pu1_dst + dst_strd
+    b           inner_loop_16
+
+
+epilog_16:
+    vqrshrun.s16 d11,q11,#6
+    vst1.8      {q5},[r6]!                  @store the result pu1_dst
+
+    ldr         r7, [sp], #4
+    ldr         r0, [sp], #4
+    ldr         r10,[sp,#48]
+    cmp         r10,#24
+
+    beq         outer_loop8_residual
+
+
+
+end_loops1:
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
+
+outer_loop4_residual:
+    sub         r12,r0,#3                   @pu1_src - 3
+    mov         r1,r7
+    add         r1,#8
+    mov         r10,#4
+    add         r12,#8
+    mov         r14,#16
+    add         r8,#4
+    add         r9,#4
+
+outer_loop_4:
+    add         r6,r1,r3                    @pu1_dst + dst_strd
+    add         r4,r12,r2                   @pu1_src + src_strd
+
+    subs        r5,r10,#0                   @checks wd
+    ble         end_inner_loop_4
+
+inner_loop_4:
+    vld1.u32    {d0},[r12],r11              @vector load pu1_src
+    vld1.u32    {d1},[r12],r11
+    vld1.u32    {d2},[r12],r11
+    vld1.u32    {d3},[r12],r11
+    vld1.u32    {d4},[r12],r11
+    vld1.u32    {d5},[r12],r11
+    vld1.u32    {d6},[r12],r11
+    vld1.u32    {d7},[r12],r11
+    @add       r12,r12,#4                      @increment the input pointer
+    sub         r12,r12,#4
+    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
+    @vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
+    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
+
+    @vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
+    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
+    @vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
+    @vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
+    vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
+    vld1.u32    {d13},[r4],r11
+    vzip.32     d0,d12                      @vector zip the i iteration and ii iteration into a single register
+    vld1.u32    {d14},[r4],r11
+    vzip.32     d1,d13
+    vld1.u32    {d15},[r4],r11
+    vzip.32     d2,d14
+    vld1.u32    {d16},[r4],r11
+    vzip.32     d3,d15
+    vld1.u32    {d17},[r4],r11
+    vzip.32     d4,d16
+    vld1.u32    {d18},[r4],r11
+    vzip.32     d5,d17
+    vld1.u32    {d19},[r4],r11
+    sub         r4,r4,#4
+    @ add       r4,r4,#4                        @increment the input pointer
+    @ vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
+    @ vext.u8   d15,d12,d13,#3                  @vector extract of src[0_3]
+    @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
+    @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
+    @ vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
+    @ vext.u8   d19,d12,d13,#7                  @vector extract of src[0_7]
+    @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
+
+
+
+
+
+
+
+    vzip.32     d6,d18
+    vzip.32     d7,d19
+
+    vmull.u8    q4,d1,d25                   @arithmetic operations for the ii iteration at the same time
+    vmlsl.u8    q4,d0,d24
+    vmlsl.u8    q4,d2,d26
+    vmlal.u8    q4,d3,d27
+    vmlal.u8    q4,d4,d28
+    vmlsl.u8    q4,d5,d29
+    vmlal.u8    q4,d6,d30
+    vmlsl.u8    q4,d7,d31
+
+    vqrshrun.s16 d8,q4,#6                   @narrow right shift and saturating the result
+    vst1.32     {d8[0]},[r1]!               @store the i iteration result from the lower half of the register
+    vst1.32     {d8[1]},[r6]!               @store the ii iteration result from the upper half of the register
+    subs        r5,r5,#4                    @decrement the wd by 4
+    bgt         inner_loop_4
+
+end_inner_loop_4:
+    subs        r14,r14,#2                  @decrement the ht by 2
+    add         r12,r12,r9                  @increment the input pointer 2*src_strd-wd
+    add         r1,r1,r8                    @increment the output pointer 2*dst_strd-wd
+    bgt         outer_loop_4
+    @subs   r7,r7,#1
+    @ bgt   start_loop_count
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert.s b/common/arm/ihevc_inter_pred_filters_luma_vert.s
new file mode 100644
index 0000000..04942ae
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_filters_luma_vert.s
@@ -0,0 +1,947 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@******************************************************************************
+@* @file
+@*  ihevc_inter_pred_filters_luma_vert.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction interpolation.
+@*  functions are coded using neon intrinsics and can be compiled using
+@*  rvct
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*
+@*  - ihevc_inter_pred_luma_vert()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+@/* include reconstruction */
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*     inter prediction luma filter for vertical input
+@*
+@* @par description:
+@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff'
+@*    to the elements pointed to by 'pu1_src' and writes to the location
+@*    pointed to by 'pu1_dst'. the output is downshifted by 6 and clipped to
+@*    8 bits.
+@*    assumptions: the function is optimized assuming the width is a multiple
+@*    of 4 or 8 and the height is a multiple of 2.
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_vert (
+@                            uword8 *pu1_src,
+@                            uword8 *pu1_dst,
+@                            word32 src_strd,
+@                            word32 dst_strd,
+@                            word8 *pi1_coeff,
+@                            word32 ht,
+@                            word32 wd   )
+
+@**************variables vs registers*****************************************
+@   r0 => *pu1_src
+@   r1 => *pu1_dst
+@   r2 =>  src_strd
+@   r6 =>  dst_strd
+@   r12 => *pi1_coeff
+@   r5 =>  ht
+@   r3 =>  wd
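+
+@ reference: a plain-c sketch of the filter implemented below (this file is
+@ replicated from ihevc_inter_pred_filters.c; loop and variable names beyond
+@ the arguments above are illustrative). an 8-tap vertical filter is applied
+@ and the result is rounded, downshifted by 6 and clipped to 8 bits:
+@
+@    for(row = 0; row < ht; row++)
+@    {
+@        for(col = 0; col < wd; col++)
+@        {
+@            word16 sum = 0;
+@            for(i = 0; i < 8; i++)             /* taps at rows -3..4 */
+@                sum += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
+@            pu1_dst[col] = CLIP_U8((sum + 32) >> 6);   /* vqrshrun #6 */
+@        }
+@        pu1_src += src_strd;
+@        pu1_dst += dst_strd;
+@    }
+@
+@ CLIP_U8 stands for clipping to [0, 255]; the macro name is illustrative.
+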
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_vert_a9q
+
+.type ihevc_inter_pred_luma_vert_a9q, %function
+
+ihevc_inter_pred_luma_vert_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r12,[sp,#40]                @load pi1_coeff
+    mov         r6,r3
+    ldr         r5,[sp,#48]                 @load wd
+    vld1.u8     {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
+    sub         r12,r2,r2,lsl #2            @r12 = -3*src_strd
+    vabs.s8     d0,d0                       @vabs_s8(coeff)
+    add         r0,r0,r12                   @pu1_src -= 3*src_strd
+    ldr         r3,[sp,#44]                 @load ht
+    subs        r7,r3,#0                    @r3->ht
+    @ble        end_loops           @end loop jump
+    vdup.u8     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
+    cmp         r5,#8
+    vdup.u8     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
+    vdup.u8     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
+    vdup.u8     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
+    vdup.u8     d26,d0[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
+    vdup.u8     d27,d0[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
+    vdup.u8     d28,d0[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
+    vdup.u8     d29,d0[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
+    blt         core_loop_wd_4              @core loop wd 4 jump
+    str         r0, [sp, #-4]!
+    str         r1, [sp, #-4]!
+
+    bic         r4,r5,#7                    @r5 ->wd
+    rsb         r9,r4,r6,lsl #2             @r6->dst_strd   r5  ->wd
+    rsb         r8,r4,r2,lsl #2             @r2->src_strd
+    mov         r3, r5, lsr #3              @divide by 8
+    mul         r7, r3                      @loop count = ht * (wd / 8)
+    sub         r7, #4                      @reserve one kernel iteration for the epilog
+
+prolog:
+
+    and         r10, r0, #31
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    subs        r4,r4,#8
+    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+
+
+    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+
+    addle       r0,r0,r8
+    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+
+    bicle       r4,r5,#7                    @r5 ->wd
+    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+
+    pld         [r3]
+    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+    pld         [r3, r2]
+    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+    pld         [r3, r2, lsl #1]
+    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+
+    add         r3, r3, r2
+    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+
+    pld         [r3, r2, lsl #1]
+    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    vld1.u8     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.u8    q6,d3,d23
+    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q6,d2,d22
+    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q6,d4,d24
+    vmlal.u8    q6,d5,d25
+    vmlal.u8    q6,d6,d26
+    vmlsl.u8    q6,d7,d27
+    vmlal.u8    q6,d16,d28
+    vmlsl.u8    q6,d17,d29
+    add         r14,r1,r6
+    vst1.8      {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
+    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
+    addle       r1,r1,r9
+
+    vmull.u8    q7,d4,d23
+    subs        r7,r7,#4
+    vmlsl.u8    q7,d3,d22
+    vmlsl.u8    q7,d5,d24
+    vmlal.u8    q7,d6,d25
+    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q7,d7,d26
+    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q7,d16,d27
+    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q7,d17,d28
+    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q7,d18,d29
+    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
+    vqrshrun.s16 d12,q6,#6
+
+
+    blt         epilog_end                  @jumps to epilog_end
+    beq         epilog                      @jumps to epilog
+
+kernel_8:
+
+    subs        r4,r4,#8
+    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+
+    addle       r0,r0,r8
+    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+
+    bicle       r4,r5,#7                    @r5 ->wd
+    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+
+    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+
+    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+
+    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+
+    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+
+    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+    vst1.8      {d12},[r14],r6
+
+@   and         r11, r0, #31
+    vqrshrun.s16 d14,q7,#6
+
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+
+    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+
+    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+
+    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+
+    vst1.8      {d14},[r14],r6
+    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+
+    add         r14,r1,#0
+    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+
+    add         r1, r1, #8
+    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+
+    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+    addle       r1,r1,r9
+    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
+
+@   cmp         r11, r10
+    vmull.u8    q6,d3,d23
+
+    add         r10, r3, r2, lsl #3         @ 10*strd - 8+2
+    vmlsl.u8    q6,d2,d22
+
+    add         r10, r10, r2                @ 11*strd
+    vmlsl.u8    q6,d4,d24
+
+    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q6,d5,d25
+
+    vmlal.u8    q6,d6,d26
+    vst1.8      {d8},[r14],r6               @vst1_u8(pu1_dst,sto_res)@
+
+    pld         [r10]                       @11+ 0
+    vmlsl.u8    q6,d7,d27
+
+    pld         [r10, r2]                   @11+ 1*strd
+    vmlal.u8    q6,d16,d28
+
+    pld         [r10, r2, lsl #1]           @11+ 2*strd
+    vmlsl.u8    q6,d17,d29
+
+    add         r10, r10, r2                @12*strd
+    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    pld         [r10, r2, lsl #1]           @11+ 3*strd
+    vmull.u8    q7,d4,d23
+
+@   mov         r10, r11
+    vmlsl.u8    q7,d3,d22
+
+    subs        r7,r7,#4
+    vmlsl.u8    q7,d5,d24
+
+    vmlal.u8    q7,d6,d25
+    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q7,d7,d26
+    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q7,d16,d27
+    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q7,d17,d28
+    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q7,d18,d29
+    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+    vqrshrun.s16 d12,q6,#6
+    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
+
+
+
+    bgt         kernel_8                    @jumps to kernel_8
+
+epilog:
+
+    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+    vst1.8      {d12},[r14],r6
+
+    vqrshrun.s16 d14,q7,#6
+
+    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+    vst1.8      {d14},[r14],r6
+
+    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmull.u8    q6,d3,d23
+    vmlsl.u8    q6,d2,d22
+    vmlsl.u8    q6,d4,d24
+    vmlal.u8    q6,d5,d25
+    vmlal.u8    q6,d6,d26
+    vmlsl.u8    q6,d7,d27
+    vmlal.u8    q6,d16,d28
+    vmlsl.u8    q6,d17,d29
+    add         r14,r1,r6
+    vst1.8      {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
+    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.u8    q7,d4,d23
+    vmlsl.u8    q7,d3,d22
+    vmlsl.u8    q7,d5,d24
+    vmlal.u8    q7,d6,d25
+    vmlal.u8    q7,d7,d26
+    vmlsl.u8    q7,d16,d27
+    vmlal.u8    q7,d17,d28
+    vmlsl.u8    q7,d18,d29
+
+    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
+    vqrshrun.s16 d12,q6,#6
+
+epilog_end:
+    vst1.8      {d12},[r14],r6
+    vqrshrun.s16 d14,q7,#6
+
+    vst1.8      {d14},[r14],r6
+
+
+end_loops:
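+    @ if wd was a multiple of 8 we are done; otherwise fall through and
+    @ process the remaining 4 columns with the 4-wide loop below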
+    tst         r5,#7
+    ldr         r1, [sp], #4
+    ldr         r0, [sp], #4
+
+    ldmeqfd     sp!,{r4-r12,r15}            @reload the registers from sp
+    mov         r5, #4
+    add         r0, r0, #8
+    add         r1, r1, #8
+    mov         r7, #16
+    @
+
+core_loop_wd_4:
+    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
+    rsb         r8,r5,r2,lsl #2             @r2->src_strd
+    vmov.i8     d4,#0
+
+outer_loop_wd_4:
+    subs        r12,r5,#0
+    ble         end_inner_loop_wd_4         @outer loop jump
+
+inner_loop_wd_4:
+    add         r3,r0,r2
+    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
+    subs        r12,r12,#4
+    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
+    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
+    vld1.u32    {d4[0]},[r0]                @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
+    vmull.u8    q0,d5,d23                   @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@
+
+    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
+    add         r0,r0,#4
+    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
+    vmlsl.u8    q0,d4,d22                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@
+
+    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
+    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
+    vmlsl.u8    q0,d6,d24                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@
+
+    vmull.u8    q4,d7,d23
+    vdup.u32    d4,d7[1]                    @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
+    vmull.u8    q1,d7,d25                   @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
+    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
+    vmlsl.u8    q4,d6,d22
+    vmlal.u8    q0,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@
+
+    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
+    vmlsl.u8    q4,d4,d24
+    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
+    vmlsl.u8    q1,d5,d27                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@
+
+    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
+    vmlal.u8    q4,d5,d25
+    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
+    vmlal.u8    q0,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@
+
+    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
+    vmlal.u8    q4,d6,d26
+    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
+    vmlsl.u8    q1,d7,d29                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@
+
+    vdup.u32    d4,d7[1]
+    vadd.i16    q0,q0,q1                    @mul_res1 = vaddq_u16(mul_res1, mul_res2)@
+
+    vmlsl.u8    q4,d7,d27
+    vld1.u32    {d4[1]},[r3],r2
+    vmlal.u8    q4,d4,d28
+    vdup.u32    d5,d4[1]
+    vqrshrun.s16 d0,q0,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    vld1.u32    {d5[1]},[r3]
+    add         r3,r1,r6
+    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@
+
+    vmlsl.u8    q4,d5,d29
+    vst1.32     {d0[1]},[r3],r6             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
+    vqrshrun.s16 d8,q4,#6
+
+    vst1.32     {d8[0]},[r3],r6
+    add         r1,r1,#4
+    vst1.32     {d8[1]},[r3]
+    bgt         inner_loop_wd_4
+
+end_inner_loop_wd_4:
+    subs        r7,r7,#4
+    add         r1,r1,r9
+    add         r0,r0,r8
+    bgt         outer_loop_wd_4
+
+    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*     inter prediction luma filter for vertical 16bit output
+@*
+@* @par description:
+@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff'
+@*    to the elements pointed to by 'pu1_src' and writes to the location
+@*    pointed to by 'pi2_dst'. no downshifting or clipping is done and the
+@*    output is used as an input for weighted prediction.
+@*    assumptions: the function is optimized assuming the width is a multiple
+@*    of 4 or 8 and the height is a multiple of 2.
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@*  word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_vert_w16out(uword8 *pu1_src,
+@                                    word16 *pi2_dst,
+@                                    word32 src_strd,
+@                                    word32 dst_strd,
+@                                    word8 *pi1_coeff,
+@                                    word32 ht,
+@                                    word32 wd   )
+
+@**************variables vs registers*****************************************
+@   r0 => *pu1_src
+@   r1 => *pi2_dst
+@   r2 =>  src_strd
+@   r6 =>  dst_strd
+@   r12 => *pi1_coeff
+@   r5 =>  ht
+@   r3 =>  wd
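+
+@ reference sketch: identical to the vertical luma filter above except the
+@ raw 16-bit sums are stored for weighted prediction instead of being
+@ rounded, shifted and clipped:
+@
+@            pi2_dst[col] = sum;                /* no >>6, no clip */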
+
+
+
+.globl ihevc_inter_pred_luma_vert_w16out_a9q
+
+.type ihevc_inter_pred_luma_vert_w16out_a9q, %function
+
+ihevc_inter_pred_luma_vert_w16out_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r12,[sp,#40]                @load pi1_coeff
+    mov         r6,r3
+    ldr         r5,[sp,#48]                 @load wd
+    vld1.u8     {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
+    sub         r12,r2,r2,lsl #2            @r12 = -3*src_strd
+    vabs.s8     d0,d0                       @vabs_s8(coeff)
+    add         r0,r0,r12                   @pu1_src -= 3*src_strd
+    ldr         r3,[sp,#44]                 @load ht
+    subs        r7,r3,#0                    @r3->ht
+    @ble        end_loops_16out         @end loop jump
+    vdup.u8     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
+    cmp         r5,#8
+    vdup.u8     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
+    vdup.u8     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
+    vdup.u8     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
+    vdup.u8     d26,d0[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
+    vdup.u8     d27,d0[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
+    vdup.u8     d28,d0[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
+    vdup.u8     d29,d0[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
+    blt         core_loop_wd_4_16out        @core loop wd 4 jump
+    str         r0, [sp, #-4]!
+    str         r1, [sp, #-4]!
+
+    bic         r4,r5,#7                    @r5 ->wd
+    rsb         r9,r4,r6,lsl #2             @r6->dst_strd   r5  ->wd
+    rsb         r8,r4,r2,lsl #2             @r2->src_strd
+    mov         r6, r6, lsl #1
+    mov         r3, r5, lsr #3              @divide by 8
+    mul         r7, r3                      @loop count = ht * (wd / 8)
+    sub         r7, #4                      @reserve one kernel iteration for the epilog
+
+prolog_16out:
+
+    and         r10, r0, #31
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+
+    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    subs        r4,r4,#8
+    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+
+
+    addle       r0,r0,r8
+    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+
+    bicle       r4,r5,#7                    @r5 ->wd
+    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+
+    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+
+    pld         [r3]
+    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+    pld         [r3, r2]
+    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+    pld         [r3, r2, lsl #1]
+    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+    add         r3, r3, r2
+    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+    pld         [r3, r2, lsl #1]
+    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+    vmull.u8    q6,d3,d23
+    vld1.u8     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q6,d2,d22
+    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q6,d4,d24
+    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q6,d5,d25
+    vmlal.u8    q6,d6,d26
+    vmlsl.u8    q6,d7,d27
+    vmlal.u8    q6,d16,d28
+    vmlsl.u8    q6,d17,d29
+    add         r14,r1,r6
+    vst1.8      {d8, d9},[r1]!              @vst1_u8(pu1_dst,sto_res)@
+    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
+    addle       r1,r1,r9,lsl #1
+
+    vmull.u8    q7,d4,d23
+    subs        r7,r7,#4
+    vmlsl.u8    q7,d3,d22
+    vmlsl.u8    q7,d5,d24
+    vmlal.u8    q7,d6,d25
+    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q7,d7,d26
+    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q7,d16,d27
+    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q7,d17,d28
+    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q7,d18,d29
+    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+    vst1.8      {d10, d11},[r14],r6         @vst1_u8(pu1_dst_tmp,sto_res)@
+    @vqrshrun.s16 d12,q6,#6
+
+
+    blt         epilog_end_16out
+    beq         epilog_16out                @jumps to epilog
+
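+@ steady state of the software pipeline: each pass filters 4 new rows
+@ into q4-q7 while the previous pass's results are stored and the next
+@ pass's source rows are loaded, hiding the load/store latency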
+kernel_8_16out:
+
+    subs        r4,r4,#8
+    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+
+    addle       r0,r0,r8
+    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+
+    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+
+    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+
+    bicle       r4,r5,#7                    @r5 ->wd
+    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+
+    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+
+    vst1.8      {d12,d13},[r14],r6
+    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+
+
+@   and         r11, r0, #31
+    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+
+    vst1.8      {d14,d15},[r14],r6
+    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+
+    add         r14,r1,r6
+    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+
+    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+
+    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+
+    vst1.8      {d8,d9},[r1]!               @vst1_u8(pu1_dst,sto_res)@
+    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+
+    addle       r1,r1,r9,lsl #1
+    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+
+@   cmp         r11, r10
+    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+    add         r10, r3, r2, lsl #3         @ r10 = r3 + 8*src_strd (prefetch pointer)
+    vmull.u8    q6,d3,d23
+
+    add         r10, r10, r2                @ 11*strd
+    vmlsl.u8    q6,d2,d22
+
+    pld         [r10]                       @11+ 0
+    vmlsl.u8    q6,d4,d24
+
+    pld         [r10, r2]                   @11+ 1*strd
+    vmlal.u8    q6,d5,d25
+
+    pld         [r10, r2, lsl #1]           @11+ 2*strd
+    vmlal.u8    q6,d6,d26
+
+    add         r10, r10, r2                @12*strd
+    vmlsl.u8    q6,d7,d27
+
+    pld         [r10, r2, lsl #1]           @11+ 3*strd
+    vmlal.u8    q6,d16,d28
+
+@   mov         r10, r11
+    vmlsl.u8    q6,d17,d29
+
+    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.u8    q7,d4,d23
+
+    subs        r7,r7,#4
+    vmlsl.u8    q7,d3,d22
+
+    vst1.8      {d10, d11},[r14],r6         @vst1_u8(pu1_dst_tmp,sto_res)@
+    vmlsl.u8    q7,d5,d24
+
+    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q7,d6,d25
+
+    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q7,d7,d26
+
+    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q7,d16,d27
+
+    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.u8    q7,d17,d28
+
+    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlsl.u8    q7,d18,d29
+
+
+    bgt         kernel_8_16out              @jumps to kernel_8
+
+epilog_16out:
+
+    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+    vst1.8      {d12,d13},[r14],r6
+
+    @vqrshrun.s16 d14,q7,#6
+
+    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+    vst1.8      {d14,d15},[r14],r6
+
+    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmull.u8    q6,d3,d23
+    vmlsl.u8    q6,d2,d22
+    vmlsl.u8    q6,d4,d24
+    vmlal.u8    q6,d5,d25
+    vmlal.u8    q6,d6,d26
+    vmlsl.u8    q6,d7,d27
+    vmlal.u8    q6,d16,d28
+    vmlsl.u8    q6,d17,d29
+    add         r14,r1,r6
+    vst1.8      {d8,d9},[r1]!               @vst1_u8(pu1_dst,sto_res)@
+    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.u8    q7,d4,d23
+    vmlsl.u8    q7,d3,d22
+    vmlsl.u8    q7,d5,d24
+    vmlal.u8    q7,d6,d25
+    vmlal.u8    q7,d7,d26
+    vmlsl.u8    q7,d16,d27
+    vmlal.u8    q7,d17,d28
+    vmlsl.u8    q7,d18,d29
+
+    vst1.8      {d10,d11},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@
+    @vqrshrun.s16 d12,q6,#6
+
+epilog_end_16out:
+    vst1.8      {d12,d13},[r14],r6
+    @vqrshrun.s16 d14,q7,#6
+
+    vst1.8      {d14,d15},[r14],r6
+
+
+end_loops_16out:
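+    @ if wd was a multiple of 8 we return here; otherwise fall through
+    @ and process the remaining 4-wide column with the loops below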
+    tst         r5,#7
+    ldr         r1, [sp], #4
+    ldr         r0, [sp], #4
+
+    ldmeqfd     sp!,{r4-r12,r15}            @reload the registers from sp
+    mov         r5, #4
+    add         r0, r0, #8
+    add         r1, r1, #16
+    mov         r7, #16
+    mov         r6, r6, lsr #1
+
+
+core_loop_wd_4_16out:
+    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
+    rsb         r8,r5,r2,lsl #2             @r2->src_strd
+    vmov.i8     d4,#0
+    mov         r6, r6, lsl #1
+
+outer_loop_wd_4_16out:
+    subs        r12,r5,#0
+    ble         end_inner_loop_wd_4_16out   @outer loop jump
+
+inner_loop_wd_4_16out:
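+    @ two 4-pixel rows are packed into each 64-bit register (lanes 0 and
+    @ 1), so every multiply-accumulate filters two output rows at once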
+    add         r3,r0,r2
+    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
+    subs        r12,r12,#4
+    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
+    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
+    vld1.u32    {d4[0]},[r0]                @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
+    vmull.u8    q0,d5,d23                   @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@
+
+    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
+    add         r0,r0,#4
+    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
+    vmlsl.u8    q0,d4,d22                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@
+
+    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
+    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
+    vmlsl.u8    q0,d6,d24                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@
+
+    vmull.u8    q4,d7,d23
+    vdup.u32    d4,d7[1]                    @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
+    vmull.u8    q1,d7,d25                   @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
+    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
+    vmlsl.u8    q4,d6,d22
+    vmlal.u8    q0,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@
+
+    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
+    vmlsl.u8    q4,d4,d24
+    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
+    vmlsl.u8    q1,d5,d27                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@
+
+    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
+    vmlal.u8    q4,d5,d25
+    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
+    vmlal.u8    q0,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@
+
+    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
+    vmlal.u8    q4,d6,d26
+    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
+    vmlsl.u8    q1,d7,d29                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@
+
+    vdup.u32    d4,d7[1]
+    vadd.i16    q0,q0,q1                    @mul_res1 = vaddq_u16(mul_res1, mul_res2)@
+
+    vmlsl.u8    q4,d7,d27
+    vld1.u32    {d4[1]},[r3],r2
+    vmlal.u8    q4,d4,d28
+    vdup.u32    d5,d4[1]
+    @vqrshrun.s16 d0,q0,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    vld1.u32    {d5[1]},[r3]
+    add         r3,r1,r6
+    vst1.32     {d0},[r1]!                  @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@
+
+    vmlsl.u8    q4,d5,d29
+    vst1.32     {d1},[r3],r6                @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
+    @vqrshrun.s16 d8,q4,#6
+
+    vst1.32     {d8},[r3],r6
+    @add        r1,r1,#4
+    vst1.32     {d9},[r3]
+    bgt         inner_loop_wd_4_16out
+
+end_inner_loop_wd_4_16out:
+    subs        r7,r7,#4
+    add         r1,r1,r9,lsl #1
+    add         r0,r0,r8
+    bgt         outer_loop_wd_4_16out
+
+    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp
+
diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
new file mode 100644
index 0000000..4fbc5d1
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
@@ -0,0 +1,393 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@******************************************************************************
+@* @file
+@*  ihevc_inter_pred_filters_luma_vert_w16inp.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction  interpolation.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@*  yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*  - ihevc_inter_pred_filters_luma_vert_w16inp()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+@/* include reconstruction */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    luma vertical filter for 16bit input.
+@*
+@* @par description:
+@*     applies a vertical filter with coefficients pointed to by 'pi1_coeff'
+@*     to the elements pointed by 'pi2_src' and writes to the location
+@*     pointed by 'pu1_dst'. the input is 16 bits; the filter output is
+@*     downshifted by 12 and clipped to lie between 0 and 255.
+@*     assumptions: the function is optimized assuming width is a multiple
+@*     of 4 and height a multiple of 2.
+@*
+@* @param[in] pi2_src
+@*  word16 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_vert_w16inp(word16 *pi2_src,
+@                                    uword8 *pu1_dst,
+@                                    word32 src_strd,
+@                                    word32 dst_strd,
+@                                    word8 *pi1_coeff,
+@                                    word32 ht,
+@                                    word32 wd   )
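+
+@/* a minimal c sketch of the operation this routine implements (an
+@   assumption based on the description above, not copied from
+@   ihevc_inter_pred_filters.c; intermediate saturation is omitted):
+@
+@       for(row = 0; row < ht; row++)
+@           for(col = 0; col < wd; col++)
+@           {
+@               word32 k, sum = 0;
+@               for(k = 0; k < 8; k++)
+@                   sum += pi2_src[(row + k - 3) * src_strd + col] * pi1_coeff[k];
+@               sum = sum >> 6;                     /* vqshrn.s32  #6 */
+@               sum = (sum + 32) >> 6;              /* vqrshrun.s16 #6 */
+@               pu1_dst[row * dst_strd + col] =
+@                   (uword8)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+@           }
+@*/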
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_vert_w16inp_a9q
+
+.type ihevc_inter_pred_luma_vert_w16inp_a9q, %function
+
+ihevc_inter_pred_luma_vert_w16inp_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r12,[sp,#40]                @load pi1_coeff
+    mov         r6,r3
+    ldr         r5,[sp,#48]                 @load wd
+    vld1.8      {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
+    mov         r2, r2, lsl #1              @src_strd in bytes (16-bit input)
+    sub         r12,r2,r2,lsl #2            @r12 = -3*src_strd
+    @vabs.s8    d0,d0               @vabs_s8(coeff)
+    add         r0,r0,r12                   @pu1_src -= 3 rows (first filter tap)
+    ldr         r3,[sp,#44]                 @load ht
+    subs        r7,r3,#0                    @r3->ht
+    @ble        end_loops           @end loop jump
+    vmovl.s8    q0,d0
+    vdup.16     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
+    vdup.16     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
+    vdup.16     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
+    vdup.16     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
+    vdup.16     d26,d1[0]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
+    vdup.16     d27,d1[1]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
+    vdup.16     d28,d1[2]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
+    vdup.16     d29,d1[3]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
+
+    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
+    rsb         r8,r5,r2,lsl #2             @r2->src_strd
+    sub         r8,r8,r5
+    mov         r3, r5, lsr #2              @wd/4 : number of 4-wide column blocks
+    mul         r7, r3                      @r7 = ht * (wd/4)
+    sub         r7, #4                      @subtract 4 (one 4-row pass) consumed by the prolog/epilog
+    mov         r4,r5                       @r5 ->wd
+    @mov            r2, r2, lsl #1
+
+prolog:
+
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    subs        r4,r4,#4
+    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+
+    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+    addle       r0,r0,r8,lsl #0
+    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+    movle       r4,r5                       @r5 ->wd
+    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+    vqshrn.s32  d8, q4, #6
+
+    vld1.16     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.s16   q6,d3,d23
+    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q6,d2,d22
+    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q6,d4,d24
+    vmlal.s16   q6,d5,d25
+    vmlal.s16   q6,d6,d26
+    vmlal.s16   q6,d7,d27
+    vmlal.s16   q6,d16,d28
+    vmlal.s16   q6,d17,d29
+    add         r14,r1,r6
+    vqshrn.s32  d10, q5, #6
+    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    vmull.s16   q7,d4,d23
+    vmlal.s16   q7,d3,d22
+    vmlal.s16   q7,d5,d24
+    vmlal.s16   q7,d6,d25
+    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d7,d26
+    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d16,d27
+    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d17,d28
+    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d18,d29
+    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+    vst1.32     {d8[0]},[r1]!               @vst1_u8(pu1_dst,sto_res)@
+    vqshrn.s32  d12, q6, #6
+    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
+    addle       r1,r1,r9
+
+    subs        r7,r7,#4
+
+    blt         epilog_end                  @jumps to epilog_end
+    beq         epilog                      @jumps to epilog
+
+kernel_8:
+
+    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+    subs        r4,r4,#4
+    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+    addle       r0,r0,r8,lsl #0
+    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+    vst1.32     {d10[0]},[r14],r6           @vst1_u8(pu1_dst_tmp,sto_res)@
+
+    vqshrn.s32  d14, q7, #6
+    vqrshrun.s16 d12,q6,#6
+    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+    vst1.32     {d12[0]},[r14],r6
+
+    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+
+    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+    vqshrn.s32  d8, q4, #6
+    vqrshrun.s16 d14,q7,#6
+
+    vmull.s16   q6,d3,d23
+    movle       r4,r5                       @r5 ->wd
+
+    vmlal.s16   q6,d2,d22
+    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+
+    vmlal.s16   q6,d4,d24
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+
+    vmlal.s16   q6,d5,d25
+
+    vmlal.s16   q6,d6,d26
+    vst1.32     {d14[0]},[r14],r6
+
+    vmlal.s16   q6,d7,d27
+    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+
+    vmlal.s16   q6,d16,d28
+    add         r14,r1,r6
+
+    vmlal.s16   q6,d17,d29
+    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+    vqshrn.s32  d10, q5, #6
+    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
+    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+
+    vmull.s16   q7,d4,d23
+    vmlal.s16   q7,d3,d22
+    vmlal.s16   q7,d5,d24
+    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+    vmlal.s16   q7,d6,d25
+    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d7,d26
+    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d16,d27
+    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d17,d28
+    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d18,d29
+    vst1.32     {d8[0]},[r1]!               @vst1_u8(pu1_dst,sto_res)@
+
+    vqshrn.s32  d12, q6, #6
+    addle       r1,r1,r9
+
+    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
+    subs        r7,r7,#4
+
+    bgt         kernel_8                    @jumps to kernel_8
+
+epilog:
+
+    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+    vst1.32     {d10[0]},[r14],r6
+
+    vqshrn.s32  d14, q7, #6
+    vqrshrun.s16 d12,q6,#6
+
+    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+    vst1.32     {d12[0]},[r14],r6
+
+    vqshrn.s32  d8, q4, #6
+    vqrshrun.s16 d14,q7,#6
+
+    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmull.s16   q6,d3,d23
+    vmlal.s16   q6,d2,d22
+    vmlal.s16   q6,d4,d24
+    vmlal.s16   q6,d5,d25
+    vmlal.s16   q6,d6,d26
+    vmlal.s16   q6,d7,d27
+    vmlal.s16   q6,d16,d28
+    vmlal.s16   q6,d17,d29
+    vst1.32     {d14[0]},[r14],r6
+    vqshrn.s32  d10, q5, #6
+    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.s16   q7,d4,d23
+    vmlal.s16   q7,d3,d22
+    vmlal.s16   q7,d5,d24
+    vmlal.s16   q7,d6,d25
+    vmlal.s16   q7,d7,d26
+    vmlal.s16   q7,d16,d27
+    vmlal.s16   q7,d17,d28
+    vmlal.s16   q7,d18,d29
+    vqshrn.s32  d12, q6, #6
+    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    add         r14,r1,r6
+    vst1.32     {d8[0]},[r1]!               @vst1_u8(pu1_dst,sto_res)@
+
+epilog_end:
+    vst1.32     {d10[0]},[r14],r6           @vst1_u8(pu1_dst_tmp,sto_res)@
+    vqrshrun.s16 d12,q6,#6
+
+    vst1.32     {d12[0]},[r14],r6
+    vqshrn.s32  d14, q7, #6
+    vqrshrun.s16 d14,q7,#6
+
+    vst1.32     {d14[0]},[r14],r6
+
+
+end_loops:
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
diff --git a/common/arm/ihevc_inter_pred_luma_copy.s b/common/arm/ihevc_inter_pred_luma_copy.s
new file mode 100644
index 0000000..8a61369
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_luma_copy.s
@@ -0,0 +1,188 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*     interprediction luma function for copy
+@*
+@* @par description:
+@*   copies the array of width 'wd' and height 'ht' from the location
+@*   pointed by 'pu1_src' to the location pointed by 'pu1_dst'
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients (unused in the copy)
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_inter_pred_luma_copy (
+@                            uword8 *pu1_src,
+@                            uword8 *pu1_dst,
+@                            word32 src_strd,
+@                            word32 dst_strd,
+@                            word8 *pi1_coeff,
+@                            word32 ht,
+@                            word32 wd   )
+
+@**************variables vs registers*****************************************
+@   r0 => *pu1_src
+@   r1 => *pu1_dst
+@   r2 =>  src_strd
+@   r3 =>  dst_strd
+@   r7 =>  ht
+@   r12 => wd
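+
+@/* functionally this is a plain 2-d byte copy; a c sketch of the intent
+@   (an assumption matching the loops below):
+@
+@       for(row = 0; row < ht; row++)
+@           for(col = 0; col < wd; col++)
+@               pu1_dst[row * dst_strd + col] = pu1_src[row * src_strd + col];
+@*/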
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_copy_a9q
+
+.type ihevc_inter_pred_luma_copy_a9q, %function
+
+ihevc_inter_pred_luma_copy_a9q:
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    ldr         r12,[sp,#48]                @loads wd
+    ldr         r7,[sp,#44]                 @loads ht
+    cmp         r7,#0                       @checks ht == 0
+    ble         end_loops
+    tst         r12,#15                     @checks if wd is a multiple of 16
+    beq         core_loop_wd_16
+    tst         r12,#7                      @checks if wd is a multiple of 8
+    beq         core_loop_wd_8
+    sub         r11,r12,#4
+
+outer_loop_wd_4:
+    subs        r4,r12,#0                   @checks wd == 0
+    ble         end_inner_loop_wd_4
+
+inner_loop_wd_4:
+    vld1.32     {d0[0]},[r0]                @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add         r5,r0,r2                    @pu1_src_tmp += src_strd
+    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
+    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add         r0,r0,#4                    @pu1_src += 4
+    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    subs        r4,r4,#4                    @(wd -4)
+    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add         r1,r1,#4                    @pu1_dst += 4
+    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+
+    bgt         inner_loop_wd_4
+
+end_inner_loop_wd_4:
+    subs        r7,r7,#4                    @ht - 4
+    sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
+    sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
+    bgt         outer_loop_wd_4
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+core_loop_wd_8:
+    sub         r11,r12,#8
+
+outer_loop_wd_8:
+    subs        r4,r12,#0                   @checks wd
+    ble         end_inner_loop_wd_8
+
+inner_loop_wd_8:
+    add         r5,r0,r2                    @pu1_src_tmp += src_strd
+    vld1.8      {d0},[r0]!                  @vld1_u8(pu1_src_tmp)
+    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
+    vst1.8      {d0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8      {d1},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {d1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    subs        r4,r4,#8                    @wd - 8(loop condition)
+    vld1.8      {d2},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {d2},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8      {d3},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {d3},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    bgt         inner_loop_wd_8
+
+end_inner_loop_wd_8:
+    subs        r7,r7,#4                    @ht -= 4
+    sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
+    sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
+    bgt         outer_loop_wd_8
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+core_loop_wd_16:
+    sub         r11,r12,#16
+
+outer_loop_wd_16:
+    subs        r4,r12,#0                   @checks wd
+    ble         end_inner_loop_wd_16
+
+inner_loop_wd_16:
+    add         r5,r0,r2                    @pu1_src_tmp += src_strd
+    vld1.8      {q0},[r0]!                  @vld1_u8(pu1_src_tmp)
+    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
+    vst1.8      {q0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8      {q1},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {q1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    subs        r4,r4,#16                   @wd -= 16 (loop condition)
+    vld1.8      {q2},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {q2},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8      {q3},[r5],r2                @vld1_u8(pu1_src_tmp)
+    vst1.8      {q3},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
+    bgt         inner_loop_wd_16
+
+end_inner_loop_wd_16:
+    subs        r7,r7,#4                    @ht -= 4
+    sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
+    sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
+    bgt         outer_loop_wd_16
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_luma_copy_w16out.s b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
new file mode 100644
index 0000000..771bcb3
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
@@ -0,0 +1,249 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*     interprediction luma function for copy
+@*
+@* @par description:
+@*   copies the array of width 'wd' and height 'ht' from the location
+@*   pointed by 'pu1_src' to the 16-bit location pointed by 'pi2_dst',
+@*   left-shifting each sample by 6
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients (unused in the copy)
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_copy_w16out (
+@                                uword8 *pu1_src,
+@                                word16 *pi2_dst,
+@                                word32 src_strd,
+@                                word32 dst_strd,
+@                                word8 *pi1_coeff,
+@                                word32 ht,
+@                                word32 wd   )
+
+@**************variables vs registers*****************************************
+@   r0 => *pu1_src
+@   r1 => *pi2_dst
+@   r2 =>  src_strd
+@   r3 =>  dst_strd
+@   r7 =>  ht
+@   r12 => wd
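+
+@/* a c sketch of the intent (an assumption matching the code below):
+@   a byte copy whose output is widened to 16 bits and left-shifted by 6,
+@   putting it in the same fixed-point format as the filtered paths
+@
+@       for(row = 0; row < ht; row++)
+@           for(col = 0; col < wd; col++)
+@               pi2_dst[row * dst_strd + col] =
+@                   ((word16)pu1_src[row * src_strd + col]) << 6;
+@*/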
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_copy_w16out_a9q
+
+.type ihevc_inter_pred_luma_copy_w16out_a9q, %function
+
+ihevc_inter_pred_luma_copy_w16out_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    ldr         r12,[sp,#48]                @loads wd
+    ldr         r7,[sp,#44]                 @loads ht
+    cmp         r7,#0                       @ht condition(ht == 0)
+    ble         end_loops                   @loop
+    tst         r12,#7                      @checks if wd is a multiple of 8
+    beq         core_loop_wd_8
+    sub         r11,r12,#4
+    lsls        r6,r3,#1
+
+outer_loop_wd_4:
+    subs        r4,r12,#0                   @wd conditional subtract
+    ble         end_inner_loop_wd_4
+
+inner_loop_wd_4:
+    vld1.8      {d0},[r0]                   @vld1_u8(pu1_src_tmp)
+    add         r5,r0,r2                    @pu1_src +src_strd
+    vmovl.u8    q0,d0                       @vmovl_u8(vld1_u8(pu1_src_tmp)
+    add         r10,r1,r6
+    subs        r4,r4,#4                    @wd - 4
+    vshl.i64    q0,q0,#6                    @vshlq_n_s64(temp, 6)
+    vld1.8      {d22},[r5],r2               @vld1_u8(pu1_src_tmp)
+    add         r0,r0,#4                    @pu1_src += 4
+    vst1.64     {d0},[r1]                   @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    add         r1,r1,#8
+    vmovl.u8    q11,d22                     @vmovl_u8(vld1_u8(pu1_src_tmp)
+    vld1.8      {d24},[r5],r2               @vld1_u8(pu1_src_tmp)
+    vshl.i64    q11,q11,#6                  @vshlq_n_s64(temp, 6)
+    vmovl.u8    q12,d24                     @vmovl_u8(vld1_u8(pu1_src_tmp)
+    vst1.64     {d22},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    vshl.i64    q12,q12,#6                  @vshlq_n_s64(temp, 6)
+    vld1.8      {d26},[r5],r2               @vld1_u8(pu1_src_tmp)
+    vst1.64     {d24},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    vmovl.u8    q13,d26                     @vmovl_u8(vld1_u8(pu1_src_tmp)
+    vshl.i64    q13,q13,#6                  @vshlq_n_s64(temp, 6)
+    vst1.64     {d26},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    bgt         inner_loop_wd_4
+
+end_inner_loop_wd_4:
+    subs        r7,r7,#4                    @ht -= 4
+    sub         r0,r5,r11
+    sub         r1,r10,r11,lsl #1
+    bgt         outer_loop_wd_4
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+core_loop_wd_8:
+    @sub            r11,r12,#8
+    lsls        r5,r3,#1
+    rsb         r11,r12,r3, lsl #2          @ r11 = (dst_strd * 4) - width
+    rsb         r8,r12,r2,lsl #2            @r2->src_strd
+    mov         r4,r12, lsr #3              @wd/8 : number of 8-wide column blocks
+    mul         r7, r4                      @r7 = ht * (wd/8)
+    sub         r4,r12,#0                   @wd conditional check
+    sub         r7,r7,#4                    @subtract 4 (one 4-row pass) for the prolog/epilog
+
+prolog:
+    add         r6,r0,r2                    @pu1_src_tmp += src_strd
+    add         r10,r1,r5
+    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
+    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
+    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp)
+    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp)
+    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp)
+    subs        r4,r4,#8                    @wd decrements by 8
+    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
+    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
+    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
+    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)
+    addle       r0,r0,r8
+    add         r6,r0,r2                    @pu1_src_tmp += src_strd
+    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
+    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
+
+    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
+    addle       r1,r1,r11,lsl #1
+    suble       r4,r12,#0                   @wd conditional check
+
+    subs        r7,r7,#4                    @ht - 4
+
+    blt         epilog_end                  @jumps to epilog_end
+    beq         epilog                      @jumps to epilog
+
+
+
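+@ steady-state loop: store the four 16-bit rows produced by the previous
+@ pass, widen and shift the rows already loaded, and load the next four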
+outer_loop_wd_8:
+
+    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    subs        r4,r4,#8                    @wd decrements by 8
+    addle       r0,r0,r8
+
+    add         r6,r0,r2                    @pu1_src_tmp += src_strd
+
+    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
+    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
+
+    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
+
+    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
+    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
+
+    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
+    add         r10,r1,r5
+
+    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)
+
+    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
+
+    addle       r1,r1,r11,lsl #1
+    suble       r4,r12,#0                   @wd conditional check
+
+    subs        r7,r7,#4                    @ht - 4
+    bgt         outer_loop_wd_8
+
+epilog:
+    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp)
+    @add        r6,r0,r2                @pu1_src_tmp += src_strd
+
+    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
+    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
+    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
+    add         r10,r1,r5
+    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)
+
+    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
+epilog_end:
+    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
+
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_luma_horz_w16out.s b/common/arm/ihevc_inter_pred_luma_horz_w16out.s
new file mode 100644
index 0000000..b27b2e8
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_luma_horz_w16out.s
@@ -0,0 +1,603 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@******************************************************************************
+@* @file
+@*  ihevc_inter_pred_luma_horz_w16out.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction  interpolation.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*
+@*  - ihevc_inter_pred_luma_horz_w16out()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*   interprediction luma filter for horizontal 16bit output
+@*
+@* @par description:
+@*     applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+@*     to the elements pointed by 'pu1_src' and writes to the location
+@*     pointed by 'pi2_dst'. no downshifting or clipping is done and the
+@*     output is used as an input for vertical filtering or weighted
+@*     prediction. assumptions: width is a multiple of 4 or 8; if width is
+@*     a multiple of 4 then height should be a multiple of 2; the width-8
+@*     case is optimized further.
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@*  word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
+@                                word16 *pi2_dst,
+@                                word32 src_strd,
+@                                word32 dst_strd,
+@                                word8 *pi1_coeff,
+@                                word32 ht,
+@                                word32 wd
+
+
+@r0 - free
+@r1 - dst_ptr
+@r2 - src_strd
+@r3 - dst_strd
+@r4 - src_ptr2
+@r5 - inner loop counter
+@r6 - dst_ptr2
+@r7 - free
+@r8 - dst_strd2
+@r9 - src_strd1
+@r10 - wd
+@r11 - #1
+@r12 - src_ptr1
+@r14 - loop_counter
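+
+@/* a c sketch of the intent (an assumption based on the description;
+@   the asm below folds the hevc coefficient signs into vabs'd values,
+@   which is equivalent for the standard luma filter sets):
+@
+@       for(row = 0; row < ht; row++)
+@           for(col = 0; col < wd; col++)
+@           {
+@               word32 k, sum = 0;
+@               for(k = 0; k < 8; k++)
+@                   sum += pu1_src[row * src_strd + col + k - 3] * pi1_coeff[k];
+@               pi2_dst[row * dst_strd + col] = (word16)sum; /* no shift, no clip */
+@           }
+@*/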
+.text
+.align 4
+
+
+
+
+
+.globl ihevc_inter_pred_luma_horz_w16out_a9q
+
+.type ihevc_inter_pred_luma_horz_w16out_a9q, %function
+
+ihevc_inter_pred_luma_horz_w16out_a9q:
+
+    bic         r14, #1                     @ clear bit[0] of lr so the final ldm into pc returns in arm mode
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    ldr         r4,[sp,#40]                 @loads pi1_coeff
+    ldr         r7,[sp,#44]                 @loads ht
+
+
+    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
+    sub         r14,r7,#0                   @checks for ht == 0
+    vabs.s8     d2,d0                       @vabs_s8(coeff)
+    mov         r11,#1
+    @ble       end_loops
+    ldr         r10,[sp,#48]                @loads wd
+    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+    sub         r12,r0,#3                   @pu1_src - 3
+    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+    rsb         r9,r10,r2,lsl #1            @2*src_strd - wd
+    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+    rsb         r8,r10,r3                   @dst_strd - wd
+    vdup.8      d28,d2[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
+
+    vdup.8      d29,d2[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
+    and         r7,r14,#1                   @calculating ht_residue ht_residue = (ht & 1)
+    vdup.8      d30,d2[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
+    sub         r14,r14,r7                  @decrement height by ht_residue(residue value is calculated outside)
+    vdup.8      d31,d2[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
+
+    cmp         r7,#1
+    beq         odd_height_decision
+
+even_height_decision:
+    mov         r7,r1
+    cmp         r10,#4
+    ble         outer_loop_4
+
+    cmp         r10,#24
+    moveq       r10,#16
+    addeq       r8,#8
+    addeq       r9,#8
+
+    cmp         r10,#16
+    bge         outer_loop_16_branch
+
+    cmp         r10,#12
+    addeq       r8,#4
+    addeq       r9,#4
+outer_loop_8_branch:
+    b           outer_loop_8
+
+outer_loop_16_branch:
+    b           outer_loop_16
+
+
+odd_height_decision:
+    cmp         r10,#24
+    beq         outer_loop_8_branch
+    cmp         r10,#12
+    beq         outer_loop_4
+    b           even_height_decision
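+
+@ widths 24 and 12 are handled as 16+8 and 8+4 slices: the main loop
+@ covers the wider slice and the residual loops below mop up the rest,
+@ with the r8/r9 adjustments skipping the columns already processed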
+
+outer_loop4_residual:
+    sub         r12,r0,#3                   @pu1_src - 3
+    mov         r1,r7
+    add         r1,#16
+    mov         r10,#4
+    add         r12,#8
+    mov         r14,#16
+    add         r8,#4
+    add         r9,#4
+
+outer_loop_4:
+    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
+    add         r4,r12,r2                   @pu1_src + src_strd
+
+    subs        r5,r10,#0                   @checks wd
+    ble         end_inner_loop_4
+
+inner_loop_4:
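+    @ eight overlapping source windows are gathered with single-byte
+    @ post-increment loads (r11 = 1) instead of vext extracts; the
+    @ equivalent vext sequence is kept below, commented out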
+    vld1.u32    {d0},[r12],r11              @vector load pu1_src
+    vld1.u32    {d1},[r12],r11
+    vld1.u32    {d2},[r12],r11
+    vld1.u32    {d3},[r12],r11
+    vld1.u32    {d4},[r12],r11
+    vld1.u32    {d5},[r12],r11
+    vld1.u32    {d6},[r12],r11
+    vld1.u32    {d7},[r12],r11
+    @add       r12,r12,#4                      @increment the input pointer
+    sub         r12,r12,#4
+    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
+    @vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
+    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
+
+    @vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
+    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
+    @vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
+    @vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
+    vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
+    vld1.u32    {d13},[r4],r11
+    vzip.32     d0,d12                      @vector zip the i and ii iterations into a single register
+    vld1.u32    {d14},[r4],r11
+    vzip.32     d1,d13
+    vld1.u32    {d15},[r4],r11
+    vzip.32     d2,d14
+    vld1.u32    {d16},[r4],r11
+    vzip.32     d3,d15
+    vld1.u32    {d17},[r4],r11
+    vzip.32     d4,d16
+    vld1.u32    {d18},[r4],r11
+    vzip.32     d5,d17
+    vld1.u32    {d19},[r4],r11
+    sub         r4,r4,#4
+    @ add       r4,r4,#4                        @increment the input pointer
+    @ vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
+    @ vext.u8   d15,d12,d13,#3                  @vector extract of src[0_3]
+    @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
+    @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
+    @ vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
+    @ vext.u8   d19,d12,d13,#7                  @vector extract of src[0_7]
+    @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
+
+
+
+
+
+
+
+    vzip.32     d6,d18
+    vzip.32     d7,d19
+
+    vmull.u8    q4,d1,d25                   @filter the i and ii iterations' rows at the same time
+    vmlsl.u8    q4,d0,d24
+    vmlsl.u8    q4,d2,d26
+    vmlal.u8    q4,d3,d27
+    vmlal.u8    q4,d4,d28
+    vmlsl.u8    q4,d5,d29
+    vmlal.u8    q4,d6,d30
+    vmlsl.u8    q4,d7,d31
+
+    @ vqrshrun.s16 d8,q4,#6                     @narrow right shift and saturating the result
+    vst1.64     {d8},[r1]!                  @store the i iteration result from the lower half of q4
+    vst1.64     {d9},[r6]!                  @store the ii iteration result from the upper half of q4
+    subs        r5,r5,#4                    @decrement the wd by 4
+    bgt         inner_loop_4
+
+end_inner_loop_4:
+    subs        r14,r14,#2                  @decrement the ht by 2
+    add         r12,r12,r9                  @increment the input pointer 2*src_strd-wd
+    add         r1,r6,r8,lsl #1             @increment the output pointer 2*dst_strd-wd
+    bgt         outer_loop_4
+
+
+height_residue_4:
+
+    ldr         r7,[sp,#44]                 @loads ht
+    and         r7,r7,#1                    @calculating ht_residue ht_residue = (ht & 1)
+    cmp         r7,#0
+    @beq       end_loops
+    ldmeqfd     sp!,{r4-r12,r15}            @reload the registers from sp
+
+outer_loop_height_residue_4:
+
+
+    subs        r5,r10,#0                   @checks wd
+    ble         end_inner_loop_height_residue_4
+
+inner_loop_height_residue_4:
+    vld1.u32    {d0},[r12],r11              @vector load pu1_src
+    vld1.u32    {d1},[r12],r11
+
+
+
+
+
+
+    @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
+    @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
+    @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
+
+
+
+    @add        r12,r12,#4                      @increment the input pointer
+    @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
+    @ vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
+    @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
+    @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
+    vld1.u32    {d2},[r12],r11
+    vmull.u8    q4,d1,d25                   @8-tap multiply-accumulate interleaved with the loads
+    vld1.u32    {d3},[r12],r11
+    vmlsl.u8    q4,d0,d24
+    vld1.u32    {d4},[r12],r11
+    vmlsl.u8    q4,d2,d26
+    vld1.u32    {d5},[r12],r11
+    vmlal.u8    q4,d3,d27
+    vld1.u32    {d6},[r12],r11
+    vmlal.u8    q4,d4,d28
+    vld1.u32    {d7},[r12],r11
+    vmlsl.u8    q4,d5,d29
+    sub         r12,r12,#4
+    vmlal.u8    q4,d6,d30
+    vmlsl.u8    q4,d7,d31                   @final tap of the multiply-accumulate; result stored below
+    subs        r5,r5,#4                    @decrement the wd by 4
+    vst1.64     {d8},[r1]!
+    bgt         inner_loop_height_residue_4
+
+end_inner_loop_height_residue_4:
+    subs        r7,r7,#1                    @decrement the ht by 1
+    rsb         r9,r10,r2
+    add         r12,r12,r9                  @increment the input pointer src_strd-wd
+    add         r1,r1,r8                    @increment the output pointer dst_strd-wd
+    bgt         outer_loop_height_residue_4
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+outer_loop8_residual:
+    sub         r12,r0,#3                   @pu1_src - 3
+    mov         r1,r7
+    mov         r14,#32
+    add         r1,#32
+    add         r12,#16
+    mov         r10,#8
+    add         r8,#8
+    add         r9,#8
+
+outer_loop_8:
+
+    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
+    add         r4,r12,r2                   @pu1_src + src_strd
+    subs        r5,r10,#0                   @checks wd
+
+    ble         end_inner_loop_8
+
+inner_loop_8:
+    vld1.u32    {d0},[r12],r11              @vector load pu1_src
+    vld1.u32    {d1},[r12],r11
+    vld1.u32    {d2},[r12],r11
+    vld1.u32    {d3},[r12],r11
+
+
+
+
+
+    @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
+    @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
+    @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
+    @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
+    @ vext.u8   d6,d0,d1,#6                     @vector extract of src [0_6]
+    @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
+    @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
+    @ vext.u8   d14,d12,d13,#2
+
+    @vext.u8    d15,d12,d13,#3                  @vector extract of src[0_3]
+    @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
+    @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
+    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
+    @vext.u8    d19,d12,d13,#7                  @vector extract of src[0_7]
+    @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
+    vld1.u32    {d4},[r12],r11
+    vmull.u8    q4,d1,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)@
+    vld1.u32    {d5},[r12],r11
+    vmlal.u8    q4,d3,d27                   @mul_res = vmlal_u8(mul_res, src[0_3], coeffabs_3)@
+    vld1.u32    {d6},[r12],r11
+    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vld1.u32    {d7},[r12],r11
+    vmlsl.u8    q4,d2,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
+    vmlal.u8    q4,d4,d28                   @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+    vld1.u32    {d13},[r4],r11
+    vmlsl.u8    q4,d5,d29                   @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+    vld1.u32    {d14},[r4],r11
+    vmlal.u8    q4,d6,d30                   @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+    vld1.u32    {d15},[r4],r11
+    vmlsl.u8    q4,d7,d31                   @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+    vld1.u32    {d16},[r4],r11              @vector load pu1_src + src_strd
+
+    vmull.u8    q5,d15,d27                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+    vld1.u32    {d17},[r4],r11
+    vmlsl.u8    q5,d14,d26                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vld1.u32    {d18},[r4],r11
+    vmlal.u8    q5,d16,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+    vld1.u32    {d19},[r4],r11              @vector load pu1_src + src_strd
+    vmlsl.u8    q5,d17,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+    @ vqrshrun.s16  d20,q4,#6                       @right shift and saturating narrow result 1
+    vmlal.u8    q5,d18,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+    vmlsl.u8    q5,d19,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+    vst1.16     {q4},[r1]!                  @store the result pu1_dst
+    vmlsl.u8    q5,d12,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vmlal.u8    q5,d13,d25                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+
+    @ vqrshrun.s16 d8,q5,#6                     @right shift and saturating narrow result 2
+    subs        r5,r5,#8                    @decrement the wd loop
+    vst1.16     {q5},[r6]!                  @store the result pu1_dst
+    cmp         r5,#4
+    bgt         inner_loop_8
+
+end_inner_loop_8:
+    subs        r14,r14,#2                  @decrement the ht loop
+    add         r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
+    add         r1,r6,r8,lsl #1             @increment the dst pointer by 2*dst_strd-wd
+    bgt         outer_loop_8
+
+
+
+
+
+    ldr         r10,[sp,#48]                @loads wd
+    cmp         r10,#12
+
+    beq         outer_loop4_residual
+
+    ldr         r7,[sp,#44]                 @loads ht
+    and         r7,r7,#1
+    cmp         r7,#1
+    beq         height_residue_4
+
+@end_loops
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+outer_loop_16:
+    str         r0, [sp, #-4]!
+    str         r7, [sp, #-4]!
+    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
+    add         r4,r12,r2                   @pu1_src + src_strd
+    and         r0, r12, #31
+    sub         r5,r10,#0                   @checks wd
+    @ble       end_loops1
+    pld         [r12, r2, lsl #1]
+    vld1.u32    {q0},[r12],r11              @vector load pu1_src
+    pld         [r4, r2, lsl #1]
+    vld1.u32    {q1},[r12],r11
+    vld1.u32    {q2},[r12],r11
+    vld1.u32    {q3},[r12],r11
+    vld1.u32    {q6},[r12],r11
+    vmull.u8    q4,d2,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)@
+    vld1.u32    {q7},[r12],r11
+    vmlal.u8    q4,d6,d27                   @mul_res = vmlal_u8(mul_res, src[0_3], coeffabs_3)@
+    vld1.u32    {q8},[r12],r11
+    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vld1.u32    {q9},[r12],r11
+    vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+    vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+    vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+    vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+
+
+inner_loop_16:
+
+
+    subs        r5,r5,#16
+    vmull.u8    q10,d3,d25
+
+    add         r12,#8
+    vmlsl.u8    q10,d1,d24
+
+    vld1.u32    {q0},[r4],r11               @vector load pu1_src
+    vmlal.u8    q10,d7,d27
+
+    vld1.u32    {q1},[r4],r11
+    vmlsl.u8    q10,d5,d26
+
+    vld1.u32    {q2},[r4],r11
+    vmlal.u8    q10,d13,d28
+
+    vld1.u32    {q3},[r4],r11
+    vmlal.u8    q10,d17,d30
+
+    vld1.u32    {q6},[r4],r11
+    vmlsl.u8    q10,d15,d29
+
+    vld1.u32    {q7},[r4],r11
+    vmlsl.u8    q10,d19,d31
+
+    vld1.u32    {q8},[r4],r11
+    vmull.u8    q5,d2,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)@
+
+    vld1.u32    {q9},[r4],r11
+    vmlal.u8    q5,d6,d27                   @mul_res = vmlal_u8(mul_res, src[0_3], coeffabs_3)@
+
+    add         r4,#8
+    vmlsl.u8    q5,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    pld         [r12, r2, lsl #2]
+    pld         [r4, r2, lsl #2]
+    vst1.8      {q4},[r1]!                  @store the result pu1_dst
+    vmlsl.u8    q5,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+    addeq       r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
+    vmlal.u8    q5,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+
+    addeq       r4,r12,r2                   @pu1_src + src_strd
+    vmlsl.u8    q5,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+
+@   and         r7, r12, #31
+    vmlal.u8    q5,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+
+    subeq       r14,r14,#2
+    vmlsl.u8    q5,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+
+    @cmp            r7, r0
+    vmull.u8    q11,d3,d25
+
+@   pld     [r12, r2, lsl #2]
+    vmlsl.u8    q11,d1,d24
+
+    vst1.16     {q10},[r1]!
+    vmlal.u8    q11,d7,d27
+
+@   pld     [r4, r2, lsl #2]
+    vmlsl.u8    q11,d5,d26
+
+@   mov         r0, r7
+    vmlal.u8    q11,d13,d28
+
+    cmp         r14,#0
+    vmlal.u8    q11,d17,d30
+
+    vst1.16     {q5},[r6]!
+    vmlsl.u8    q11,d15,d29
+
+    vmlsl.u8    q11,d19,d31
+
+    beq         epilog_16
+
+    vld1.u32    {q0},[r12],r11              @vector load pu1_src
+    vld1.u32    {q1},[r12],r11
+    vld1.u32    {q2},[r12],r11
+    vld1.u32    {q3},[r12],r11
+    vld1.u32    {q6},[r12],r11
+    vmull.u8    q4,d2,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)@
+    vld1.u32    {q7},[r12],r11
+    vmlal.u8    q4,d6,d27                   @mul_res = vmlal_u8(mul_res, src[0_3], coeffabs_3)@
+    vld1.u32    {q8},[r12],r11
+    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+    vld1.u32    {q9},[r12],r11
+    vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+    vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+    cmp         r5,#0
+    vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+    moveq       r5,r10
+    vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+    vst1.8      {q11},[r6]!                 @store the result pu1_dst
+    vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+    addeq       r1,r6,r8,lsl #1
+    addeq       r6,r1,r3,lsl #1             @pu1_dst + dst_strd
+    b           inner_loop_16
+
+
+epilog_16:
+@   vqrshrun.s16 d11,q11,#6
+    vst1.8      {q11},[r6]!                 @store the result pu1_dst
+
+    ldr         r7, [sp], #4
+    ldr         r0, [sp], #4
+    ldr         r10,[sp,#48]
+    cmp         r10,#24
+    beq         outer_loop8_residual
+    add         r1,r6,r8,lsl #1
+    ldr         r7,[sp,#44]                 @loads ht
+    and         r7,r7,#1
+    cmp         r7,#1
+    beq         height_residue_4
+
+end_loops1:
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s b/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
new file mode 100644
index 0000000..c6716fe
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
@@ -0,0 +1,404 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@******************************************************************************
+@* @file
+@*  ihevc_inter_pred_luma_vert_w16inp_w16out.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction interpolation.
+@*  functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*  - ihevc_inter_pred_luma_vert_w16inp_w16out()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+@/* include reconstruction */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    luma vertical filter for 16bit input and 16bit output.
+@*
+@* @par description:
+@*     applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@*     the elements pointed to by 'pi2_src' and writes to the location pointed
+@*     to by 'pu1_dst'. input is 16 bits; the filter output is offset and
+@*     downshifted by 6 so it stays within the 16-bit intermediate range.
+@*     assumptions: the function is optimized assuming the width is a multiple
+@*     of 4 and the height is a multiple of 2.
+@*
+@* @param[in] pi2_src
+@*  word16 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_vert_w16inp_w16out(word16 *pi2_src,
+@                                    uword8 *pu1_dst,
+@                                    word32 src_strd,
+@                                    word32 dst_strd,
+@                                    word8 *pi1_coeff,
+@                                    word32 ht,
+@                                    word32 wd   )
+@**************variables vs registers*****************************************
+@   r0 => *pi2_src
+@   r1 => *pu1_dst
+@   r2 =>  src_strd
+@   r3 =>  dst_strd
+@   r4 => *pi1_coeff
+@   r5 =>  ht
+@   r6 =>  wd
+
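+
+@ the loop below follows, roughly, this c sketch (an illustration only --
+@ 'pi2_dst' and the tap indexing are assumptions, not the checked-in c code):
+@
+@ for(row = 0; row < ht; row++)
+@     for(col = 0; col < wd; col++)
+@     {
+@         word32 sum = 0;
+@         for(tap = 0; tap < 8; tap++)
+@             sum += pi2_src[(row + tap - 3) * src_strd + col] * pi1_coeff[tap];
+@         pi2_dst[row * dst_strd + col] = (word16)((sum - (1 << 19)) >> 6);
+@     }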
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_vert_w16inp_w16out_a9q
+
+.type ihevc_inter_pred_luma_vert_w16inp_w16out_a9q, %function
+
+ihevc_inter_pred_luma_vert_w16inp_w16out_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r12,[sp,#40]                @load pi1_coeff
+    mov         r6,r3,lsl #1
+    ldr         r5,[sp,#48]                 @load wd
+    vld1.8      {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
+    mov         r2, r2, lsl #1
+    sub         r12,r2,r2,lsl #2            @r12 = -3 * src_strd
+    @vabs.s8    d0,d0               @vabs_s8(coeff)
+    add         r0,r0,r12                   @pi2_src -= 3 * src_strd (first filter tap row)
+    ldr         r3,[sp,#44]                 @load ht
+    subs        r7,r3,#0                    @r3->ht
+    @ble        end_loops           @end loop jump
+    vmovl.s8    q0,d0
+    vdup.16     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
+    vdup.16     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
+    vdup.16     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
+    vdup.16     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
+    vdup.16     d26,d1[0]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
+    vdup.16     d27,d1[1]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
+    vdup.16     d28,d1[2]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
+    vdup.16     d29,d1[3]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
+    vmov.i32    q15,#0x80000
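+    @0x80000 = 1 << 19: subtracted from the 32-bit accumulators before the
+    @>>6 narrowing below, keeping the 16-bit intermediate in signed range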
+
+    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
+    rsb         r8,r5,r2,lsl #2             @r2->src_strd
+    sub         r8,r8,r5
+    sub         r9,r9,r5
+    mov         r3, r5, lsr #2              @divide wd by 4
+    mul         r7, r3                      @r7 = ht * (wd / 4), total 4-wide blocks
+    sub         r7, #4                      @reserve one iteration (4 blocks) for the epilog
+    mov         r4,r5                       @r5 ->wd
+    @mov            r2, r2, lsl #1
+
+prolog:
+
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    subs        r4,r4,#4
+    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+
+    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+    addle       r0,r0,r8,lsl #0
+    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+    movle       r4,r5                       @r5 ->wd
+    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+    vsub.s32    q4, q4, q15
+
+    vld1.16     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.s16   q6,d3,d23
+    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q6,d2,d22
+    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q6,d4,d24
+    vmlal.s16   q6,d5,d25
+    vmlal.s16   q6,d6,d26
+    vmlal.s16   q6,d7,d27
+    vmlal.s16   q6,d16,d28
+    vmlal.s16   q6,d17,d29
+    add         r14,r1,r6
+    vsub.s32    q5, q5, q15
+    vshrn.s32   d8, q4, #6
+    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    vmull.s16   q7,d4,d23
+    vmlal.s16   q7,d3,d22
+    vmlal.s16   q7,d5,d24
+    vmlal.s16   q7,d6,d25
+    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d7,d26
+    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d16,d27
+    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d17,d28
+    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d18,d29
+    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
+    vsub.s32    q6, q6, q15
+    vshrn.s32   d10, q5, #6
+    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
+    addle       r1,r1,r9
+
+    subs        r7,r7,#4
+
+
+    blt         epilog_end                  @jumps to epilog_end
+    beq         epilog                      @jumps to epilog
+
+kernel_8:
+
+    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+    subs        r4,r4,#4
+    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+    addle       r0,r0,r8,lsl #0
+    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+    vst1.32     {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
+
+    vsub.s32    q7, q7, q15
+    vshrn.s32   d12, q6, #6
+    @vqrshrun.s16 d12,q6,#6
+    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+    vst1.32     {d12},[r14],r6
+
+    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+
+    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+    vsub.s32    q4, q4, q15
+    vshrn.s32   d14, q7, #6
+    @vqrshrun.s16 d14,q7,#6
+
+    vmull.s16   q6,d3,d23
+    movle       r4,r5                       @r5 ->wd
+
+    vmlal.s16   q6,d2,d22
+    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+
+    vmlal.s16   q6,d4,d24
+    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
+
+    vmlal.s16   q6,d5,d25
+
+    vmlal.s16   q6,d6,d26
+    vst1.32     {d14},[r14],r6
+
+    vmlal.s16   q6,d7,d27
+    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+
+    vmlal.s16   q6,d16,d28
+    add         r14,r1,r6
+
+    vmlal.s16   q6,d17,d29
+    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+    vsub.s32    q5, q5, q15
+    vshrn.s32   d8, q4, #6
+    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
+    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+
+    vmull.s16   q7,d4,d23
+    vmlal.s16   q7,d3,d22
+    vmlal.s16   q7,d5,d24
+    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+    vmlal.s16   q7,d6,d25
+    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d7,d26
+    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d16,d27
+    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d17,d28
+    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
+    vmlal.s16   q7,d18,d29
+    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
+
+    vsub.s32    q6, q6, q15
+    vshrn.s32   d10, q5, #6
+    addle       r1,r1,r9
+
+    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
+    subs        r7,r7,#4
+
+    bgt         kernel_8                    @jumps to kernel_8
+
+epilog:
+
+    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+    vst1.32     {d10},[r14],r6
+
+    vsub.s32    q7, q7, q15
+    vshrn.s32   d12, q6, #6
+    @vqrshrun.s16 d12,q6,#6
+
+    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
+    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+    vst1.32     {d12},[r14],r6
+
+    vsub.s32    q4, q4, q15
+    vshrn.s32   d14, q7, #6
+    @vqrshrun.s16 d14,q7,#6
+
+    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
+    vmull.s16   q6,d3,d23
+    vmlal.s16   q6,d2,d22
+    vmlal.s16   q6,d4,d24
+    vmlal.s16   q6,d5,d25
+    vmlal.s16   q6,d6,d26
+    vmlal.s16   q6,d7,d27
+    vmlal.s16   q6,d16,d28
+    vmlal.s16   q6,d17,d29
+    vst1.32     {d14},[r14],r6
+    vsub.s32    q5, q5, q15
+    vshrn.s32   d8, q4, #6
+    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
+    vmull.s16   q7,d4,d23
+    vmlal.s16   q7,d3,d22
+    vmlal.s16   q7,d5,d24
+    vmlal.s16   q7,d6,d25
+    vmlal.s16   q7,d7,d26
+    vmlal.s16   q7,d16,d27
+    vmlal.s16   q7,d17,d28
+    vmlal.s16   q7,d18,d29
+    vsub.s32    q6, q6, q15
+    vshrn.s32   d10, q5, #6
+    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
+
+    add         r14,r1,r6
+    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
+
+epilog_end:
+    vst1.32     {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
+    vshrn.s32   d12, q6, #6
+    @vqrshrun.s16 d12,q6,#6
+
+    vst1.32     {d12},[r14],r6
+    vsub.s32    q7, q7, q15
+    vshrn.s32   d14, q7, #6
+    @vqrshrun.s16 d14,q7,#6
+
+    vst1.32     {d14},[r14],r6
+
+
+end_loops:
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_dc.s b/common/arm/ihevc_intra_pred_chroma_dc.s
new file mode 100644
index 0000000..72d9730
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_dc.s
@@ -0,0 +1,292 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_chroma_dc.s
+@*
+@* @brief
+@*  contains function definitions for intra prediction dc filtering.
+@*  functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    chroma intra prediction filter for dc input
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
+@                                word32 src_strd,
+@                                uword8 *pu1_dst,
+@                                word32 dst_strd,
+@                                word32 nt,
+@                                word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+@   pi1_coeff
+
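+
+@ a schematic c sketch of the dc rule implemented below (illustrative only;
+@ the exact offsets into pu1_ref are assumptions): the u and v dc values are
+@ averaged separately over the 2*nt interleaved left and top reference
+@ samples, then replicated over the whole block:
+@
+@ for(i = 0; i < 2 * nt; i++)
+@ {
+@     sum_u += pu1_ref[ref_off + 2 * i];        /* even bytes: u */
+@     sum_v += pu1_ref[ref_off + 2 * i + 1];    /* odd bytes:  v */
+@ }
+@ dc_u = (sum_u + nt) >> (log2_nt + 1);
+@ dc_v = (sum_v + nt) >> (log2_nt + 1);
+@ for(row = 0; row < nt; row++)
+@     for(col = 0; col < nt; col++)
+@     {
+@         pu1_dst[row * dst_strd + 2 * col]     = dc_u;
+@         pu1_dst[row * dst_strd + 2 * col + 1] = dc_v;
+@     }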
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_dc_a9q
+
+.type ihevc_intra_pred_chroma_dc_a9q, %function
+
+ihevc_intra_pred_chroma_dc_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    mov         r9, #0
+    vmov        d17, r9, r9
+
+    clz         r5, r4                      @counts leading zeros
+
+    add         r6, r0, r4,lsl #1           @&src[2nt]
+    vmov        d18, r9, r9
+    rsb         r5, r5, #32                 @log2nt
+    add         r7, r0, r4, lsl #2          @&src[4nt]
+    mov         r12,r5
+    add         r8, r7, #2                  @&src[4nt+2]
+
+    cmp         r4, #4
+    beq         dc_4                        @nt=4 loop
+
+
+add_loop:
+    vld2.s8     {d30,d31}, [r6]!            @load from src[nt]
+    lsl         r10,r4,#1                   @2nt
+
+    vpaddl.u8   d2, d30
+    subs        r10, #0x10
+
+    vld2.s8     {d26,d27}, [r8]!            @load from src[2nt+1]
+
+    vpaddl.u8   d3, d31
+    vpaddl.u16  d2, d2
+    vpaddl.u16  d3, d3
+
+    vpadal.u32  d17, d2
+
+    vpadal.u32  d18, d3
+
+    vpaddl.u8   d2, d26
+    vpaddl.u8   d3, d27
+
+    vpaddl.u16  d2, d2
+    vpaddl.u16  d3, d3
+
+    vpadal.u32  d17, d2
+    vpadal.u32  d18, d3
+
+    beq         epil_add_loop
+
+core_loop_add:
+    vld2.s8     {d30,d31}, [r6]!            @load from src[nt]
+    vpaddl.u8   d28, d30
+    vpaddl.u8   d3, d31
+
+    vld2.s8     {d26,d27}, [r8]!            @load from src[2nt+1]
+
+    vpaddl.u16  d3, d3
+    vpaddl.u16  d29, d28
+
+    vpadal.u32  d18, d3
+    vpadal.u32  d17, d29
+
+    vpaddl.u8   d3, d27
+    vpaddl.u8   d28, d26
+
+    vpaddl.u16  d3, d3
+    vpaddl.u16  d29, d28
+
+    vpadal.u32  d18, d3
+    vpadal.u32  d17, d29
+
+
+epil_add_loop:
+
+    vmov.32     r1,d18[0]
+    vmov.32     r11,d17[0]
+
+    add         r1,r1,r4
+    add         r11,r11,r4
+
+    lsr         r1,r1,r12
+    lsr         r11,r11,r12
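+    @dc per component = (sum + nt) >> (log2(nt) + 1), averaging the 2*nt
+    @left and top reference samples of each interleaved component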
+
+    vdup.8      d17,r1
+    vdup.8      d16,r11
+
+prologue_cpy_32:
+
+    add         r5, r2, r3
+    subs        r9, r4, #8
+    lsl         r6, r3, #2
+    moveq       r11,r6
+    add         r8, r5, r3
+    add         r10, r8, r3
+
+    beq         epilogue_copy
+
+    vst2.8      {d16,d17}, [r2]!
+    add         r6, r6, #0xfffffff0
+
+    vst2.8      {d16,d17}, [r5]!
+    vst2.8      {d16,d17}, [r8]!
+    movne       r11,#16
+    vst2.8      {d16,d17}, [r10]!
+
+
+    vst2.8      {d16,d17}, [r2], r6
+    vst2.8      {d16,d17}, [r5], r6
+    vst2.8      {d16,d17}, [r8], r6
+    vst2.8      {d16,d17}, [r10], r6
+
+kernel_copy:
+    vst2.8      {d16,d17}, [r2]!
+    vst2.8      {d16,d17}, [r5]!
+    vst2.8      {d16,d17}, [r8]!
+    vst2.8      {d16,d17}, [r10]!
+
+    vst2.8      {d16,d17}, [r2], r6
+    vst2.8      {d16,d17}, [r5], r6
+    vst2.8      {d16,d17}, [r8], r6
+    vst2.8      {d16,d17}, [r10], r6
+
+    vst2.8      {d16,d17}, [r2]!
+    vst2.8      {d16,d17}, [r5]!
+    vst2.8      {d16,d17}, [r8]!
+    vst2.8      {d16,d17}, [r10]!
+
+    vst2.8      {d16,d17}, [r2], r6
+    vst2.8      {d16,d17}, [r5], r6
+    vst2.8      {d16,d17}, [r8], r6
+    vst2.8      {d16,d17}, [r10], r6
+
+epilogue_copy:
+    vst2.8      {d16,d17}, [r2],r11
+    vst2.8      {d16,d17}, [r5],r11
+    vst2.8      {d16,d17}, [r8],r11
+    vst2.8      {d16,d17}, [r10],r11
+
+    vst2.8      {d16,d17}, [r2]
+    vst2.8      {d16,d17}, [r5]
+    vst2.8      {d16,d17}, [r8]
+    vst2.8      {d16,d17}, [r10]
+    b           end_func
+
+dc_4:
+    vld2.s8     {d30,d31},[r6]              @load from src[nt]
+    vshl.i64    d3,d30,#32
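+    @the left shift by 32 keeps only the first four de-interleaved samples
+    @(moved to the upper half, rest zeroed), so the pairwise adds below sum
+    @exactly nt = 4 samples per component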
+
+    vld2.s8     {d26,d27},[r8]              @load from src[2nt+1]
+    vshl.i64    d2,d31,#32
+
+    vpaddl.u8   d3,d3
+    vpaddl.u8   d2,d2
+    vpaddl.u16  d3,d3
+    vpaddl.u16  d2,d2
+    vpadal.u32  d17,d3
+    vpadal.u32  d18,d2
+
+    vshl.i64    d3,d26,#32
+    vshl.i64    d2,d27,#32
+    vpaddl.u8   d3,d3
+    vpaddl.u8   d2,d2
+    vpaddl.u16  d3,d3
+    vpaddl.u16  d2,d2
+    vpadal.u32  d17,d3
+    vpadal.u32  d18,d2
+
+    vmov.32     r10,d17[0]
+    vmov.32     r11,d18[0]
+
+    add         r10,r10,r4
+    add         r11,r11,r4
+    lsr         r10,r10,r12
+    lsr         r11,r11,r12
+    orr         r10,r10,r11,lsl #8
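+    @pack the two component dc values into one 16-bit lane (first interleaved
+    @component in the low byte) so vdup.16 replicates the whole (u,v) pair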
+    vdup.16     d0,r10
+
+    vst1.8      {d0},[r2],r3
+    vst1.8      {d0},[r2],r3
+    vst1.8      {d0},[r2],r3
+    vst1.8      {d0},[r2]
+
+end_func:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_horz.s b/common/arm/ihevc_intra_pred_chroma_horz.s
new file mode 100644
index 0000000..6089fd8
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_horz.s
@@ -0,0 +1,346 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_chroma_horz.s
+@*
+@* @brief
+@*  contains function definition for intra prediction  interpolation filters
+@*
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*  - ihevc_intra_pred_chroma_horz()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*     intra prediction interpolation filter for horizontal chroma prediction.
+@*
+@* @par description:
+@*      horizontal intra prediction (mode 10) from neighboring samples pointed
+@*      to by 'pu1_ref' to the tu block pointed to by 'pu1_dst'; refer to
+@*      section 8.4.4.2.6 in the standard (special case)
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  integer transform block size
+@*
+@* @param[in] mode
+@*  integer intraprediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
+@                                  word32 src_strd,
+@                                  uword8 *pu1_dst,
+@                                  word32 dst_strd,
+@                                  word32 nt,
+@                                  word32 mode)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 =>  src_strd
+@r2 => *pu1_dst
+@r3 =>  dst_strd
+
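+
+@ a schematic c equivalent of the copy performed below (illustrative only;
+@ the index of the left-neighbor pair in pu1_ref is an assumption): each row
+@ replicates one left-neighbor (u,v) pair across the whole row:
+@
+@ for(row = 0; row < nt; row++)
+@     for(col = 0; col < nt; col++)
+@     {
+@         pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[2 * (two_nt - 1 - row)];
+@         pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[2 * (two_nt - 1 - row) + 1];
+@     }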
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_horz_a9q
+
+.type ihevc_intra_pred_chroma_horz_a9q, %function
+
+ihevc_intra_pred_chroma_horz_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+
+    lsl         r6,r4,#2                    @four_nt
+
+    add         r12,r0,r6                   @*pu1_ref[four_nt]
+    cmp         r4,#4                       @if nt == 4
+    beq         core_loop_4
+
+    cmp         r4,#8                       @if nt == 8
+    beq         core_loop_8
+
+    @cmp            r4,#16                          @if nt == 16
+    @beq            core_loop_16
+
+    sub         r12,r12,#16                 @move to 16th value pointer
+    add         r9,r2,#16
+
+core_loop_16:
+    vld1.16     {q0},[r12]                  @load 16 values. d1[7] will have the 1st value.
+    sub         r12,r12,#16
+    vld1.16     {q5},[r12]                  @load 16 values. d1[7] will have the 1st value.
+
+    vdup.16     q1,d1[3]                    @duplicate the i value.
+
+    vdup.16     q2,d1[2]                    @duplicate the ii value.
+    vdup.16     q3,d1[1]                    @duplicate the iii value.
+    vst1.16     {q1},[r2],r3                @store in 1st row 0-16 columns
+    vst1.16     {q1},[r9],r3                @store in 1st row 16-32 columns
+
+    vdup.16     q4,d1[0]
+    vst1.16     {q2},[r2],r3
+    vst1.16     {q2},[r9],r3
+
+    vdup.16     q1,d0[3]
+    vst1.16     {q3},[r2],r3
+    vst1.16     {q3},[r9],r3
+
+    vdup.16     q2,d0[2]
+    vst1.16     {q4},[r2],r3
+    vst1.16     {q4},[r9],r3
+
+    vdup.16     q3,d0[1]
+    vst1.16     {q1},[r2],r3
+    vst1.16     {q1},[r9],r3
+
+    vdup.16     q4,d0[0]
+    vst1.16     {q2},[r2],r3
+    vst1.16     {q2},[r9],r3
+
+    vdup.16     q1,d11[3]
+    vst1.16     {q3},[r2],r3
+    vst1.16     {q3},[r9],r3
+
+    vdup.16     q2,d11[2]
+    vst1.16     {q4},[r2],r3
+    vst1.16     {q4},[r9],r3
+
+    vdup.16     q3,d11[1]
+    vst1.16     {q1},[r2],r3
+    vst1.16     {q1},[r9],r3
+
+    vdup.16     q4,d11[0]
+    vst1.16     {q2},[r2],r3
+    vst1.16     {q2},[r9],r3
+
+    vdup.16     q1,d10[3]
+    vst1.16     {q3},[r2],r3
+    vst1.16     {q3},[r9],r3
+
+    vdup.16     q2,d10[2]
+    vst1.16     {q4},[r2],r3
+    vst1.16     {q4},[r9],r3
+
+    vdup.16     q3,d10[1]
+    vst1.16     {q1},[r2],r3
+    vst1.16     {q1},[r9],r3
+    sub         r12,r12,#16                 @move to 16th value pointer
+
+    vdup.16     q4,d10[0]
+    vst1.16     {q2},[r2],r3
+    vst1.16     {q2},[r9],r3
+
+    subs        r4,r4,#16                   @decrement the loop count by 16
+    vst1.16     {q3},[r2],r3
+    vst1.16     {q3},[r9],r3
+
+    vst1.16     {q4},[r2],r3
+    vst1.16     {q4},[r9],r3
+    bgt         core_loop_16
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+    b           endloop
+
+core_loop_8:
+    ldrb        lr,[r12],#1                 @pu1_ref[two_nt]
+    @vld1.8     {q15},[r12]                     @pu1_ref[two_nt + 1 + col]
+
+    vdup.8      d28,lr
+    sub         r12,r12,#17
+    vld1.8      {q0},[r12]
+
+    sub         r12,r12,#16
+    vld1.8      {q15},[r12]
+    vdup.16     q5,d1[3]
+    @vmovl.u8   q13,d26
+
+    vdup.16     q1,d1[2]
+    @vsubl.u8   q12,d30,d28
+
+    vdup.16     q2,d1[1]
+    @vshr.s16   q12,q12,#1
+
+    vdup.16     q3,d1[0]
+    @vqadd.s16  q11,q13,q12
+
+    vdup.16     q4,d0[3]
+    @vqmovun.s16 d22,q11
+
+    vst1.16     {q5},[r2],r3
+
+    vdup.16     q5,d0[2]
+    @vsubl.u8   q12,d31,d28
+
+    vdup.16     q6,d0[1]
+    @vshr.s16   q12,q12,#1
+
+    vdup.16     q7,d0[0]
+    @vqadd.s16  q11,q13,q12
+
+    vdup.16     q8,d0[3]
+    @vqmovun.s16 d22,q11
+
+    vst1.16     {q1},[r2],r3
+    @sub            r2,r2,#8
+
+    vst1.16     {q2},[r2],r3
+
+    vst1.16     {q3},[r2],r3
+    vst1.16     {q4},[r2],r3
+    vst1.16     {q5},[r2],r3
+
+    @vdup.8     q1,d0[2]
+    vst1.16     {q6},[r2],r3
+
+    @vdup.8     q2,d0[1]
+    vst1.16     {q7},[r2],r3
+
+    @vdup.8     q3,d0[0]
+    @vst1.8     {q7},[r2],r3
+
+    @vdup.8     q4,d0[3]
+    @vst1.8     {q8},[r2],r3
+
+    @vdup.8     q5,d0[2]
+    @vst1.8     {q1},[r2],r3
+
+    @vdup.8     q6,d0[1]
+    @vst1.8     {q2},[r2],r3
+
+    @vdup.8     q7,d0[0]
+    @vst1.8     {q3},[r2],r3
+
+    @vst1.8     {q4},[r2],r3
+    @vst1.8     {q5},[r2],r3
+    @vst1.8     {q6},[r2],r3
+    @vst1.8     {q7},[r2],r3
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+    b           endloop
+
+
+core_loop_4:
+    ldrb        lr,[r12]                    @pu1_ref[two_nt]
+    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
+    @vld1.8     {d30},[r12]                     @pu1_ref[two_nt + 1 + col]
+
+    sub         r12,r12,#9
+    vld1.8      {d0},[r12]
+    sub         r12,r12,#8
+    vld1.8      {d30},[r12]
+    vdup.16     d26,d0[3]
+    vdup.8      d28,lr
+
+    vdup.16     d3,d0[2]
+    vmovl.u8    q13,d26
+
+    vdup.16     d4,d0[1]
+    vsubl.u8    q12,d30,d28
+
+    vdup.16     d5,d0[0]
+    vshr.s16    q12,q12,#1
+
+    vdup.16     d6,d0[3]
+    vqadd.s16   q11,q13,q12
+
+    vdup.16     d7,d0[2]
+    vqmovun.s16 d22,q11
+
+    vst1.8      {d6},[r2],r3
+    vst1.8      {d3},[r2],r3
+
+    vdup.16     d8,d0[1]
+    vst1.8      {d4},[r2],r3
+    vst1.8      {d5},[r2],r3
+
+    vdup.16     d9,d0[0]
+    @vst1.8     {d6},[r2],r3
+    @vst1.8     {d7},[r2],r3
+
+    @vst1.8     {d8},[r2],r3
+    @vst1.8     {d9},[r2],r3
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+    b           endloop
+
+
+@core_loop_4
+    ldrb        lr,[r12]                    @pu1_ref[two_nt]
+    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
+    vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col]
+
+    sub         r12,r12,#5
+    vld1.8      {d0},[r12]
+    vdup.8      d28,lr
+    vdup.8      d26,d0[3]
+    vmovl.u8    q13,d26
+
+    vdup.8      d3,d0[2]
+    vsubl.u8    q12,d30,d28
+
+    vdup.8      d4,d0[1]
+    vshr.s16    q12,q12,#1
+
+    vdup.8      d5,d0[0]
+    vqadd.s16   q11,q13,q12
+
+    vqmovun.s16 d22,q11
+
+    vst1.32     {d22[0]},[r2],r3
+    vst1.32     {d3[0]},[r2],r3
+    vst1.32     {d4[0]},[r2],r3
+    vst1.32     {d5[0]},[r2],r3
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+endloop:
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_mode2.s b/common/arm/ihevc_intra_pred_chroma_mode2.s
new file mode 100644
index 0000000..cfa2ddb
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_mode2.s
@@ -0,0 +1,299 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_chroma_mode2.s
+@*
+@* @brief
+@*  contains function definitions for intra prediction mode 2 filtering.
+@*  functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    chroma intra prediction filter for mode 2
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_mode2(uword8 *pu1_ref,
+@                                 word32 src_strd,
+@                                 uword8 *pu1_dst,
+@                                 word32 dst_strd,
+@                                 word32 nt,
+@                                 word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+@   pi1_coeff
+
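+
+@ a schematic c sketch of mode 2 (illustrative only; 'ref_base' and the
+@ diagonal index are assumptions): the left reference is read backwards one
+@ (u,v) pair per column, and each row starts one pair further along:
+@
+@ for(row = 0; row < nt; row++)
+@     for(col = 0; col < nt; col++)
+@     {
+@         idx = row + col + 1;
+@         pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[ref_base - 2 * idx];
+@         pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[ref_base - 2 * idx + 1];
+@     }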
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode2_a9q
+
+.type ihevc_intra_pred_chroma_mode2_a9q, %function
+
+ihevc_intra_pred_chroma_mode2_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    mov         r8,#-4
+
+    cmp         r4,#4
+    beq         mode2_4
+
+    add         r0,r0,r4,lsl #2
+
+    sub         r0,r0,#0x12                 @src[1]
+    add         r10,r0,#-2
+
+prologue_cpy_32:
+
+    vld2.8      {d0,d1},[r0],r8
+
+    mov         r11,r4
+    vrev64.8    d16,d0
+    vrev64.8    d17,d1
+
+    vld2.8      {d2,d3},[r10],r8
+    mov         r6, r2
+
+    vld2.8      {d4,d5},[r0],r8
+    vld2.8      {d6,d7},[r10],r8
+    lsr         r1, r4, #3
+
+    vld2.8      {d8,d9},[r0],r8
+    vld2.8      {d10,d11},[r10],r8
+    vld2.8      {d12,d13},[r0],r8
+    mul         r1, r4, r1
+
+    vld2.8      {d14,d15},[r10],r8
+    add         r7,r6,r3
+
+    vrev64.8    d18,d2
+    vrev64.8    d19,d3
+    lsl         r5, r3, #2
+
+    vrev64.8    d20,d4
+    vrev64.8    d21,d5
+    add         r9,r7,r3
+
+    vrev64.8    d22,d6
+    vrev64.8    d23,d7
+
+    vrev64.8    d24,d8
+    vrev64.8    d25,d9
+
+    vrev64.8    d26,d10
+    subs        r1,r1,#8
+
+    vrev64.8    d27,d11
+
+    vrev64.8    d28,d12
+    vrev64.8    d29,d13
+
+    vrev64.8    d30,d14
+    add         r14,r9,r3
+    vrev64.8    d31,d15
+
+    beq         epilogue_mode2
+
+    sub         r12,r4,#8
+
+kernel_mode2:
+
+    vst2.8      {d16,d17},[r6],r5
+    vst2.8      {d18,d19},[r7],r5
+    subs        r11,r11,#8
+    vst2.8      {d20,d21},[r9],r5
+    vst2.8      {d22,d23},[r14],r5
+    vst2.8      {d24,d25},[r6],r5
+    addgt       r2,r2,#16
+    vst2.8      {d26,d27},[r7],r5
+    vst2.8      {d28,d29},[r9],r5
+    vst2.8      {d30,d31},[r14],r5
+
+    vld2.8      {d0,d1},[r0],r8
+    movle       r11,r4
+
+    vld2.8      {d2,d3},[r10],r8
+    vld2.8      {d4,d5},[r0],r8
+    addle       r2, r2, r3, lsl #2
+    vld2.8      {d6,d7},[r10],r8
+    vrev64.8    d16,d0
+
+    vld2.8      {d8,d9},[r0],r8
+    vld2.8      {d10,d11},[r10],r8
+    suble       r2, r6,#16
+    vld2.8      {d12,d13},[r0],r8
+    vrev64.8    d17,d1
+    vld2.8      {d14,d15},[r10],r8
+
+    subs        r12,r12,#8
+    mov         r6, r2
+    addle       r0, r0, r4,lsl #1
+    add         r7, r6, r3
+
+    vrev64.8    d18,d2
+    suble       r0, r0, #16
+    vrev64.8    d19,d3
+
+    vrev64.8    d20,d4
+    movle       r12,r4
+    vrev64.8    d21,d5
+
+    vrev64.8    d22,d6
+    add         r9, r7, r3
+    vrev64.8    d23,d7
+
+    vrev64.8    d24,d8
+    add         r10,r0,#-2
+    vrev64.8    d25,d9
+
+    vrev64.8    d26,d10
+    subs        r1, r1, #8
+    vrev64.8    d27,d11
+
+    vrev64.8    d28,d12
+    vrev64.8    d29,d13
+
+    vrev64.8    d30,d14
+    add         r14, r9, r3
+    vrev64.8    d31,d15
+
+    bne         kernel_mode2
+
+epilogue_mode2:
+
+    vst2.8      {d16,d17},[r6],r5
+    vst2.8      {d18,d19},[r7],r5
+    vst2.8      {d20,d21},[r9],r5
+    vst2.8      {d22,d23},[r14],r5
+    vst2.8      {d24,d25},[r6],r5
+    vst2.8      {d26,d27},[r7],r5
+    vst2.8      {d28,d29},[r9],r5
+    vst2.8      {d30,d31},[r14],r5
+
+    b           end_func
+
+mode2_4:
+
+    lsl         r12,r4,#1
+    add         r0,r0,r12
+    sub         r0,r0,#2
+
+    vld2.8      {d12,d13},[r0],r8
+    vshl.i64    d0,d12,#32
+    add         r10,r0,#2
+    vshl.i64    d1,d13,#32
+
+    vrev64.8    d0,d0
+    vld2.8      {d14,d15},[r10],r8
+    vshl.i64    d2,d14,#32
+
+    vrev64.8    d1,d1
+    vshl.i64    d3,d15,#32
+    vzip.8      d0,d1
+    vst1.8      {d0},[r2],r3
+
+    vrev64.8    d2,d2
+    vld2.8      {d16,d17},[r0],r8
+    vshl.i64    d4,d16,#32
+    vrev64.8    d3,d3
+    vshl.i64    d5,d17,#32
+    vzip.8      d2,d3
+    vrev64.8    d4,d4
+    vrev64.8    d5,d5
+    vst1.8      {d2},[r2],r3
+
+
+    vld2.8      {d18,d19},[r10],r8
+    vshl.i64    d6,d18,#32
+
+    vzip.8      d4,d5
+    vshl.i64    d7,d19,#32
+    vrev64.8    d6,d6
+    vst1.8      {d4},[r2],r3
+
+    vrev64.8    d7,d7
+    vzip.8      d6,d7
+    vst1.8      {d6},[r2],r3
+
+end_func:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_18_34.s b/common/arm/ihevc_intra_pred_chroma_mode_18_34.s
new file mode 100644
index 0000000..b0dd1fa
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_mode_18_34.s
@@ -0,0 +1,190 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_chroma_mode_18_34.s
+@*
+@* @brief
+@*  contains function definitions for intra prediction modes 18 and 34.
+@*  functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    chroma intra prediction filter for modes 18 and 34
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_mode_18_34(uword8 *pu1_ref,
+@                                      word32 src_strd,
+@                                      uword8 *pu1_dst,
+@                                      word32 dst_strd,
+@                                      word32 nt,
+@                                      word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+@   pi1_coeff
+
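+
+@ both modes are pure 45-degree diagonals, so every output row is a straight
+@ copy of the reference shifted by one (u,v) pair per row -- forwards for
+@ mode 34, backwards for mode 18. schematically ('ref_start' is an assumed
+@ offset into pu1_ref):
+@
+@ step = (mode == 34) ? 2 : -2;
+@ for(row = 0; row < nt; row++)
+@     memcpy(pu1_dst + row * dst_strd, ref_start + row * step, 2 * nt);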
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode_18_34_a9q
+
+.type ihevc_intra_pred_chroma_mode_18_34_a9q, %function
+
+ihevc_intra_pred_chroma_mode_18_34_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+
+    ldr         r4,[sp,#40]
+    ldr         r5,[sp,#44]
+
+    cmp         r4,#4
+    beq         mode2_4
+
+    mov         r12,r4
+    mov         r11,r4
+    add         r0,r0,r4,lsl #2
+
+    cmp         r5,#0x22
+    mov         r10,r2
+
+    add         r0,r0,#4
+
+    subne       r0,r0,#4
+    moveq       r6,#2
+    movne       r6,#-2
+    mov         r8,r0
+
+
+kernel:
+
+
+    vld1.8      {d0,d1},[r8],r6
+    vst1.8      {d0,d1},[r10],r3
+    vld1.8      {d2,d3},[r8],r6
+    vst1.8      {d2,d3},[r10],r3
+    vld1.8      {d4,d5},[r8],r6
+    vst1.8      {d4,d5},[r10],r3
+    vld1.8      {d6,d7},[r8],r6
+    vst1.8      {d6,d7},[r10],r3
+    vld1.8      {d8,d9},[r8],r6
+    vst1.8      {d8,d9},[r10],r3
+    vld1.8      {d10,d11},[r8],r6
+    vst1.8      {d10,d11},[r10],r3
+    vld1.8      {d12,d13},[r8],r6
+    vst1.8      {d12,d13},[r10],r3
+    vld1.8      {d14,d15},[r8],r6
+    vst1.8      {d14,d15},[r10],r3
+
+    subs        r12,r12,#8
+    bne         kernel
+
+    cmp         r11,#16
+    add         r8,r0,#16
+    add         r10,r2,#16
+    sub         r11,#16
+    mov         r12,#16
+    beq         kernel
+    b           end_func
+
+mode2_4:
+
+    add         r0,r0,#20
+    cmp         r5,#0x22
+    subne       r0,r0,#4
+
+    moveq       r8,#2
+    movne       r8,#-2
+
+    vld1.8      {d0},[r0],r8
+    vst1.32     {d0},[r2],r3
+
+    vld1.8      {d0},[r0],r8
+    vst1.32     {d0},[r2],r3
+
+    vld1.8      {d0},[r0],r8
+    vst1.32     {d0},[r2],r3
+
+    vld1.8      {d0},[r0],r8
+    vst1.32     {d0},[r2],r3
+
+end_func:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
new file mode 100644
index 0000000..f2431e1
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
@@ -0,0 +1,542 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_chroma_mode_27_to_33.s
+@*
+@* @brief
+@*  contains function definitions for intra prediction interpolation filters
+@*
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*  - ihevc_intra_pred_chroma_mode_27_to_33()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*  intra prediction for modes 27 to 33 (positive angle, vertical modes)
+@* with reference to the neighboring samples pointed to by 'pu1_ref',
+@* writing to the tu block pointed to by 'pu1_dst'
+@*
+@* @par description:
+@*
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  integer transform block size
+@*
+@* @param[in] mode
+@*  integer intraprediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@.if intra_pred_chroma_27_to_33 == c
+@void ihevc_intra_pred_chroma_mode_27_to_33(uword8 *pu1_ref,
+@                                        word32 src_strd,
+@                                         uword8 *pu1_dst,
+@                                         word32 dst_strd,
+@                                         word32 nt,
+@                                         word32 mode)
+
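+
+@/* editorial note: c sketch of the interpolation implemented below
+@* (hypothetical helper, not part of this file).  pos, idx and fract
+@* follow the asm comments; each chroma sample is a 2-tap filter between
+@* a reference u/v pair and the next pair (+2 bytes):
+@*
+@*   void angular_row_sketch(uword8 *ref_main, uword8 *dst, word32 nt,
+@*                           word32 intra_pred_ang, word32 row)
+@*   {
+@*       word32 pos   = (row + 1) * intra_pred_ang;
+@*       word32 idx   = pos >> 5;     // whole-sample offset
+@*       word32 fract = pos & 31;     // 1/32-pel fraction
+@*       for(word32 col = 0; col < 2 * nt; col++)
+@*           dst[col] = (uword8)(((32 - fract) * ref_main[2 * idx + col]
+@*                     + fract * ref_main[2 * idx + col + 2] + 16) >> 5);
+@*   }
+@*/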
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode_27_to_33_a9q
+.extern gai4_ihevc_ang_table
+.extern gau1_ihevc_planar_factor
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor  - ulbl2 - 8
+
+
+.type ihevc_intra_pred_chroma_mode_27_to_33_a9q, %function
+
+ihevc_intra_pred_chroma_mode_27_to_33_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    ldr         r5,[sp,#44]                 @loads mode
+    ldr         r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
+ulbl1:
+    add         r6,r6,pc
+
+    lsl         r7,r4,#2                    @four_nt
+
+    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
+    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
+    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
+ulbl2:
+    add         r1,r1,pc
+    add         r6,r1,#1
+
+    tst         r4,#7
+    add         r8,r0,r7                    @pu1_ref + four_nt
+    mov         lr,#0                       @row
+    mov         r12,r4
+    bne         core_loop_4
+    lsl         r4,r4,#1
+    b           core_loop_8
+
+core_loop_8:
+    add         r8,r8,#2                    @pu1_ref_main_idx += (four_nt + 1)
+    vdup.8      d0,r9                       @intra_pred_ang
+    mov         r12,r4,lsr #4               @divide by 8
+
+    vmov.i8     d1,#32
+    mul         r7,r4,r12
+
+    vmov.i16    q3,#31
+
+    mov         r1,r8
+    mov         r5,r4
+    mov         r11,#2
+
+prologue:
+    vld1.8      {d3},[r6]                   @loads the row value
+    vmull.u8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+    vmovn.i16   d4,q2
+    vshrn.u16   d5,q1,#5                    @idx = pos >> 5
+
+    vdup.8      d31,d4[0]
+    add         r0,r2,r3
+
+    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register
+    lsl         lr,lr,#1
+
+    vdup.8      d29,d4[1]                   @(ii)
+    and         r9,lr,#0xff                 @(i row) get the last byte
+
+    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]
+
+    asr         lr,lr,#8                    @(ii)shift by 8
+    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
+    and         r9,lr,#0xff                 @(ii)get the last byte
+
+    asr         lr,lr,#8                    @(iii)
+    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
+    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]
+
+    and         r9,lr,#0xff                 @(iii)
+    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+    asr         lr,lr,#8                    @(iv)
+
+    vdup.8      d27,d4[2]                   @(iii)
+    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
+    and         r9,lr,#0xff                 @(iv)
+
+    vdup.8      d25,d4[3]                   @(iv)
+    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
+    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
+    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
+    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
+
+    vdup.8      d31,d4[4]                   @(v)
+    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vmov.u32    lr,d5[1]                    @extract idx to the r register
+    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    lsl         lr,lr,#1
+
+    vst1.8      {d10},[r2]!                 @(i row)
+    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    and         r9,lr,#0xff                 @(v)
+    vdup.8      d29,d4[5]                   @(vi)
+    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
+    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
+
+    asr         lr,lr,#8                    @(vi)
+    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    and         r9,lr,#0xff                 @(vi)
+
+    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
+    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d14},[r0],r3               @(ii)
+    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
+    vdup.8      d27,d4[6]                   @(vii)
+    asr         lr,lr,#8                    @(vii)
+
+    and         r9,lr,#0xff                 @(vii)
+    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d18},[r0],r3               @(iii)
+    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    asr         lr,lr,#8                    @(viii)
+    vdup.8      d25,d4[7]                   @(viii)
+    and         r9,lr,#0xff                 @(viii)
+
+    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
+    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)
+
+    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
+    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
+    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subs        r7,r7,#8
+
+    vst1.8      {d22},[r0],r3               @(iv)
+    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
+    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
+    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    addgt       r8,r8,#8
+    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subgt       r4,r4,#8
+
+    vst1.8      {d10},[r0],r3               @(v)
+    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+    beq         epilogue
+
+    vld1.8      {d5},[r6]                   @loads the row value
+    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+    vmovn.i16   d4,q2
+    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
+    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
+    lsl         lr,lr,#1
+    and         r9,lr,#0xff                 @(i)
+    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+    asr         lr,lr,#8                    @(ii)
+    vdup.8      d31,d4[0]
+    subs        r4,r4,#8
+
+    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
+    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
+    and         r9,lr,#0xff                 @(ii)
+    addle       r6,r6,#8                    @increment the row value
+
+    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
+    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d5},[r6]                   @loads the row value
+    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    asr         lr,lr,#8                    @(iii)
+
+    vdup.8      d29,d4[1]                   @(ii)
+    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+    and         r9,lr,#0xff                 @(iii)
+
+    vst1.8      {d14},[r0],r3               @(vi)
+    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+    asr         lr,lr,#8                    @(iv)
+
+    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+    and         r9,lr,#0xff                 @(iv)
+
+    vmov.u32    lr,d3[1]                    @extract idx to the r register
+    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vdup.8      d27,d4[2]                   @(iii)
+    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
+    movle       r4,r5                       @reload nt
+
+    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
+    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]
+
+    vst1.8      {d18},[r0],r3               @(vii)
+    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
+    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vdup.8      d25,d4[3]                   @(iv)
+    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
+
+    vst1.8      {d22},[r0]                  @(viii)
+    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
+    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    lsl         lr,lr,#1
+
+    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
+    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         r0,r2,r3
+
+    vdup.8      d31,d4[4]                   @(v)
+    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+    and         r9,lr,#0xff                 @(v)
+
+    vst1.8      {d10},[r2]!                 @(i)
+    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
+
+    vdup.8      d29,d4[5]                   @(vi)
+    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    asr         lr,lr,#8                    @(vi)
+
+    vdup.8      d27,d4[6]                   @(vii)
+    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+    and         r9,lr,#0xff                 @(vi)
+
+    vdup.8      d25,d4[7]                   @(viii)
+    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+    asr         lr,lr,#8                    @(vii)
+
+    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
+    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
+    and         r9,lr,#0xff                 @(vii)
+
+    vst1.8      {d14},[r0],r3               @(ii)
+    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+    asr         lr,lr,#8                    @(viii)
+
+    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
+    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
+    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+    and         r9,lr,#0xff                 @(viii)
+
+    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
+    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
+    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)
+
+    vst1.8      {d18},[r0],r3               @(iii)
+    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+    movle       r8,r1                       @reload the source to pu1_src+2nt
+
+    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
+    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    addgt       r8,r8,#8                    @increment the source next set 8 columns in same row
+
+    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
+    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
+    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
+    lslle       r12,r3,#3
+
+    vst1.8      {d22},[r0],r3               @(iv)
+    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    suble       r12,r12,r5
+
+    vst1.8      {d10},[r0],r3               @(v)
+    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    addle       r2,r2,r12                   @increment the dst pointer to 8*dst_strd - nt
+
+    vmovn.i16   d4,q2
+    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+    lsl         lr,lr,#1
+
+    and         r9,lr,#0xff                 @(i)
+    subs        r7,r7,#8
+    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]
+
+    bne         kernel_8_rows
+
+epilogue:
+    vst1.8      {d14},[r0],r3               @(vi)
+    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
+    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d18},[r0],r3               @(vii)
+    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vst1.8      {d22},[r0],r3               @(viii)
+    b           end_loops
+
+core_loop_4:
+    add         r10,r8,#2                   @pu1_ref_main_idx += (four_nt + 1)
+    add         r11,r8,#4                   @pu1_ref_main_idx_1 += (four_nt + 2)
+    mov         r8,#0
+
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    and         r5,r5,#31                   @fract = pos & (31)
+    cmp         lr,r5                       @if(fract_prev > fract)
+    addgt       r10,r10,#2                  @pu1_ref_main_idx += 2
+    add         r11,r10,#2                  @pu1_ref_main_idx_1 += 2
+    vdup.8      d0,r5                       @dup_const_fract
+    rsb         r4,r5,#32
+    vdup.8      d1,r4                       @dup_const_32_fract
+
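+
+@/* editorial note: in this 4x4 path the fractional position is tracked
+@* incrementally rather than recomputed per column; in c terms (sketch,
+@* names as in the asm comments): whenever fract wraps past 32, the
+@* fract_prev > fract comparison fires and the reference pointer is
+@* advanced by one u/v pair:
+@*
+@*   if(fract_prev > fract)      // pos crossed a whole-sample boundary
+@*       pu1_ref_main_idx += 2;  // advance one interleaved u/v pair
+@*/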
+@inner_loop_4
+    vld1.8      {d2},[r10]                  @ref_main_idx
+    add         r8,r8,#1
+    mov         lr,r5                       @fract_prev = fract
+
+    vld1.8      {d3},[r11]                  @ref_main_idx_1
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    and         r5,r5,#31                   @fract = pos & (31)
+    cmp         lr,r5                       @if(fract_prev > fract)
+    addgt       r10,r10,#2                  @pu1_ref_main_idx += 2
+    add         r11,r10,#2                  @pu1_ref_main_idx_1 += 2
+
+    vdup.8      d6,r5                       @dup_const_fract
+    vmull.u8    q2,d2,d1                    @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    rsb         r4,r5,#32
+    vdup.8      d7,r4                       @dup_const_32_fract
+    vmlal.u8    q2,d3,d0                    @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d8},[r10]                  @ref_main_idx
+    add         r8,r8,#1
+
+    vld1.8      {d9},[r11]                  @ref_main_idx_1
+    vrshrn.i16  d4,q2,#5                    @shift_res = vrshrn_n_u16(add_res, 5)
+
+    mov         lr,r5                       @fract_prev = fract
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    and         r5,r5,#31                   @fract = pos & (31)
+    cmp         lr,r5                       @if(fract_prev > fract)
+    addgt       r10,r10,#2                  @pu1_ref_main_idx += 2
+    add         r11,r10,#2                  @pu1_ref_main_idx_1 += 2
+
+    vdup.8      d12,r5                      @dup_const_fract
+    vmull.u8    q5,d8,d7                    @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    rsb         r4,r5,#32
+    vdup.8      d13,r4                      @dup_const_32_fract
+    vmlal.u8    q5,d9,d6                    @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d14},[r10]                 @ref_main_idx
+    add         r8,r8,#1
+
+    vst1.8      {d4},[r2],r3
+    vrshrn.i16  d10,q5,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d15},[r11]                 @ref_main_idx_1
+    mov         lr,r5                       @fract_prev = fract
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    and         r5,r5,#31                   @fract = pos & (31)
+    cmp         lr,r5                       @if(fract_prev > fract)
+    addgt       r10,r10,#2                  @pu1_ref_main_idx += 2
+    add         r11,r10,#2                  @pu1_ref_main_idx_1 += 2
+
+    vdup.8      d18,r5                      @dup_const_fract
+    vmull.u8    q8,d14,d13                  @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    rsb         r4,r5,#32
+    vdup.8      d19,r4                      @dup_const_32_fract
+    vmlal.u8    q8,d15,d12                  @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d20},[r10]                 @ref_main_idx
+
+    vst1.8      {d10},[r2],r3
+    vrshrn.i16  d16,q8,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
+    vld1.8      {d21},[r11]                 @ref_main_idx_1
+
+    vmull.u8    q11,d20,d19                 @vmull_u8(ref_main_idx, dup_const_32_fract)
+    vmlal.u8    q11,d21,d18                 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d16},[r2],r3
+    vrshrn.i16  d22,q11,#5                  @shift_res = vrshrn_n_u16(add_res, 5)
+
+    vst1.8      {d22},[r2],r3
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s b/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
new file mode 100644
index 0000000..a5eb3ca
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
@@ -0,0 +1,497 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_chroma_mode_3_to_9.s
+@*
+@* @brief
+@*  contains function definitions for chroma intra prediction for modes
+@* 3 to 9. functions are coded using neon intrinsics and can be compiled
+@* using rvct
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    chroma intra prediction filter for modes 3 to 9
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  intra prediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
+@                                       word32 src_strd,
+@                                       uword8 *pu1_dst,
+@                                       word32 dst_strd,
+@                                       word32 nt,
+@                                       word32 mode)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+
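+@/* editorial note: modes 3 to 9 use negative prediction angles, so the
+@* reference index walks backwards with increasing position.  the kernel
+@* below precomputes the (col+1)*intra_pred_ang products eight columns at
+@* a time and gathers the reference pairs with vtbl; per sample, the
+@* arithmetic is the same 2-tap filter as in the positive-angle kernels
+@* (c sketch with hypothetical names, not part of this file):
+@*
+@*   pos   = (col + 1) * intra_pred_ang;   // intra_pred_ang < 0 here
+@*   idx   = pos >> 5;                     // arithmetic shift
+@*   fract = pos & 31;
+@*   pred  = ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5;
+@*/
+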
+.text
+.align 4
+
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode_3_to_9_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_chroma
+.extern idx_neg_idx_chroma_3_9
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl2 - 8
+
+
+idx_neg_idx_chroma_3_9_addr:
+.long idx_neg_idx_chroma_3_9 - ulbl3 - 8
+
+col_for_intra_chroma_addr_1:
+.long col_for_intra_chroma - ulbl4 - 8
+
+col_for_intra_chroma_addr_2:
+.long col_for_intra_chroma - ulbl5 - 8
+
+col_for_intra_chroma_addr_3:
+.long col_for_intra_chroma - ulbl6 - 8
+
+.type ihevc_intra_pred_chroma_mode_3_to_9_a9q, %function
+
+ihevc_intra_pred_chroma_mode_3_to_9_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    ldr         r7, gai4_ihevc_ang_table_addr
+ulbl1:
+    add         r7,r7,pc
+
+    ldr         r5,[sp,#44]                 @mode (3 to 9)
+    ldr         r8, gai4_ihevc_inv_ang_table_addr
+ulbl2:
+    add         r8,r8,pc
+
+    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
+    ldr         r7, [r7]                    @intra_pred_ang
+    vdup.8      d30, r7                     @intra_pred_ang
+
+    ldr         r14, col_for_intra_chroma_addr_1
+ulbl4:
+    add         r14,r14,pc
+
+prologue_8_16_32:
+    lsr         r10, r4, #3
+    vld1.8      d31, [r14]!
+    mul         r10, r4, r10                @block counter (dec by #4)
+
+    mov         r11, r4, lsl #1             @col counter to be inc/dec by #8
+    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
+
+    sub         r7, r5, #3
+    ldr         r12, idx_neg_idx_chroma_3_9_addr @load most idx table
+ulbl3:
+    add         r12,r12,pc
+
+    add         r12, r12, r7, lsl #4
+    mov         r8, r12
+
+    mov         r7, #8
+    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3
+
+    ldr         r9, [r8]
+    mov         r9, r9, lsl #1
+    add         r1, r0, r4, lsl #2          @pu1_ref + 4*nt
+
+    vmovn.s16   d6, q11
+    vdup.8      d26, r9                     @most idx added to final idx values
+    sub         r1, r1, #26                 @ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
+
+    sub         r6, r1, r9
+
+    vld1.8      {d0,d1,d2,d3}, [r6]         @loads the 32 values required based on the index values (from most idx)
+    vshr.s16    q11, q11, #5
+
+    vmov.i8     d29, #31                    @contains #31 for vand operation
+
+    vmov.i8     d28, #32
+
+    vqmovn.s16  d8, q11
+    vshl.s8     d8, d8, #1                  @ 2 * idx
+
+    vand        d6, d6, d29                 @fract values in d1/ idx values in d0
+    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1
+
+    movw        r0,#0x302                   @ idx value for v is +1 of u
+    vdup.u16    d27,r0
+    mov         r0,#0
+
+    vmov.i8     d9, #22                     @row 0 to 7
+
+    vsub.s8     d8, d8, d27                 @ref_main_idx (sub row)
+    vsub.s8     d8, d26, d8                 @ref_main_idx (row 0)
+    vadd.s8     d8, d8, d9                  @to compensate the pu1_src idx incremented by 8
+    vsub.s8     d9, d8, d29                 @ref_main_idx + 1 (row 0)
+    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
+    vsub.s8     d7, d28, d6                 @32-fract
+
+    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
+    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)
+    vsub.s8     d5, d9, d29                 @ref_main_idx + 1 (row 1)
+
+    vmov.i8     d29, #4
+
+    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
+    vmull.u8    q12, d12, d7                @mul (row 0)
+    vmlal.u8    q12, d13, d6                @mul (row 0)
+
+    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
+    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
+    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 2)
+
+    vrshrn.i16  d24, q12, #5                @round shft (row 0)
+
+    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
+    vmull.u8    q11, d16, d7                @mul (row 1)
+    vmlal.u8    q11, d17, d6                @mul (row 1)
+
+    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
+    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
+    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 3)
+
+    vst1.8      d24, [r2], r3               @st (row 0)
+    vrshrn.i16  d22, q11, #5                @round shft (row 1)
+
+    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
+    vmull.u8    q10, d14, d7                @mul (row 2)
+    vmlal.u8    q10, d15, d6                @mul (row 2)
+
+    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
+    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
+    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 4)
+
+    vst1.8      d22, [r2], r3               @st (row 1)
+    vrshrn.i16  d20, q10, #5                @round shft (row 2)
+
+    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
+    vmull.u8    q9, d10, d7                 @mul (row 3)
+    vmlal.u8    q9, d11, d6                 @mul (row 3)
+
+    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
+    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
+    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 5)
+
+    vst1.8      d20, [r2], r3               @st (row 2)
+    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
+
+    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
+    vmull.u8    q12, d12, d7                @mul (row 4)
+    vmlal.u8    q12, d13, d6                @mul (row 4)
+
+    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
+    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
+    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 6)
+
+    vst1.8      d18, [r2], r3               @st (row 3)
+    cmp         r4,#4
+    beq         end_func
+    vrshrn.i16  d24, q12, #5                @round shft (row 4)
+
+    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
+    vmull.u8    q11, d16, d7                @mul (row 5)
+    vmlal.u8    q11, d17, d6                @mul (row 5)
+
+    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
+    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
+    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 7)
+
+    vst1.8      d24, [r2], r3               @st (row 4)
+    vrshrn.i16  d22, q11, #5                @round shft (row 5)
+
+    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vst1.8      d22, [r2], r3               @st (row 5)
+    vrshrn.i16  d20, q10, #5                @round shft (row 6)
+    vrshrn.i16  d18, q9, #5                 @round shft (row 7)
+
+    vst1.8      d20, [r2], r3               @st (row 6)
+
+    subs        r10, r10, #4                @decrement block counter and go to end if done
+
+    vst1.8      d18, [r2], r3               @st (row 7)
+
+    beq         end_func
+
+    subs        r11, r11, #8                @decrement the processed col
+    addgt       r8, r8, #4
+    addgt       r2, r2, r7
+    movle       r8, r12
+    suble       r2, r2, r4
+    addle       r2, r2, #8
+    movle       r11, r4, lsl #1
+    ldrle       r14, col_for_intra_chroma_addr_2
+ulbl5:
+    addle       r14,r14,pc
+    addle       r0, r0, #8
+
+    vld1.8      d31, [r14]!
+    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
+    vmovn.s16   d10, q6
+    vshr.s16    q6, q6, #5
+    vqmovn.s16  d11, q6
+    vshl.s8     d11, d11, #1
+    movw        r5, #0x302                  @idx value for v is +1 of u
+    vdup.u16    d27, r5                     @row value inc or reset accordingly
+    ldr         r9, [r8]                    @loads index value
+    mov         r9, r9, lsl #1
+    mov         r5, #22
+    sub         r5, r5, r0, lsl #1
+    vdup.8      d16, r5
+    vdup.8      d26, r9
+
+    mov         r5,r2
+    vsub.s8     d11, d11, d27               @ref_main_idx (sub row)
+
+kernel_8_16_32:
+    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1
+    vsub.s8     d8, d26, d11                @ref_main_idx
+    vmov        d26,d10
+
+    subs        r11, r11, #8
+    sub         r6, r1, r9
+    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
+    vadd.s8     d8, d8, d16                 @to compensate the pu1_src idx incremented by 8
+
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx - 1 (row 7)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    addle       r0, r0, #8
+    vsub.s8     d9, d8, d29                 @ref_main_idx - 2
+    addgt       r8, r8, #4
+
+    vld1.8      {d0,d1,d2,d3}, [r6]         @loads the 32 values required based on the index values (from most idx)
+    vrshrn.i16  d22, q11, #5                @round shft (row 5)
+
+    ldrle       r14, col_for_intra_chroma_addr_3
+ulbl6:
+    addle       r14,r14,pc
+    vst1.8      d24, [r5], r3               @st (row 4)
+    movle       r8, r12
+
+    movw        r9,#0x302
+    vdup.16     d27, r9                     @row value inc or reset accordingly
+    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)
+
+    vsub.s8     d5, d9, d29                 @ref_main_idx - 1 (row 1)
+    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
+    vmov.i8     d29, #31                    @contains #31 for vand to extract fract
+
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vld1.8      d31, [r14]!
+    vand        d6, d29, d26                @fract values in d1/ idx values in d0
+
+    movle       r11, r4, lsl #1
+    vmov.i8     d29, #4                     @contains #4 for stepping ref_main_idx across rows
+    ldr         r9, [r8]
+
+    vst1.8      d22, [r5], r3               @(from previous loop)st (row 5)
+    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)
+
+    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
+    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
+    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 2)
+
+    mov         r9,r9,lsl #1
+    vsub.s8     d7, d28, d6                 @32-fract
+
+    vmull.u8    q12, d12, d7                @mul (row 0)
+    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
+    vmlal.u8    q12, d13, d6                @mul (row 0)
+
+    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
+    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)
+
+    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
+    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
+    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 3)
+
+    vmull.u8    q11, d10, d7                @mul (row 1)
+    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
+    vmlal.u8    q11, d17, d6                @mul (row 1)
+
+    vrshrn.i16  d24, q12, #5                @round shft (row 0)
+    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)
+
+    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
+    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
+    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 4)
+
+    vmull.u8    q10, d14, d7                @mul (row 2)
+    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
+    vmlal.u8    q10, d15, d6                @mul (row 2)
+
+    add         r5,r2,r3,lsl#2
+    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
+    add         r9, r9, r0, lsl #1
+
+    vst1.8      d24, [r2], r3               @st (row 0)
+    vrshrn.i16  d22, q11, #5                @round shft (row 1)
+
+    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
+    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
+    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 5)
+
+    vmull.u8    q9, d10, d7                 @mul (row 3)
+    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
+    vmlal.u8    q9, d11, d6                 @mul (row 3)
+
+    vst1.8      d22, [r2], r3               @st (row 1)
+    vrshrn.i16  d20, q10, #5                @round shft (row 2)
+
+    vmovn.s16   d10, q7
+    vshr.s16    q7, q7, #5
+
+    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
+    vtbl.8      d21, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
+    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 6)
+
+    vmull.u8    q12, d12, d7                @mul (row 4)
+    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
+    vqmovn.s16  d11, q7
+
+    vst1.8      d20, [r2], r3               @st (row 2)
+    vmlal.u8    q12, d13, d6                @mul (row 4)
+
+    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
+    vdup.8      d26, r9
+
+    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
+    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
+    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 7)
+
+    mov         r6, #22                     @to compensate the 2*row value
+    vshl.u8     d11,#1
+    sub         r6, r6, r0, lsl #1
+
+    vmull.u8    q11, d21, d7                @mul (row 5)
+    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
+    vmlal.u8    q11, d17, d6                @mul (row 5)
+
+    vst1.8      d18, [r2], r3               @st (row 3)
+    vrshrn.i16  d24, q12, #5                @round shft (row 4)
+
+    add         r2,r2,r3, lsl #2
+    vdup.8      d16, r6
+    addgt       r2, r7, r2
+
+    suble       r2, r2, r4
+    vsub.s8     d11, d11, d27               @ref_main_idx (add row)
+    suble       r2,r2,#8
+
+    subs        r10, r10, #4                @decrement block counter; fall through to epilogue when done
+
+    bne         kernel_8_16_32
+
+epil_8_16_32:
+    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
+
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    vst1.8      d24, [r5], r3               @st (row 4)
+    vrshrn.i16  d24, q11, #5                @round shft (row 5)
+
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
+    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)
+
+    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
+    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)
+
+    vst1.8      d18, [r5], r3               @st (row 7)
+
+end_func:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_planar.s b/common/arm/ihevc_intra_pred_chroma_planar.s
new file mode 100644
index 0000000..30b3144
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_planar.s
@@ -0,0 +1,363 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_chroma_planar.s
+@*
+@* @brief
+@*  contains function definitions for chroma planar intra prediction.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@*  akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    chroma intra prediction filter for planar mode
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  intra prediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_planar(uword8* pu1_ref,
+@                                    word32 src_strd,
+@                                    uword8* pu1_dst,
+@                                    word32 dst_strd,
+@                                    word32 nt,
+@                                    word32 mode,
+@                                    word32 pi1_coeff)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+@   pi1_coeff
+
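+@/* editorial note: c sketch of the planar blend computed below
+@* (hypothetical helper, not part of this file; the u/v interleaving and
+@* doubled byte offsets of the chroma layout are omitted for clarity).
+@* the four weighted terms match the asm comments:
+@*
+@*   uword8 planar_sketch(uword8 *src, word32 nt, word32 row, word32 col)
+@*   {
+@*       word32 log2nt = 0;
+@*       while((1 << log2nt) < nt) log2nt++;
+@*       return (uword8)(((row + 1)      * src[nt - 1]            // bottom-left
+@*                      + (nt - 1 - row) * src[2 * nt + 1 + col]  // top
+@*                      + (col + 1)      * src[3 * nt + 1]        // top-right
+@*                      + (nt - 1 - col) * src[2 * nt - 1 - row]  // left
+@*                      + nt) >> (log2nt + 1));
+@*   }
+@*/
+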
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_planar_a9q
+.extern gau1_ihevc_planar_factor
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor - ulbl1 - 8
+
+.type ihevc_intra_pred_chroma_planar_a9q, %function
+
+ihevc_intra_pred_chroma_planar_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
+ulbl1:
+    add         r11,r11,pc
+
+    clz         r5, r4
+    rsb         r5, r5, #32
+    vdup.16     q7, r5
+    vneg.s16    q7, q7                      @shr value (so vneg)
+    vdup.8      d2, r4                      @nt
+    vdup.s16    q8, r4                      @nt
+
+    sub         r6, r4, #1                  @nt-1
+    add         r6, r0,r6,lsl #1            @2*(nt-1)
+    ldr         r7, [r6]
+    vdup.s16    d0, r7                      @src[nt-1]
+
+    add         r6, r4, r4,lsl #1           @3nt
+    add         r6, r6, #1                  @3nt + 1
+    lsl         r6,r6,#1                    @2*(3nt + 1)
+
+    add         r6, r6, r0
+    ldr         r7, [r6]
+    vdup.s16    d1, r7                      @src[3nt+1]
+
+
+    add         r6, r4, r4                  @2nt
+    add         r14, r6, #1                 @2nt+1
+    lsl         r14,#1                      @2*(2nt+1)
+    sub         r6, r6, #1                  @2nt-1
+    lsl         r6,#1                       @2*(2nt-1)
+    add         r6, r6, r0                  @&src[2nt-1]
+    add         r14, r14, r0                @&src[2nt+1]
+
+    mov         r8, #1                      @row+1 (row is first 0)
+    sub         r9, r4, r8                  @nt-1-row (row is first 0)
+
+    vdup.s8     d5, r8                      @row + 1
+    vdup.s8     d6, r9                      @nt - 1 - row
+    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
+
+    add         r12, r11, #1                @coeffs (to be reloaded after every row)
+    mov         r1, r4                      @nt (row counter) (dec after every row)
+    mov         r5, r2                      @dst (to be reloaded after every row and inc by dst_strd)
+    mov         r10, #8                     @increment for the coeffs
+    mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)
+
+    cmp         r4, #4
+    beq         tf_sz_4
+
+
+
+    mov         r10,r6
+tf_sz_8_16:
+    vld1.s8     {d10,d11}, [r14]!           @load src[2nt+1+col]
+    vld1.s8     d8, [r12]!
+    vmov        d9,d8
+    vzip.8      d8,d9
+    vsub.s8     d30, d2, d8                 @[nt-1-col]
+    vsub.s8     d31, d2, d9
+
+
+
+
+loop_sz_8_16:
+
+    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
+    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
+    ldr         r11, [r6], #-2              @src[2nt-1-row] (dec to take into account row)
+    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
+    vdup.s16    d4, r7                      @src[2nt-1-row]
+    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
+    vdup.s16    d3, r11                     @src[2nt-1-row]
+    vmlal.u8    q6, d30, d4                 @(nt-1-col) *   src[2nt-1-row]
+
+
+
+    vmull.u8    q14,d5,d0
+    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
+    vmlal.u8    q14,d6,d11
+    vadd.s8     d18, d5, d7                 @row++ [(row+1)++]c
+
+
+    vmlal.u8    q14,d31,d4
+    vsub.s8     d19, d6, d7                 @[nt-1-row]--
+    vmlal.u8    q14,d9,d1
+    vdup.s16    d4, r7                      @src[2nt-1-row]
+
+    vmull.u8    q13, d18, d0                @(row+1)    *   src[nt-1]
+    vadd.i16    q6, q6, q8                  @add (nt)
+    vmlal.u8    q13, d19, d10               @(nt-1-row) *   src[2nt+1+col]
+    vshl.s16    q6, q6, q7                  @shr
+    vmlal.u8    q13, d8, d1                 @(col+1)    *   src[3nt+1]
+    vadd.i16    q14,q14,q8
+    vmlal.u8    q13, d30, d3                @(nt-1-col) *   src[2nt-1-row]
+    vshl.s16    q14,q14,q7
+
+
+
+
+
+    vmull.u8    q12,d18,d0
+    vadd.s8     d5, d18, d7                 @row++ [(row+1)++]
+    vmlal.u8    q12,d19,d11
+    vsub.s8     d6, d19, d7                 @[nt-1-row]--
+    vmlal.u8    q12,d9,d1
+    vmovn.i16   d12, q6
+    vmlal.u8    q12,d31,d3
+    vmovn.i16   d13,q14
+
+
+
+
+    vadd.i16    q13, q13, q8                @add (nt)
+    vmull.u8    q11, d5, d0                 @(row+1)    *   src[nt-1]
+    vshl.s16    q13, q13, q7                @shr
+    vmlal.u8    q11, d6, d10                @(nt-1-row) *   src[2nt+1+col]
+    vst1.s32    {d12,d13}, [r2], r3
+    vmlal.u8    q11, d8, d1                 @(col+1)    *   src[3nt+1]
+    vadd.i16    q12,q12,q8
+    vmlal.u8    q11, d30, d4                @(nt-1-col) *   src[2nt-1-row]
+    vshl.s16    q12,q12,q7
+
+    vmull.u8    q10,d5,d0
+    vadd.s8     d18, d5, d7                 @row++ [(row+1)++]c
+    vmlal.u8    q10,d6,d11
+    vsub.s8     d19, d6, d7                 @[nt-1-row]--
+    vmlal.u8    q10,d31,d4
+
+    ldr         r11, [r6], #-2              @src[2nt-1-row] (dec to take into account row)
+    vmlal.u8    q10,d9,d1
+    vdup.s16    d3, r11                     @src[2nt-1-row]
+    vadd.i16    q11, q11, q8                @add (nt)
+
+    vmull.u8    q6, d18, d0                 @(row+1)    *   src[nt-1]
+    vmovn.i16   d26, q13
+    vmlal.u8    q6, d19, d10                @(nt-1-row) *   src[2nt+1+col]
+    vmovn.i16   d27,q12
+
+    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
+    vshl.s16    q11, q11, q7                @shr
+
+    vmlal.u8    q6, d30, d3                 @(nt-1-col) *   src[2nt-1-row]
+    vadd.i16    q10,q10,q8
+
+    vmull.u8    q14,d18,d0
+    vst1.s32    {d26,d27}, [r2], r3
+
+    vmlal.u8    q14,d19,d11
+    vadd.s8     d5, d18, d7                 @row++ [(row+1)++]
+
+    vsub.s8     d6, d19, d7                 @[nt-1-row]--
+    vmlal.u8    q14,d9,d1
+
+    vmlal.u8    q14,d31,d3
+    vshl.s16    q10,q10,q7
+
+
+    vadd.i16    q6, q6 ,q8                  @add (nt)
+    vmovn.i16   d22, q11
+
+
+    vadd.i16    q14,q14,q8
+    vmovn.i16   d23,q10
+
+
+    vshl.s16    q6, q6, q7                  @shr
+    vst1.s32    {d22,d23}, [r2], r3
+    vshl.s16    q14,q14,q7
+
+
+
+
+
+    vmovn.i16   d20, q6
+    vmovn.i16   d21,q14
+
+    vst1.s32    {d20,d21}, [r2], r3
+
+
+    subs        r1, r1, #4
+
+    bne         loop_sz_8_16
+
+
+
+
+    cmp         r4,#16
+
+    bne         end_loop
+
+
+    sub         r4,#16
+    vdup.s8     d5, r8                      @row + 1
+    vdup.s8     d6, r9                      @nt - 1 - row
+    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
+
+    mov         r6,r10
+    mov         r1,#16
+    sub         r2,r2,r3,lsl #4
+    add         r2,r2,#16
+
+    vld1.s8     {d10,d11}, [r14]!           @load src[2nt+1+col]
+    vld1.s8     d8, [r12]!
+    vmov        d9,d8
+    vzip.8      d8,d9
+    vsub.s8     d30, d2, d8                 @[nt-1-col]
+    vsub.s8     d31, d2, d9
+
+    beq         loop_sz_8_16
+
+
+
+tf_sz_4:
+    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
+    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
+    vmov        d9,d8
+    vzip.8      d8,d9
+loop_sz_4:
+    @mov        r10, #4             @reduce inc to #4 for 4x4
+    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
+    vdup.s16    d4, r7                      @src[2nt-1-row]
+
+    vsub.s8     d9, d2, d8                  @[nt-1-col]
+
+    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
+    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
+    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
+    vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
+@   vadd.i16    q6, q6, q8          @add (nt)
+@   vshl.s16    q6, q6, q7          @shr
+@   vmovn.i16   d12, q6
+    vrshrn.s16  d12,q6,#3
+
+    vst1.s32    {d12}, [r2], r3
+
+    vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
+    vsub.s8     d6, d6, d7                  @[nt-1-row]--
+    subs        r1, r1, #1
+
+    bne         loop_sz_4
+
+end_loop:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_ver.s b/common/arm/ihevc_intra_pred_chroma_ver.s
new file mode 100644
index 0000000..b68a045
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_ver.s
@@ -0,0 +1,229 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_chroma_ver.s
+@*
+@* @brief
+@*  contains function definitions for chroma vertical intra prediction.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@*  yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    chroma intra prediction filter for vertical mode
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  intra prediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_ver(uword8 *pu1_ref,
+@        word32 src_strd,
+@        uword8 *pu1_dst,
+@        word32 dst_strd,
+@        word32 nt,
+@        word32 mode)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+
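+@/**
+@* Hedged C sketch of what this routine does (identifiers illustrative,
+@* not the decoder's C path): vertical prediction replicates the row of
+@* reference samples above the block into every destination row.  For
+@* UV-interleaved chroma the top row starts at byte offset 4*nt + 2
+@* (i.e. sample pair 2*nt + 1):
+@*
+@*     for (row = 0; row < nt; row++)
+@*         for (col = 0; col < 2 * nt; col++)
+@*             pu1_dst[row * dst_strd + col] = pu1_ref[4 * nt + 2 + col];
+@*/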
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_ver_a9q
+
+.type ihevc_intra_pred_chroma_ver_a9q, %function
+
+ihevc_intra_pred_chroma_ver_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    lsl         r5, r4, #2                  @4nt
+
+
+    cmp         r4, #8
+    beq         blk_8
+    blt         blk_4
+
+copy_16:
+    add         r5, r5, #2                  @2nt+2
+    add         r6, r0, r5                  @&src[2nt+1]
+
+    add         r5, r2, r3                  @pu1_dst + dst_strd
+    vld2.8      {d20,d21}, [r6]!            @16 loads (col 0:15)
+    add         r8, r5, r3
+
+    add         r10, r8, r3
+    vld2.8      {d22,d23}, [r6]             @16 loads (col 16:31)
+    lsl         r11, r3, #2
+
+    add         r11, r11, #0xfffffff0
+
+
+    vst2.8      {d20,d21}, [r2]!
+    vst2.8      {d20,d21}, [r5]!
+    vst2.8      {d20,d21}, [r8]!
+    vst2.8      {d20,d21}, [r10]!
+
+    vst2.8      {d22,d23}, [r2], r11
+    vst2.8      {d22,d23}, [r5], r11
+    vst2.8      {d22,d23}, [r8], r11
+    vst2.8      {d22,d23}, [r10], r11
+
+    subs        r4, r4, #4
+
+kernel_copy_16:
+    vst2.8      {d20,d21}, [r2]!
+    vst2.8      {d20,d21}, [r5]!
+    vst2.8      {d20,d21}, [r8]!
+    vst2.8      {d20,d21}, [r10]!
+
+    vst2.8      {d22,d23}, [r2], r11
+    vst2.8      {d22,d23}, [r5], r11
+    vst2.8      {d22,d23}, [r8], r11
+    vst2.8      {d22,d23}, [r10], r11
+
+    subs        r4, r4, #4
+
+
+    vst2.8      {d20,d21}, [r2]!
+    vst2.8      {d20,d21}, [r5]!
+    vst2.8      {d20,d21}, [r8]!
+    vst2.8      {d20,d21}, [r10]!
+
+    vst2.8      {d22,d23}, [r2], r11
+    vst2.8      {d22,d23}, [r5], r11
+    vst2.8      {d22,d23}, [r8], r11
+    vst2.8      {d22,d23}, [r10], r11
+
+    subs        r4, r4, #4
+
+    vst2.8      {d20,d21}, [r2]!
+    vst2.8      {d20,d21}, [r5]!
+    vst2.8      {d20,d21}, [r8]!
+    vst2.8      {d20,d21}, [r10]!
+
+    vst2.8      {d22,d23}, [r2], r11
+    vst2.8      {d22,d23}, [r5], r11
+    vst2.8      {d22,d23}, [r8], r11
+    vst2.8      {d22,d23}, [r10], r11
+
+    subs        r4, r4, #4
+    bne         kernel_copy_16
+
+    b           end_func
+
+blk_8:
+
+    add         r5, r5, #2                  @2nt+2
+    add         r6, r0, r5                  @&src[2nt+1]
+
+    add         r5, r2, r3                  @pu1_dst + dst_strd
+    vld2.8      {d20,d21}, [r6]!            @16 loads (col 0:15)
+    add         r8, r5, r3
+
+    add         r10, r8, r3
+    vld2.8      {d22,d23}, [r6]             @16 loads (col 16:31)
+
+    lsl         r11,r3,#2
+
+    vst2.8      {d20,d21}, [r2],r11
+    vst2.8      {d20,d21}, [r5],r11
+    vst2.8      {d20,d21}, [r8],r11
+    vst2.8      {d20,d21}, [r10],r11
+
+    vst2.8      {d20,d21}, [r2]
+    vst2.8      {d20,d21}, [r5]
+    vst2.8      {d20,d21}, [r8]
+    vst2.8      {d20,d21}, [r10]
+
+    subs        r4, r4, #8
+    beq         end_func
+
+blk_4:
+
+    @lsl        r5, r4, #2          @4nt
+    add         r5, r5, #2                  @2nt+2
+    add         r6, r0, r5                  @&src[2nt+1]
+
+    vld1.8      {d0},[r6]
+    add         r5, r2, r3                  @pu1_dst + dst_strd
+
+    vst1.8      {d0},[r2]
+    add         r8, r5, r3
+    vst1.8      {d0},[r5]
+    add         r10, r8, r3
+    vst1.8      {d0},[r8]
+    vst1.8      {d0},[r10]
+
+
+
+end_func:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
diff --git a/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
new file mode 100644
index 0000000..6c882cf
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
@@ -0,0 +1,616 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_filters_chroma_mode_11_to_17.s
+@*
+@* @brief
+@*  contains function definitions for intra prediction chroma mode 11 to 17
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct.
+@*
+@* @author
+@*  akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    chroma intra prediction filter for angular modes 11 to 17
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  intra prediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_mode_11_to_17(uword8* pu1_ref,
+@                               word32 src_strd,
+@                               uword8* pu1_dst,
+@                               word32 dst_strd,
+@                               word32 nt,
+@                               word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+
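+@/**
+@* Hedged C model of the per-sample arithmetic below (identifiers
+@* illustrative; the NEON code evaluates eight such taps at a time via
+@* vtbl lookups).  The angle accumulator is Q5 fixed point, and chroma
+@* is UV-interleaved so reference offsets are doubled:
+@*
+@*     pos   = (col + 1) * intra_pred_ang;
+@*     idx   = pos >> 5;
+@*     fract = pos & 31;
+@*     pred  = (ref[2 * idx]         * (32 - fract)
+@*              + ref[2 * (idx + 1)] * fract + 16) >> 5;
+@*/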
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode_11_to_17_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_chroma
+.extern idx_neg_idx_chroma_11_17
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl2 - 8
+
+idx_neg_idx_chroma_11_17_addr:
+.long idx_neg_idx_chroma_11_17 - ulbl3 - 8
+
+col_for_intra_chroma_addr_1:
+.long col_for_intra_chroma - ulbl4 - 8
+
+col_for_intra_chroma_addr_2:
+.long col_for_intra_chroma - ulbl5 - 8
+
+col_for_intra_chroma_addr_3:
+.long col_for_intra_chroma - ulbl6 - 8
+
+.type ihevc_intra_pred_chroma_mode_11_to_17_a9q, %function
+
+ihevc_intra_pred_chroma_mode_11_to_17_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    ldr         r7, gai4_ihevc_ang_table_addr
+ulbl1:
+    add         r7,r7,pc
+
+    ldr         r5,[sp,#44]                 @mode (11 to 17)
+    ldr         r8, gai4_ihevc_inv_ang_table_addr
+ulbl2:
+    add         r8,r8,pc
+
+    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
+    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table[mode - 11]
+    sub         r8, r8, #44
+
+    ldr         r7, [r7]                    @intra_pred_ang
+    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 2]
+
+    ldr         r8, [r8]                    @inv_ang
+    add         r6, sp, r4, lsl #1          @ref_temp + 2 * nt
+
+    mul         r9, r4, r7                  @nt*intra_pred_ang
+
+    sub         r6, r6, #2                  @ref_temp + 2*nt - 2
+
+    add         r1, r0, r4, lsl #2          @r1 = &src[4nt]
+    vdup.8      d30, r7                     @intra_pred_ang
+
+    mov         r7, r4
+
+    sub         r1,r1,#6                    @address calculation for copying 4 halfwords
+
+    asr         r9, r9, #5
+
+    vld1.8      d0,[r1]
+    vrev64.16   d0,d0
+    vst1.8      d0,[r6]!
+
+    sub         r1,#8
+
+    subs        r7, r7, #4
+    addeq       r1,#8
+    beq         end_loop_copy
+    subs        r7,r7,#4
+    beq         loop_copy_8
+    subs        r7,r7,#8
+    beq         loop_copy_16
+
+loop_copy_32:
+    sub         r1,#24
+    vld1.8      {d0,d1,d2,d3},[r1]
+
+    sub         r1,#24
+    vld1.8      {d4,d5,d6},[r1]             @load the remaining 24 bytes
+
+    vrev64.16   d6,d6
+    vrev64.16   d5,d5
+    vrev64.16   d4,d4
+    vrev64.16   d3,d3
+    vrev64.16   d2,d2
+    vrev64.16   d1,d1
+    vrev64.16   d0,d0
+
+    vst1.8      d6,[r6]!
+    vst1.8      d5,[r6]!
+    vst1.8      d4,[r6]!
+    vst1.8      d3,[r6]!
+    vst1.8      d2,[r6]!
+    vst1.8      d1,[r6]!
+    vst1.8      d0,[r6]!
+
+    b           end_loop_copy
+
+loop_copy_16:
+    sub         r1,#16
+    vld1.8      {d0,d1,d2},[r1]
+
+    vrev64.16   d2,d2
+    vrev64.16   d1,d1
+    vrev64.16   d0,d0
+
+    vst1.8      d2,[r6]!
+    vst1.8      d1,[r6]!
+    vst1.8      d0,[r6]!
+
+    b           end_loop_copy
+loop_copy_8:
+    vld1.8      d0,[r1]
+    vrev64.16   d0,d0
+    vst1.8      d0,[r6]!
+end_loop_copy:
+    sub         r1,#2
+
+    ldrh        r11, [r1], #-2
+    strh        r11, [r6], #2
+
+    cmp         r9, #-1
+    bge         prologue_8_16_32
+
+    add         r6, sp, r4, lsl #1          @ref_temp + 2 * nt
+    sub         r6, r6, #4                  @ref_temp + 2 * nt - 2 - 2
+
+    mov         r12, #0xffffffff
+
+    rsb         r9, r9, r12                 @count to take care of ref_idx
+
+    add         r1, r0, r4, lsl #2          @r1 = &src[4nt]
+
+    mov         r7, #128                    @inv_ang_sum
+
+loop_copy_ref_idx:
+
+    add         r7, r7, r8                  @inv_ang_sum += inv_ang
+
+    mov         r0,r7, lsr #8
+    mov         r0,r0, lsl #1
+
+    ldrh        r11, [r1, r0]
+    strh        r11, [r6], #-2
+
+    subs        r9, r9, #1
+
+    bne         loop_copy_ref_idx
+
+prologue_8_16_32:
+
+    ldr         r14, col_for_intra_chroma_addr_1
+ulbl4:
+    add         r14,r14,pc
+
+    lsr         r10, r4, #3
+    vld1.8      d31, [r14]!
+    mul         r10, r4, r10                @block counter (dec by #4)
+
+    mov         r11, r4, lsl #1             @col counter to be inc/dec by #8
+    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
+
+    sub         r7, r5, #11
+    ldr         r12, idx_neg_idx_chroma_11_17_addr @load least idx table
+ulbl3:
+    add         r12,r12,pc
+
+    add         r12, r12, r7, lsl #4
+    mov         r8, r12
+
+    mov         r7, #8
+    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3
+
+    ldr         r9, [r8]
+    mov         r9,r9,lsl #1
+    add         r1, sp, r4, lsl #1          @ref_temp + 2nt
+
+    vmovn.s16   d6, q11
+    vdup.8      d26, r9                     @least idx added to final idx values
+    sub         r1, r1, #2                  @ref_temp + 2nt - 2
+
+    add         r6, r1, r9
+
+    vld1.8      {d0,d1,d2,d3}, [r6]         @stores the 32 values reqd based on indices values (from least idx)
+    vshr.s16    q11, q11, #5
+
+@   mov     r0, #31
+    vmov.i8     d29, #31                    @contains #31 for vand operation
+
+@   mov     r0, #32
+    vmov.i8     d28, #32
+
+    vqmovn.s16  d8, q11
+    vshl.s8     d8, d8, #1                  @ 2 * idx
+
+    vand        d6, d6, d29                 @fract values in d6
+
+@   mov     r0, #2
+    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1
+
+    mov         r0,#0x100                   @ idx value for v is +1 of u
+    vdup.u16    d27,r0
+    vadd.u8     d27,d27,d29
+    mov         r0,#0
+
+    vadd.s8     d8, d8, d27                 @ref_main_idx (add row)
+    vsub.s8     d8, d8, d26                 @ref_main_idx (row 0)
+    vadd.s8     d9, d8, d29                 @ref_main_idx + 1 (row 0)
+    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
+    vsub.s8     d7, d28, d6                 @32-fract
+
+    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
+    vadd.s8     d4, d8, d29                 @ref_main_idx (row 1)
+    vadd.s8     d5, d9, d29                 @ref_main_idx + 1 (row 1)
+
+@   mov     r0, #4              @ 2 *(row * 2 )
+    vmov.i8     d29, #4
+
+    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
+    vmull.u8    q12, d12, d7                @mul (row 0)
+    vmlal.u8    q12, d13, d6                @mul (row 0)
+
+    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
+    vadd.s8     d8, d8, d29                 @ref_main_idx (row 2)
+    vadd.s8     d9, d9, d29                 @ref_main_idx + 1 (row 2)
+
+    vrshrn.i16  d24, q12, #5                @round shft (row 0)
+
+    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
+    vmull.u8    q11, d16, d7                @mul (row 1)
+    vmlal.u8    q11, d17, d6                @mul (row 1)
+
+    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
+    vadd.s8     d4, d4, d29                 @ref_main_idx (row 3)
+    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 3)
+
+    vst1.8      d24, [r2], r3               @st (row 0)
+    vrshrn.i16  d22, q11, #5                @round shft (row 1)
+
+    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
+    vmull.u8    q10, d14, d7                @mul (row 2)
+    vmlal.u8    q10, d15, d6                @mul (row 2)
+
+    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
+    vadd.s8     d8, d8, d29                 @ref_main_idx (row 4)
+    vadd.s8     d9, d9, d29                 @ref_main_idx + 1 (row 4)
+
+    vst1.8      d22, [r2], r3               @st (row 1)
+    vrshrn.i16  d20, q10, #5                @round shft (row 2)
+
+    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
+    vmull.u8    q9, d10, d7                 @mul (row 3)
+    vmlal.u8    q9, d11, d6                 @mul (row 3)
+
+    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
+    vadd.s8     d4, d4, d29                 @ref_main_idx (row 5)
+    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 5)
+
+    vst1.8      d20, [r2], r3               @st (row 2)
+    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
+
+    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
+    vmull.u8    q12, d12, d7                @mul (row 4)
+    vmlal.u8    q12, d13, d6                @mul (row 4)
+
+    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
+    vadd.s8     d8, d8, d29                 @ref_main_idx (row 6)
+    vadd.s8     d9, d9, d29                 @ref_main_idx + 1 (row 6)
+
+    vst1.8      d18, [r2], r3               @st (row 3)
+    cmp         r4,#4
+    beq         end_func
+    vrshrn.i16  d24, q12, #5                @round shft (row 4)
+
+    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
+    vmull.u8    q11, d16, d7                @mul (row 5)
+    vmlal.u8    q11, d17, d6                @mul (row 5)
+
+    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
+    vadd.s8     d4, d4, d29                 @ref_main_idx (row 7)
+    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 7)
+
+    vst1.8      d24, [r2], r3               @st (row 4)
+    vrshrn.i16  d22, q11, #5                @round shft (row 5)
+
+    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vst1.8      d22, [r2], r3               @st (row 5)
+    vrshrn.i16  d20, q10, #5                @round shft (row 6)
+    vrshrn.i16  d18, q9, #5                 @round shft (row 7)
+
+    vst1.8      d20, [r2], r3               @st (row 6)
+
+    subs        r10, r10, #4                @subtract 4 and go to end if 8x8
+
+    vst1.8      d18, [r2], r3               @st (row 7)
+
+    beq         end_func
+
+    subs        r11, r11, #8
+    addgt       r8, r8, #4
+    addgt       r2, r2, r7
+    movle       r8, r12
+    suble       r2, r2, r4
+    addle       r2, r2, #8
+    movle       r11, r4, lsl #1
+    ldrle       r14, col_for_intra_chroma_addr_2
+ulbl5:
+    addle       r14,r14,pc
+    addle       r0, r0, #8
+
+    vld1.8      d31, [r14]!
+    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
+    vmovn.s16   d10, q6
+    vshr.s16    q6, q6, #5
+    vqmovn.s16  d11, q6
+    vshl.s8     d11, d11, #1
+    orr         r5,r0,r0, lsl#8
+    add         r5,#0x002
+    add         r5,#0x300
+    vdup.u16    d27, r5                     @row value inc or reset accordingly
+    ldr         r9, [r8]
+    mov         r9,r9,lsl #1
+    add         r9, r9, r0, lsl #1
+@   sub     r9, r9, #1
+    vdup.8      d26, r9
+    vadd.s8     d8, d27, d11                @ref_main_idx (add row)
+    mov         r5,r2
+
+@   sub     r4,r4,#8
+
+kernel_8_16_32:
+    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1
+
+    vsub.s8     d8, d8, d26                 @ref_main_idx
+    vmov        d26,d10
+
+    subs        r11, r11, #8
+    add         r6, r1, r9
+    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
+    vadd.s8     d9, d29, d8                 @ref_main_idx + 1
+
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    addle       r0, r0, #8
+    addgt       r8, r8, #4
+    vld1.8      {d0,d1,d2,d3}, [r6]         @stores the 32 values reqd based on indices values (from least idx)
+
+    vst1.8      d24, [r5], r3               @st (row 4)
+    vrshrn.i16  d24, q11, #5                @round shft (row 5)
+
+    movle       r8, r12
+    orr         r9,r0,r0, lsl#8
+    mov         r9,r9,lsl #1
+    add         r9,#0x002
+    add         r9,#0x300
+    vdup.u16    d27, r9                     @row value inc or reset accordingly
+
+    ldrle       r14, col_for_intra_chroma_addr_3
+ulbl6:
+    addle       r14,r14,pc
+
+    vadd.s8     d4, d29, d8                 @ref_main_idx (row 1)
+    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
+    vadd.s8     d5, d29, d9                 @ref_main_idx + 1 (row 1)
+
+    vmov.i8     d29, #31                    @contains #31 for vand operation
+
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vld1.8      d31, [r14]!
+    vand        d6, d29, d26                @fract values in d6
+
+    vmov.i8     d29, #4                     @contains #4 to step the row index
+
+    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
+    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)
+
+    vadd.s8     d8, d29, d8                 @ref_main_idx (row 2)
+    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
+    vadd.s8     d9, d29, d9                 @ref_main_idx + 1 (row 2)
+
+    movle       r11, r4,lsl #1
+    ldr         r9, [r8]
+    mov         r9,r9,lsl #1
+    vsub.s8     d7, d28, d6                 @32-fract
+
+    vmull.u8    q12, d12, d7                @mul (row 0)
+    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
+    vmlal.u8    q12, d13, d6                @mul (row 0)
+
+    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
+    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)
+
+    vadd.s8     d4, d4, d29                 @ref_main_idx (row 3)
+    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
+    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 3)
+
+    vmull.u8    q11, d16, d7                @mul (row 1)
+    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
+    vmlal.u8    q11, d17, d6                @mul (row 1)
+
+    vrshrn.i16  d24, q12, #5                @round shft (row 0)
+    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)
+
+    vadd.s8     d8, d8, d29                 @ref_main_idx (row 4)
+    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
+    vadd.s8     d9, d9, d29                 @ref_main_idx + 1 (row 4)
+
+    vmull.u8    q10, d14, d7                @mul (row 2)
+    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
+    vmlal.u8    q10, d15, d6                @mul (row 2)
+
+    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
+    add         r5,r2,r3,lsl#2
+    add         r9, r9, r0, lsl #1
+
+
+    vst1.8      d24, [r2], r3               @st (row 0)
+    vrshrn.i16  d22, q11, #5                @round shft (row 1)
+
+    vadd.s8     d4, d4, d29                 @ref_main_idx (row 5)
+    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
+    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 5)
+
+    vmull.u8    q9, d10, d7                 @mul (row 3)
+    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
+    vmlal.u8    q9, d11, d6                 @mul (row 3)
+
+    vst1.8      d22, [r2], r3               @st (row 1)
+    vrshrn.i16  d20, q10, #5                @round shft (row 2)
+
+    vmovn.s16   d10, q7
+    vshr.s16    q7, q7, #5
+
+    vadd.s8     d8, d8, d29                 @ref_main_idx (row 6)
+    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
+    vadd.s8     d9, d9, d29                 @ref_main_idx + 1 (row 6)
+
+    vmull.u8    q12, d12, d7                @mul (row 4)
+    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
+    vmlal.u8    q12, d13, d6                @mul (row 4)
+
+    vst1.8      d20, [r2], r3               @st (row 2)
+    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
+
+@   sub     r9, r9, #1
+    vqmovn.s16  d11, q7
+
+    vadd.s8     d4, d4, d29                 @ref_main_idx (row 7)
+    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
+    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 7)
+
+    vshl.u8     d11,#1
+
+    vmull.u8    q11, d16, d7                @mul (row 5)
+    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
+    vmlal.u8    q11, d17, d6                @mul (row 5)
+
+    vadd.s8     d8, d27, d11                @ref_main_idx (add row)
+    vdup.8      d26, r9
+
+    vst1.8      d18, [r2], r3               @st (row 3)
+    vrshrn.i16  d24, q12, #5                @round shft (row 4)
+
+
+    add         r2,r3, lsl #2
+    addgt       r2, r7, r2
+    suble       r2, r2, r4, lsl #1
+    addle       r2,r2,#8
+
+    subs        r10, r10, #4                @subtract 4 and go to end if 8x8
+
+    bne         kernel_8_16_32
+epil_8_16_32:
+
+    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
+
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    vst1.8      d24, [r5], r3               @st (row 4)
+    vrshrn.i16  d24, q11, #5                @round shft (row 5)
+
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
+    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)
+
+    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
+    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)
+
+    vst1.8      d18, [r5], r3               @st (row 7)
+
+end_func:
+    add         sp, sp, #132
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
new file mode 100644
index 0000000..2ede914
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
@@ -0,0 +1,571 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_filters_chroma_mode_19_to_25.s
+@*
+@* @brief
+@*  contains function definitions for intra prediction chroma modes 19 to 25.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct.
+@*
+@* @author
+@*  naveen sr
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    chroma intra prediction filter for angular modes 19 to 25
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  intra prediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_mode_19_to_25(uword8* pu1_ref,
+@                               word32 src_strd,
+@                               uword8* pu1_dst,
+@                               word32 dst_strd,
+@                               word32 nt,
+@                               word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+
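+@/**
+@* For the negative angles in modes 19 to 25 the left reference is
+@* projected onto the top (main) reference with the inverse angle before
+@* the two-tap interpolation runs.  A hedged C sketch of that projection,
+@* per sample pair (identifiers illustrative, not the decoder's C path):
+@*
+@*     inv_ang_sum = 128;
+@*     for (k = -1; k > (nt * intra_pred_ang) >> 5; k--)
+@*     {
+@*         inv_ang_sum += inv_ang;
+@*         ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
+@*     }
+@*/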
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode_19_to_25_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern gau1_ihevc_planar_factor
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl1 - 8
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor - ulbl2 - 8
+
+gai4_ihevc_ang_table_addr_1:
+.long gai4_ihevc_ang_table - ulbl3 - 8
+
+gai4_ihevc_ang_table_addr_2:
+.long gai4_ihevc_ang_table - ulbl4 - 8
+
+.type ihevc_intra_pred_chroma_mode_19_to_25_a9q, %function
+
+ihevc_intra_pred_chroma_mode_19_to_25_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    ldr         r7, gai4_ihevc_ang_table_addr_1
+ulbl3:
+    add         r7,r7,pc
+
+    ldr         r5,[sp,#44]                 @mode (19 to 25)
+    ldr         r8, gai4_ihevc_inv_ang_table_addr
+ulbl1:
+    add         r8,r8,pc
+
+    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
+    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table
+    sub         r8, r8, #48                 @gai4_ihevc_inv_ang_table[mode - 12]
+
+    ldr         r7, [r7]                    @intra_pred_ang
+    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 2]
+
+    ldr         r8, [r8]                    @inv_ang
+    add         r6, sp, r4 , lsl #1         @ref_temp + 2 * nt
+
+    mul         r9, r4, r7                  @nt*intra_pred_ang
+
+    sub         r6, r6, #2                  @ref_temp + 2*nt - 2
+
+    add         r1, r0, r4, lsl #2          @r1 = &src[4nt]
+    vdup.8      d30, r7                     @intra_pred_ang
+
+    mov         r7, r4
+
+    asr         r9, r9, #5
+
+    vld1.32     d0,[r1]!                    @ pu1_ref[two_nt + k]
+
+    vst1.32     d0,[r6]!                    @ref_temp[k + nt - 1] = pu1_ref[two_nt + k]@
+
+    subs        r7, r7, #4
+    beq         end_loop_copy
+    subs        r7,r7,#4
+    beq         loop_copy_8
+    subs        r7,r7,#8
+    beq         loop_copy_16
+
+loop_copy_32:
+    vld1.8      {d0,d1,d2,d3},[r1]!
+    vld1.8      {d4,d5,d6},[r1]!
+
+    vst1.8      {d0,d1,d2,d3},[r6]!
+
+
+    vst1.8      {d4,d5,d6},[r6]!
+    b           end_loop_copy
+
+loop_copy_16:
+    vld1.8      {d0,d1,d2},[r1]!
+    vst1.8      {d0,d1,d2},[r6]!
+
+    b           end_loop_copy
+
+loop_copy_8:
+    vld1.8      d0,[r1]!
+    vst1.8      d0,[r6]!
+
+end_loop_copy:
+
+    ldrh        r11, [r1]
+    strh        r11, [r6]
+
+    cmp         r9, #-1
+    bge         linear_filtering
+
+    add         r6, sp, r4 ,lsl #1          @ref_temp + 2 * nt
+    sub         r6, r6, #4                  @ref_temp + 2 * nt - 2 - 2
+
+    mov         r12, #0xffffffff
+
+    rsb         r9, r9, r12                 @count to take care of ref_idx
+
+    add         r1, r0, r4, lsl #2          @r1 = &src[4nt]
+
+    mov         r7, #128                    @inv_ang_sum
+
+loop_copy_ref_idx:
+
+    add         r7, r7, r8                  @inv_ang_sum += inv_ang
+    mov         r0,r7, lsr #8
+    mov         r0,r0, lsl #1
+    ldrh        r11, [r1, -r0]
+    strh        r11, [r6], #-2
+
+    subs        r9, r9, #1
+
+    bne         loop_copy_ref_idx
+
+
+linear_filtering:
+@   after copy
+@   below code is taken from mode 27 to 33 and modified
+
+    ldr         r6,gai4_ihevc_ang_table_addr_2 @loads word32 gai4_ihevc_ang_table[35]
+ulbl4:
+    add         r6,r6,pc
+
+    lsl         r7,r4,#2                    @four_nt
+
+    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
+    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
+    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
+ulbl2:
+    add         r1,r1,pc
+    add         r6,r1,#1
+
+    add         r8, sp, r4, lsl #1          @ref_temp + 2 * nt
+    sub         r8,#2                       @ref_temp + 2*nt -2
+
+    mov         lr,#0                       @row
+    mov         r12,r4
+    lsl         r4,r4,#1
+
+core_loop_8:
+    add         r8,r8,#2                    @pu1_ref_main_idx += (four_nt + 1)
+    vdup.8      d0,r9                       @intra_pred_ang
+    mov         r12,r4,lsr #4               @divide by 8
+
+    vmov.i8     d1,#32
+    mul         r7,r4,r12
+
+    vmov.i16    q3,#31
+
+
+    mov         r1,r8
+
+    mov         r5,r4
+    mov         r11,#2
+
+prologue:
+    vld1.8      {d3},[r6]                   @loads the row value
+    vmull.s8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+    vmovn.i16   d4,q2
+    vshrn.s16   d5,q1,#5                    @idx = pos >> 5
+    vshl.s8     d5,d5,#1
+
+    vdup.8      d31,d4[0]
+    add         r0,r2,r3
+
+    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register
+@   lsl         lr,lr,#1
+
+    vdup.8      d29,d4[1]                   @(ii)
+    sbfx        r9,lr,#0,#8
+
+    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
+    sbfx        r9,lr,#8,#8
+
+    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
+    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]
+
+    sbfx        r9,lr,#16,#8
+    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vdup.8      d27,d4[2]                   @(iii)
+    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
+    sbfx        r9,lr,#24,#8
+
+    vdup.8      d25,d4[3]                   @(iv)
+    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
+    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
+    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
+    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
+
+    vdup.8      d31,d4[4]                   @(v)
+    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vmov.u32    lr,d5[1]                    @extract idx to the r register
+    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+@   lsl         lr,lr,#1
+
+    vst1.8      {d10},[r2]!                 @(i row)
+    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sbfx        r9,lr,#0,#8
+    vdup.8      d29,d4[5]                   @(vi)
+    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
+    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
+
+    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    sbfx        r9,lr,#8,#8
+
+    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
+    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d14},[r0],r3               @(ii)
+    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
+    vdup.8      d27,d4[6]                   @(vii)
+
+    sbfx        r9,lr,#16,#8
+    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d18},[r0],r3               @(iii)
+    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vdup.8      d25,d4[7]                   @(viii)
+    sbfx        r9,lr,#24,#8
+
+    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
+    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)
+
+    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
+    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
+    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subs        r7,r7,#8
+
+    vst1.8      {d22},[r0],r3               @(iv)
+    cmp         r4,#8                       @ go to end if 4x4
+    beq         end_loops
+
+    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
+    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
+    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    addgt       r8,r8,#8
+    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subgt       r4,r4,#8
+
+    vst1.8      {d10},[r0],r3               @(v)
+    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+    beq         epilogue
+
+    vld1.8      {d5},[r6]                   @loads the row value
+    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+    vmovn.i16   d4,q2
+    vshrn.s16   d3,q1,#5                    @idx = pos >> 5
+    vshl.s8     d3,d3,#1
+    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
+@   lsl         lr,lr,#1
+    sbfx        r9,lr,#0,#8
+    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+    vdup.8      d31,d4[0]
+    subs        r4,r4,#8
+    sbfx        r9,lr,#8,#8
+
+    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
+    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
+
+    addle       r6,r6,#8                    @increment the row value
+    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
+    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d5},[r6]                   @loads the row value
+    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vdup.8      d29,d4[1]                   @(ii)
+    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sbfx        r9,lr,#16,#8
+
+    vst1.8      {d14},[r0],r3               @(vi)
+    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)
+
+    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    sbfx        r9,lr,#24,#8
+    movle       r4,r5                       @reload nt
+
+    vmov.u32    lr,d3[1]                    @extract idx to the r register
+    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vdup.8      d27,d4[2]                   @(iii)
+    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
+    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
+    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vst1.8      {d18},[r0],r3               @(vii)
+    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
+    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vdup.8      d25,d4[3]                   @(iv)
+    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
+
+    vst1.8      {d22},[r0]                  @(viii)
+    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
+    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+@   lsl         lr,lr,#1
+
+    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
+    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    sbfx        r9,lr,#0,#8
+    add         r0,r2,r3
+
+    vdup.8      d31,d4[4]                   @(v)
+    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
+    sbfx        r9,lr,#8,#8
+
+    vst1.8      {d10},[r2]!                 @(i)
+    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
+
+    vdup.8      d29,d4[5]                   @(vi)
+    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vdup.8      d27,d4[6]                   @(vii)
+    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
+    sbfx        r9,lr,#16,#8
+
+    vdup.8      d25,d4[7]                   @(viii)
+    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+
+    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
+    vshrn.s16   d3,q1,#5                    @idx = pos >> 5
+
+    vst1.8      {d14},[r0],r3               @(ii)
+    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
+    sbfx        r9,lr,#24,#8
+
+    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
+    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
+
+    vshl.s8     d3,d3,#1
+
+    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
+    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
+    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
+    movle       r8,r1                       @reload the source to pu1_src+2nt
+
+    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
+    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)
+
+    vst1.8      {d18},[r0],r3               @(iii)
+    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
+    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
+    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
+    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
+
+    addgt       r8,r8,#8                    @increment the source next set 8 columns in same row
+    lslle       r12,r3,#3
+    suble       r12,r12,r5
+
+    vst1.8      {d22},[r0],r3               @(iv)
+    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vst1.8      {d10},[r0],r3               @(v)
+    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    addle       r2,r2,r12                   @increment the dst pointer to 8*dst_strd - nt
+    sbfx        r9,lr,#0,#8
+
+    vmovn.i16   d4,q2
+    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+@   lsl         lr,lr,#1
+
+    subs        r7,r7,#8
+    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]
+
+    bne         kernel_8_rows
+
+epilogue:
+    vst1.8      {d14},[r0],r3               @(vi)
+    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
+    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d18},[r0],r3               @(vii)
+    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vst1.8      {d22},[r0],r3               @(viii)
+    b           end_loops
+
+core_loop_4:
+
+end_loops:
+    add         sp, sp, #132
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
new file mode 100644
index 0000000..93495f8
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
@@ -0,0 +1,693 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_filters_luma_mode_11_to_17.s
+@*
+@* @brief
+@*  contains function definitions for intra prediction luma modes 11 to 17.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct.
+@*
+@* @author
+@*  akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    luma intra prediction filter for angular modes 11 to 17
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  intra prediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_mode_11_to_17(uword8* pu1_ref,
+@                               word32 src_strd,
+@                               uword8* pu1_dst,
+@                               word32 dst_strd,
+@                               word32 nt,
+@                               word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+
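+@/**
+@* Hedged C model of one 8-sample row of the vtbl-based kernel below
+@* (identifiers illustrative): per-column indices are rebased against
+@* the least index of the current 8x8 block so that a single 16-byte
+@* lookup window covers every vtbl access.
+@*
+@*     for (col = 0; col < 8; col++)
+@*     {
+@*         int pos   = (col + 1) * intra_pred_ang;
+@*         int fract = pos & 31;
+@*         int idx   = (pos >> 5) + (row + 1) - least_idx;
+@*         pu1_dst[col] = (window[idx]       * (32 - fract)
+@*                         + window[idx + 1] * fract + 16) >> 5;
+@*     }
+@*/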
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode_11_to_17_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_luma
+.extern idx_neg_idx_11_17
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl2 - 8
+
+idx_neg_idx_11_17_addr_1:
+.long idx_neg_idx_11_17 - ulbl3 - 8
+
+idx_neg_idx_11_17_addr_2:
+.long idx_neg_idx_11_17 - ulbl4 - 8
+
+col_for_intra_luma_addr_1:
+.long col_for_intra_luma - ulbl_1 - 8
+
+col_for_intra_luma_addr_2:
+.long col_for_intra_luma - ulbl_2 - 8
+
+col_for_intra_luma_addr_3:
+.long col_for_intra_luma - ulbl_3 - 8
+
+col_for_intra_luma_addr_4:
+.long col_for_intra_luma - ulbl_4 - 8
+
+.type ihevc_intra_pred_luma_mode_11_to_17_a9q, %function
+
+ihevc_intra_pred_luma_mode_11_to_17_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    ldr         r7, gai4_ihevc_ang_table_addr
+ulbl1:
+    add         r7,r7,pc
+
+    ldr         r5,[sp,#44]                 @mode (11 to 17)
+    ldr         r8, gai4_ihevc_inv_ang_table_addr
+ulbl2:
+    add         r8,r8,pc
+
+    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
+    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table[mode - 11]
+    sub         r8, r8, #44
+
+    ldr         r7, [r7]                    @intra_pred_ang
+    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 1]
+
+    ldr         r8, [r8]                    @inv_ang
+    add         r6, sp, r4                  @ref_temp + nt
+
+    mul         r9, r4, r7                  @nt*intra_pred_ang
+
+    sub         r6, r6, #1                  @ref_temp + nt - 1
+
+    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]
+    vdup.8      d30, r7                     @intra_pred_ang
+
+    mov         r7, r4
+
+    ldrb        r11, [r1], #-1
+
+    asr         r9, r9, #5
+
+    ldrb        r12, [r1], #-1
+    ldrb        r10, [r1], #-1
+    ldrb        r14, [r1], #-1
+
+    strb        r11, [r6], #1
+    strb        r12, [r6], #1
+    strb        r10, [r6], #1
+    strb        r14, [r6], #1
+
+    subs        r7, r7, #4
+    beq         end_loop_copy
+
+    sub         r6,#4
+    sub         r1,#3
+
+    subs        r7,r7,#4
+    beq         loop_copy_8
+    subs        r7,r7,#8
+    beq         loop_copy_16
+
+loop_copy_32:
+    vld1.8      d0,[r1]
+    sub         r1,#8
+    vld1.8      d1,[r1]
+    sub         r1,#8
+    vld1.8      d2,[r1]
+    sub         r1,#8
+    vld1.8      d3,[r1]
+
+    vrev64.8    d0,d0
+    vrev64.8    d1,d1
+    vst1.8      d0,[r6]!
+    vrev64.8    d2,d2
+    vst1.8      d1,[r6]!
+    vrev64.8    d3,d3
+    vst1.8      d2,[r6]!
+    vst1.8      d3,[r6]!
+    sub         r1,#1
+    b           end_loop_copy
+
+loop_copy_16:
+    vld1.8      d0,[r1]
+    sub         r1,#8
+    vld1.8      d1,[r1]
+
+    vrev64.8    d0,d0
+    vrev64.8    d1,d1
+
+    vst1.8      d0,[r6]!
+    vst1.8      d1,[r6]!
+    sub         r1,#1
+    b           end_loop_copy
+
+loop_copy_8:
+    vld1.8      d0,[r1]
+    vrev64.8    d0,d0
+    vst1.8      d0,[r6]!
+    sub         r1,#1
+end_loop_copy:
+
+    ldrb        r11, [r1], #-1
+    strb        r11, [r6], #1
+
+    cmp         r9, #-1
+    bge         prologue_8_16_32
+
+    add         r6, sp, r4                  @ref_temp + nt
+    sub         r6, r6, #2                  @ref_temp + nt - 2
+
+    mov         r12, #0xffffffff
+
+    rsb         r9, r9, r12                 @count to take care of ref_idx
+
+    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]
+
+    mov         r7, #128                    @inv_ang_sum
+
+loop_copy_ref_idx:
+
+    add         r7, r7, r8                  @inv_ang_sum += inv_ang
+
+    ldrb        r11, [r1, r7, lsr #8]
+    strb        r11, [r6], #-1
+
+    subs        r9, r9, #1
+
+    bne         loop_copy_ref_idx
+
+prologue_8_16_32:
+    cmp         r4, #4
+    beq         sz_4_proc
+    ldr         r14, col_for_intra_luma_addr_1
+ulbl_1:
+    add         r14,r14,pc
+
+    lsr         r10, r4, #3
+    vld1.8      d31, [r14]!
+    mul         r10, r4, r10                @block counter (dec by #8)
+
+    mov         r11, r4                     @col counter to be inc/dec by #8
+    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
+    mov         r0, #1
+
+    sub         r7, r5, #11
+    vdup.8      d2, r0                      @contains #1 for adding to get ref_main_idx + 1
+    ldr         r12, idx_neg_idx_11_17_addr_1 @load least idx table
+ulbl3:
+    add         r12,r12,pc
+
+    mov         r0, #2
+    vdup.8      d3, r0
+
+    add         r12, r12, r7, lsl #4
+    mov         r8, r12
+
+    mov         r7, #8
+    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3
+
+    ldr         r9, [r8]
+    add         r1, sp, r4                  @ref_temp + nt
+
+    vmovn.s16   d6, q11
+    vdup.8      d26, r9                     @least idx added to final idx values
+    sub         r1, r1, #1                  @ref_temp + nt - 1
+
+    add         r6, r1, r9
+
+    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)
+    vshr.s16    q11, q11, #5
+
+    mov         r0, #31
+    vdup.8      d29, r0                     @contains #31 for vand operation
+
+    mov         r0, #32
+    vdup.8      d28, r0
+
+    vqmovn.s16  d8, q11
+
+    vand        d6, d6, d29                 @fract values in d6
+
+    mov         r0, #1
+    vdup.8      d27, r0                     @row value inc or reset accordingly
+
+    vadd.s8     d8, d8, d27                 @ref_main_idx (add row)
+    vsub.s8     d8, d8, d26                 @ref_main_idx (row 0)
+    vadd.s8     d9, d8, d2                  @ref_main_idx + 1 (row 0)
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
+    vsub.s8     d7, d28, d6                 @32-fract
+
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
+    vadd.s8     d4, d8, d2                  @ref_main_idx (row 1)
+    vadd.s8     d5, d9, d2                  @ref_main_idx + 1 (row 1)
+
+    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
+    vmull.u8    q12, d12, d7                @mul (row 0)
+    vmlal.u8    q12, d13, d6                @mul (row 0)
+
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
+    vadd.s8     d8, d8, d3                  @ref_main_idx (row 2)
+    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 2)
+
+    vrshrn.i16  d24, q12, #5                @round shft (row 0)
+
+    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
+    vmull.u8    q11, d16, d7                @mul (row 1)
+    vmlal.u8    q11, d17, d6                @mul (row 1)
+
+    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
+    vadd.s8     d4, d4, d3                  @ref_main_idx (row 3)
+    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 3)
+
+    vst1.8      d24, [r2], r3               @st (row 0)
+    vrshrn.i16  d22, q11, #5                @round shft (row 1)
+
+    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
+    vmull.u8    q10, d14, d7                @mul (row 2)
+    vmlal.u8    q10, d15, d6                @mul (row 2)
+
+    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
+    vadd.s8     d8, d8, d3                  @ref_main_idx (row 4)
+    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 4)
+
+    vst1.8      d22, [r2], r3               @st (row 1)
+    vrshrn.i16  d20, q10, #5                @round shft (row 2)
+
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
+    vmull.u8    q9, d10, d7                 @mul (row 3)
+    vmlal.u8    q9, d11, d6                 @mul (row 3)
+
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
+    vadd.s8     d4, d4, d3                  @ref_main_idx (row 5)
+    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 5)
+
+    vst1.8      d20, [r2], r3               @st (row 2)
+    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
+
+    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 5)
+    vmull.u8    q12, d12, d7                @mul (row 4)
+    vmlal.u8    q12, d13, d6                @mul (row 4)
+
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
+    vadd.s8     d8, d8, d3                  @ref_main_idx (row 6)
+    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 6)
+
+    vst1.8      d18, [r2], r3               @st (row 3)
+    vrshrn.i16  d24, q12, #5                @round shft (row 4)
+
+    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
+    vmull.u8    q11, d16, d7                @mul (row 5)
+    vmlal.u8    q11, d17, d6                @mul (row 5)
+
+    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
+    vadd.s8     d4, d4, d3                  @ref_main_idx (row 7)
+    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 7)
+
+    vst1.8      d24, [r2], r3               @st (row 4)
+    vrshrn.i16  d22, q11, #5                @round shft (row 5)
+
+    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vst1.8      d22, [r2], r3               @st (row 5)
+    vrshrn.i16  d20, q10, #5                @round shft (row 6)
+    vrshrn.i16  d18, q9, #5                 @round shft (row 7)
+
+    vst1.8      d20, [r2], r3               @st (row 6)
+
+    subs        r10, r10, #8                @subtract 8 and go to end if 8x8
+
+    vst1.8      d18, [r2], r3               @st (row 7)
+
+    beq         end_func
+
+    subs        r11, r11, #8
+    addgt       r8, r8, #4
+    addgt       r2, r2, r7
+    movle       r8, r12
+    suble       r2, r2, r4
+    addle       r2, r2, #8
+    movle       r11, r4
+    ldrle       r14, col_for_intra_luma_addr_2
+ulbl_2:
+    addle       r14,r14,pc
+    addle       r0, r0, #8
+
+    mov         r5,r2
+    vld1.8      d31, [r14]!
+    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
+    vmovn.s16   d10, q6
+    vshr.s16    q6, q6, #5
+    vqmovn.s16  d11, q6
+    vdup.8      d27, r0                     @row value inc or reset accordingly
+    ldr         r9, [r8]
+    add         r9, r0, r9
+    sub         r9, r9, #1
+    vdup.8      d26, r9
+    vadd.s8     d8, d27, d11                @ref_main_idx (add row)
+
+    sub         r4,r4,#8
+
+kernel_8_16_32:
+
+    vsub.s8     d8, d8, d26                 @ref_main_idx
+    vmov        d26,d10
+
+    subs        r11, r11, #8
+    add         r6, r1, r9
+    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
+    vadd.s8     d9, d2, d8                  @ref_main_idx + 1
+
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    addle       r0, r0, #8
+    addgt       r8, r8, #4
+    vld1.8      {d0,d1}, [r6]               @loads the 16 ref values needed for the vtbl lookups (from least idx)
+
+    vst1.8      d24, [r5], r3               @st (row 4)
+    vrshrn.i16  d24, q11, #5                @round shft (row 5)
+
+    ldrle       r14, col_for_intra_luma_addr_3
+ulbl_3:
+    addle       r14,r14,pc
+    movle       r8, r12
+    vdup.8      d27, r0                     @row value inc or reset accordingly
+
+    vadd.s8     d4, d2, d8                  @ref_main_idx (row 1)
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
+    vadd.s8     d5, d2, d9                  @ref_main_idx + 1 (row 1)
+
+
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vld1.8      d31, [r14]!
+    vand        d6, d29, d26                @fract = pos & 31
+
+    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
+    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)
+
+    vadd.s8     d8, d3, d8                  @ref_main_idx (row 2)
+    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
+    vadd.s8     d9, d3, d9                  @ref_main_idx + 1 (row 2)
+
+    addle       r11, r4, #8
+    ldr         r9, [r8]
+    vsub.s8     d7, d28, d6                 @32-fract
+
+    vmull.u8    q12, d12, d7                @mul (row 0)
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
+    vmlal.u8    q12, d13, d6                @mul (row 0)
+
+    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
+    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)
+
+    vadd.s8     d4, d4, d3                  @ref_main_idx (row 3)
+    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
+    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 3)
+
+    vmull.u8    q11, d16, d7                @mul (row 1)
+    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
+    vmlal.u8    q11, d17, d6                @mul (row 1)
+
+    vrshrn.i16  d24, q12, #5                @round shft (row 0)
+    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)
+
+    vadd.s8     d8, d8, d3                  @ref_main_idx (row 4)
+    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
+    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 4)
+
+    vmull.u8    q10, d14, d7                @mul (row 2)
+    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
+    vmlal.u8    q10, d15, d6                @mul (row 2)
+
+    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
+    add         r5,r2,r3,lsl#2
+    add         r9, r0, r9
+
+
+    vst1.8      d24, [r2], r3               @st (row 0)
+    vrshrn.i16  d22, q11, #5                @round shft (row 1)
+
+    vadd.s8     d4, d4, d3                  @ref_main_idx (row 5)
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
+    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 5)
+
+    vmull.u8    q9, d10, d7                 @mul (row 3)
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
+    vmlal.u8    q9, d11, d6                 @mul (row 3)
+
+    vst1.8      d22, [r2], r3               @st (row 1)
+    vrshrn.i16  d20, q10, #5                @round shft (row 2)
+
+    vmovn.s16   d10, q7
+    vshr.s16    q7, q7, #5
+
+    vadd.s8     d8, d8, d3                  @ref_main_idx (row 6)
+    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 5)
+    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 6)
+
+    vmull.u8    q12, d12, d7                @mul (row 4)
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
+    vmlal.u8    q12, d13, d6                @mul (row 4)
+
+    vst1.8      d20, [r2], r3               @st (row 2)
+    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
+
+    sub         r9, r9, #1
+    vqmovn.s16  d11, q7
+
+    vadd.s8     d4, d4, d3                  @ref_main_idx (row 7)
+    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
+    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 7)
+
+    vmull.u8    q11, d16, d7                @mul (row 5)
+    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
+    vmlal.u8    q11, d17, d6                @mul (row 5)
+
+    vadd.s8     d8, d27, d11                @ref_main_idx (add row)
+    vdup.8      d26, r9
+
+    vst1.8      d18, [r2], r3               @st (row 3)
+    vrshrn.i16  d24, q12, #5                @round shft (row 4)
+
+
+    add         r2,r3, lsl #2
+    addgt       r2, r7, r2
+    suble       r2, r2, r4
+
+    subs        r10, r10, #8                @subtract 8 and go to end if 8x8
+
+    bne         kernel_8_16_32
+epil_8_16_32:
+
+    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
+
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    vst1.8      d24, [r5], r3               @st (row 4)
+    vrshrn.i16  d24, q11, #5                @round shft (row 5)
+
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
+    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)
+
+    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
+    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)
+
+    vst1.8      d18, [r5], r3               @st (row 7)
+
+
+    b           end_func
+
+sz_4_proc:
+    ldr         r14, col_for_intra_luma_addr_4
+ulbl_4:
+    add         r14,r14,pc
+
+    vld1.8      d31, [r14]
+    mov         r12, #1
+
+    vdup.8      d2, r12                     @contains #1 for adding to get ref_main_idx + 1
+    mov         r0, #2
+
+    vdup.8      d3, r0
+    ldr         r12, idx_neg_idx_11_17_addr_2 @load least idx table
+ulbl4:
+    add         r12,r12,pc
+
+    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
+    sub         r7, r5, #11
+
+    add         r12, r12, r7, lsl #4
+    mov         r8, r12
+
+    ldr         r9, [r8]
+
+    vdup.8      d26, r9                     @least idx added to final idx values
+    add         r6, sp, r4                  @ref_temp + nt
+
+    sub         r6, r6, #1                  @ref_temp + nt - 1
+    vmovn.s16   d6, q11
+    add         r6, r6, r9
+
+    vld1.8      {d0,d1}, [r6]               @loads the 16 ref values needed for the vtbl lookups (from least idx)
+    mov         r0, #31
+
+    vdup.8      d29, r0                     @contains #31 for vand operation
+    mov         r1, #32
+
+    vdup.8      d28, r1
+
+    vshr.s16    q11, q11, #5
+    vqmovn.s16  d8, q11
+
+    vand        d6, d6, d29                 @fract = pos & 31
+    vsub.s8     d7, d28, d6                 @32-fract
+
+    vadd.s8     d8, d8, d2                  @ref_main_idx (add 1)
+    vsub.s8     d8, d8, d26                 @ref_main_idx
+    vadd.s8     d9, d8, d2                  @ref_main_idx + 1
+
+    vadd.s8     d4, d8, d2                  @row 1 ref_main_idx
+    vadd.s8     d5, d9, d2
+
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
+
+
+    vmull.u8    q12, d12, d7                @mul (row 0)
+    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
+    vmlal.u8    q12, d13, d6                @mul (row 0)
+
+    vadd.s8     d8, d8, d3                  @idx (row 2)
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
+    vadd.s8     d9, d9, d3                  @idx+1 (row 2)
+
+    vmull.u8    q11, d16, d7                @mul (row 1)
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 2)
+    vmlal.u8    q11, d17, d6                @mul (row 1)
+
+    vrshrn.i16  d24, q12, #5                @round shift (row 0)
+
+    vadd.s8     d4, d4, d3                  @idx (row 3)
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
+    vadd.s8     d5, d5, d3                  @idx+1 (row 3)
+
+    vmull.u8    q10, d12, d7                @mul (row 2)
+    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 3)
+    vmlal.u8    q10, d13, d6                @mul (row 2)
+
+    vst1.32     d24[0], [r2], r3            @st row 0
+    vrshrn.i16  d22, q11, #5                @round shift (row 1)
+
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
+
+    vmull.u8    q9, d16, d7                 @mul (row 3)
+    vmlal.u8    q9, d17, d6                 @mul (row 3)
+
+    vst1.32     d22[0], [r2], r3            @st row 1
+    vrshrn.i16  d20, q10, #5                @round shift (row 2)
+
+    vst1.32     d20[0], [r2], r3            @st row 2
+
+    vrshrn.i16  d18, q9, #5                 @round shift (row 3)
+
+    vst1.32     d18[0], [r2], r3            @st (row 3)
+
+end_func:
+    add         sp, sp, #132
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
new file mode 100644
index 0000000..af342bf
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
@@ -0,0 +1,653 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_luma_mode_19_to_25.s
+@*
+@* @brief
+@*  contains function definitions for luma intra prediction for modes 19
+@* to 25. functions are coded using neon intrinsics and can be compiled
+@* using rvct
+@*
+@* @author
+@*  naveen sr
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    luma intra prediction for angular modes 19 to 25
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_mode_19_to_25(uword8* pu1_ref,
+@                               word32 src_strd,
+@                               uword8* pu1_dst,
+@                               word32 dst_strd,
+@                               word32 nt,
+@                               word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
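+
+@ a rough C sketch of the core arithmetic implemented below (assuming the
+@ HM reference behaviour; names and offsets are illustrative):
+@   for(row = 0; row < nt; row++){
+@       pos   = (row + 1) * intra_pred_ang;
+@       idx   = pos >> 5;
+@       fract = pos & 31;
+@       for(col = 0; col < nt; col++)
+@           pu1_dst[row * dst_strd + col] =
+@               (ref_main[col + idx] * (32 - fract)
+@                + ref_main[col + idx + 1] * fract + 16) >> 5;
+@   }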
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode_19_to_25_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern gau1_ihevc_planar_factor
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl1 - 8
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor - ulbl2 - 8
+
+gai4_ihevc_ang_table_addr_1:
+.long gai4_ihevc_ang_table - ulbl_1 - 8
+
+gai4_ihevc_ang_table_addr_2:
+.long gai4_ihevc_ang_table - ulbl_2 - 8
+
+.type ihevc_intra_pred_luma_mode_19_to_25_a9q, %function
+
+ihevc_intra_pred_luma_mode_19_to_25_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    ldr         r7, gai4_ihevc_ang_table_addr_1
+ulbl_1:
+    add         r7,r7,pc
+
+    ldr         r5,[sp,#44]                 @mode (19 to 25)
+    ldr         r8, gai4_ihevc_inv_ang_table_addr
+ulbl1:
+    add         r8,r8,pc
+
+    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
+    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table
+    sub         r8, r8, #48                 @gai4_ihevc_inv_ang_table[mode - 12]
+
+    ldr         r7, [r7]                    @intra_pred_ang
+    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 1]
+
+    ldr         r8, [r8]                    @inv_ang
+    add         r6, sp, r4                  @ref_temp + nt
+
+    mul         r9, r4, r7                  @nt*intra_pred_ang
+
+    sub         r6, r6, #1                  @ref_temp + nt - 1
+
+    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]
+    vdup.8      d30, r7                     @intra_pred_ang
+
+    mov         r7, r4
+
+    asr         r9, r9, #5
+
+    vld1.32     d0[0],[r1]!                 @ pu1_ref[two_nt + k]
+
+    vst1.32     d0[0],[r6]!                 @ref_temp[k + nt - 1] = pu1_ref[two_nt + k]@
+
+    subs        r7, r7, #4
+    beq         end_loop_copy
+    sub         r1,#4
+    sub         r6,#4
+    subs        r7,r7,#4
+    beq         loop_copy_8
+    subs        r7,r7,#8
+    beq         loop_copy_16
+
+loop_copy_32:
+    vld1.8      d0,[r1]!
+    vld1.8      d1,[r1]!
+    vld1.8      d2,[r1]!
+    vld1.8      d3,[r1]!
+
+    vst1.8      d0,[r6]!
+    vst1.8      d1,[r6]!
+    vst1.8      d2,[r6]!
+    vst1.8      d3,[r6]!
+    b           end_loop_copy
+
+loop_copy_16:
+    vld1.8      d0,[r1]!
+    vld1.8      d1,[r1]!
+
+    vst1.8      d0,[r6]!
+    vst1.8      d1,[r6]!
+    b           end_loop_copy
+
+loop_copy_8:
+    vld1.8      d0,[r1]!
+    vst1.8      d0,[r6]!
+
+end_loop_copy:
+
+    ldrb        r11, [r1]
+    strb        r11, [r6]
+
+    cmp         r9, #-1
+    bge         linear_filtering
+
+    add         r6, sp, r4                  @ref_temp + nt
+    sub         r6, r6, #2                  @ref_temp + nt - 2
+
+    mov         r12, #0xffffffff
+
+    rsb         r9, r9, r12                 @count to take care of ref_idx
+
+    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]
+
+    mov         r7, #128                    @inv_ang_sum
+
+loop_copy_ref_idx:
+
+    add         r7, r7, r8                  @inv_ang_sum += inv_ang
+    mov         r14,r7,lsr #8
+    ldrb        r11, [r1, -r14]
+@   ldrb        r11, [r1, -r7, lsr #8]
+    strb        r11, [r6], #-1
+
+    subs        r9, r9, #1
+
+    bne         loop_copy_ref_idx
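+
+@ i.e. the loop above projects left reference samples onto the main (top)
+@ reference array: for k = 1..count,
+@   ref_temp[nt - 1 - k] = pu1_ref[two_nt - ((k * inv_ang + 128) >> 8)]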
+
+
+linear_filtering:
+@   after copy
+@   below code is taken from mode 27 to 33 and modified
+
+    ldr         r6,gai4_ihevc_ang_table_addr_2 @loads word32 gai4_ihevc_ang_table[35]
+ulbl_2:
+    add         r6,r6,pc
+
+    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
+    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
+    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
+ulbl2:
+    add         r1,r1,pc
+    add         r6,r1,#1
+
+    add         r8, sp, r4                  @ref_temp + nt
+    sub         r8,#1                       @ref_temp + nt -1
+
+    tst         r4,#7
+    mov         lr,#0                       @row
+    mov         r12,r4
+    bne         core_loop_4
+
+core_loop_8:
+    add         r8,r8,#1                    @pu1_ref_main_idx += (two_nt + 1)
+    vdup.8      d0,r9                       @intra_pred_ang
+    mov         r12,r4,lsr #3               @divide by 8
+
+    vmov.i8     d1,#32
+    mul         r7,r4,r12
+
+    vmov.i16    q3,#31
+    @lsl            r12,r3,#3
+
+    mov         r1,r8
+    @sub            r12,r12,r4
+    mov         r5,r4
+    mov         r11,#1
+
+prologue:
+    vld1.8      {d3},[r6]                   @loads the row value
+    vmull.s8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+    vmovn.i16   d4,q2
+    vshrn.s16   d5,q1,#5                    @idx = pos >> 5
+
+    vdup.8      d31,d4[0]
+    add         r0,r2,r3
+
+    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register
+
+    vdup.8      d29,d4[1]                   @(ii)
+    sbfx        r9,lr,#0,#8
+
+    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
+    sbfx        r9,lr,#8,#8
+
+    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
+    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]
+
+    sbfx        r9,lr,#16,#8
+    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vdup.8      d27,d4[2]                   @(iii)
+    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
+    sbfx        r9,lr,#24,#8
+
+    vdup.8      d25,d4[3]                   @(iv)
+    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
+    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
+    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
+    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
+
+    vdup.8      d31,d4[4]                   @(v)
+    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vmov.u32    lr,d5[1]                    @extract idx to the r register
+    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d10},[r2]!                 @(i row)
+    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sbfx        r9,lr,#0,#8
+    vdup.8      d29,d4[5]                   @(vi)
+    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
+    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
+
+    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    sbfx        r9,lr,#8,#8
+
+    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
+    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d14},[r0],r3               @(ii)
+    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
+    vdup.8      d27,d4[6]                   @(vii)
+
+    sbfx        r9,lr,#16,#8
+    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d18},[r0],r3               @(iii)
+    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vdup.8      d25,d4[7]                   @(viii)
+    sbfx        r9,lr,#24,#8
+
+    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
+    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)
+
+    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
+    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
+    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subs        r4,r4,#8
+
+    vst1.8      {d22},[r0],r3               @(iv)
+    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
+    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
+    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    addgt       r8,r8,#8
+    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subgt       r7,r7,#8
+
+    vst1.8      {d10},[r0],r3               @(v)
+    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+    beq         epilogue
+
+    vld1.8      {d5},[r6]                   @loads the row value
+    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+    vmovn.i16   d4,q2
+    vshrn.s16   d3,q1,#5                    @idx = pos >> 5
+    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
+    sbfx        r9,lr,#0,#8
+    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+    vdup.8      d31,d4[0]
+    subs        r4,r4,#8
+    sbfx        r9,lr,#8,#8
+
+    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
+    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
+
+    addle       r6,r6,#8                    @increment the row value
+    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
+    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d5},[r6]                   @loads the row value
+    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vdup.8      d29,d4[1]                   @(ii)
+    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sbfx        r9,lr,#16,#8
+
+    vst1.8      {d14},[r0],r3               @(vi)
+    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)
+
+    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    sbfx        r9,lr,#24,#8
+    movle       r4,r5                       @reload nt
+
+    vmov.u32    lr,d3[1]                    @extract idx to the r register
+    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vdup.8      d27,d4[2]                   @(iii)
+    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
+    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
+    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vst1.8      {d18},[r0],r3               @(vii)
+    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
+    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vdup.8      d25,d4[3]                   @(iv)
+    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
+
+    vst1.8      {d22},[r0]                  @(viii)
+    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
+    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
+    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    sbfx        r9,lr,#0,#8
+    add         r0,r2,r3
+
+    vdup.8      d31,d4[4]                   @(v)
+    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
+    sbfx        r9,lr,#8,#8
+
+    vst1.8      {d10},[r2]!                 @(i)
+    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
+
+    vdup.8      d29,d4[5]                   @(vi)
+    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vdup.8      d27,d4[6]                   @(vii)
+    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
+    sbfx        r9,lr,#16,#8
+
+    vdup.8      d25,d4[7]                   @(viii)
+    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+
+    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
+    vshrn.s16   d3,q1,#5                    @idx = pos >> 5
+
+    vst1.8      {d14},[r0],r3               @(ii)
+    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
+    sbfx        r9,lr,#24,#8
+
+    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
+    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
+
+    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
+    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
+    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
+    movle       r8,r1                       @reload the source to pu1_src+2nt
+
+    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
+    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)
+
+    vst1.8      {d18},[r0],r3               @(iii)
+    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
+    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
+    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
+    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
+
+    addgt       r8,r8,#8                    @increment the source next set 8 columns in same row
+    lslle       r12,r3,#3
+    suble       r12,r12,r5
+
+    vst1.8      {d22},[r0],r3               @(iv)
+    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vst1.8      {d10},[r0],r3               @(v)
+    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    addle       r2,r2,r12                   @increment the dst pointer to 8*dst_strd - nt
+    sbfx        r9,lr,#0,#8
+
+    vmovn.i16   d4,q2
+    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+    subs        r7,r7,#8
+    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]
+
+    bne         kernel_8_rows
+
+epilogue:
+    vst1.8      {d14},[r0],r3               @(vi)
+    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
+    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d18},[r0],r3               @(vii)
+    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vst1.8      {d22},[r0],r3               @(viii)
+    b           end_loops
+
+core_loop_4:
+    add         r6,r8,#1                    @pu1_ref_main_idx +=  1
+    mov         r8,#0
+
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    mov         lr,r5,asr #5                @idx = pos >> 5
+    and         r5,r5,#31                   @fract = pos & (31)
+    add         r10,r6,lr                   @ref_main + idx
+    add         r11,r10,#1                  @ref_main + idx + 1
+    vdup.8      d0,r5                       @dup_const_fract
+    rsb         r4,r5,#32
+    vdup.8      d1,r4                       @dup_const_32_fract
+
+@inner_loop_4
+    vld1.32     {d2[0]},[r10]               @ref_main_idx
+    add         r8,r8,#1
+@   mov         lr,r5                           @fract_prev = fract
+
+    vld1.32     {d3[0]},[r11]               @ref_main_idx_1
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    mov         lr,r5,asr #5                @ pos >> 5
+    and         r5,r5,#31                   @fract = pos & (31)
+    add         r10,r6,lr                   @ref_main + idx
+    add         r11,r10,#1                  @ref_main + idx + 1
+
+    vdup.8      d6,r5                       @dup_const_fract
+    vmull.u8    q2,d2,d1                    @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    rsb         r4,r5,#32
+    vdup.8      d7,r4                       @dup_const_32_fract
+    vmlal.u8    q2,d3,d0                    @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.32     {d8[0]},[r10]               @ref_main_idx
+    add         r8,r8,#1
+
+    vld1.32     {d9[0]},[r11]               @ref_main_idx_1
+    vrshrn.i16  d4,q2,#5                    @shift_res = vrshrn_n_u16(add_res, 5)
+
+@   mov         lr,r5                           @fract_prev = fract
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    mov         lr,r5,asr #5                @idx = pos >> 5
+    and         r5,r5,#31                   @fract = pos & (31)
+    add         r10,r6,lr                   @ref_main + idx
+    add         r11,r10,#1                  @ref_main + idx + 1
+
+    vdup.8      d12,r5                      @dup_const_fract
+    vmull.u8    q5,d8,d7                    @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    rsb         r4,r5,#32
+    vdup.8      d13,r4                      @dup_const_32_fract
+    vmlal.u8    q5,d9,d6                    @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.32     {d14[0]},[r10]              @ref_main_idx
+    add         r8,r8,#1
+
+    vst1.32     {d4[0]},[r2],r3
+    vrshrn.i16  d10,q5,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.32     {d15[0]},[r11]              @ref_main_idx_1
+@   mov         lr,r5                           @fract_prev = fract
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    mov         lr,r5,asr #5                @idx = pos >> 5
+    and         r5,r5,#31                   @fract = pos & (31)
+    add         r10,r6,lr                   @ref_main + idx
+    add         r11,r10,#1                  @ref_main + idx + 1
+
+    vdup.8      d18,r5                      @dup_const_fract
+    vmull.u8    q8,d14,d13                  @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    rsb         r4,r5,#32
+    vdup.8      d19,r4                      @dup_const_32_fract
+    vmlal.u8    q8,d15,d12                  @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.32     {d20[0]},[r10]              @ref_main_idx
+
+    vst1.32     {d10[0]},[r2],r3
+    vrshrn.i16  d16,q8,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
+    vld1.32     {d21[0]},[r11]              @ref_main_idx_1
+
+    vmull.u8    q11,d20,d19                 @vmull_u8(ref_main_idx, dup_const_32_fract)
+    vmlal.u8    q11,d21,d18                 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.32     {d16[0]},[r2],r3
+    vrshrn.i16  d22,q11,#5                  @shift_res = vrshrn_n_u16(add_res, 5)
+
+    vst1.32     {d22[0]},[r2],r3
+
+end_loops:
+    add         sp, sp, #132
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_filters_neon_intr.c b/common/arm/ihevc_intra_pred_filters_neon_intr.c
new file mode 100644
index 0000000..0e89de3
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_filters_neon_intr.c
@@ -0,0 +1,2920 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_intra_pred_filters_neon_intr.c
+*
+* @brief
+*  Contains function definitions for intra prediction interpolation filters
+*
+*
+* @author
+*  Yogeswaran RS
+*
+* @par List of Functions:
+*  - ihevc_intra_pred_luma_planar()
+*  - ihevc_intra_pred_luma_dc()
+*  - ihevc_intra_pred_luma_horz()
+*  - ihevc_intra_pred_luma_ver()
+*  - ihevc_intra_pred_luma_mode2()
+*  - ihevc_intra_pred_luma_mode_18_34()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "arm_neon.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_common_tables.h"
+
+/****************************************************************************/
+/* Constant Macros                                                          */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+
+
+
+/*****************************************************************************/
+/* Table Look-up                                                             */
+/*****************************************************************************/
+
+#define GET_BITS(y,x) ((y) & (1 << x)) && (1 << x)
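+
+/* GET_BITS(y, x) evaluates to 1 when bit x of y is set and to 0 otherwise;
+ * the trailing `&& (1 << x)` only booleanises the masked value.
+ * e.g. GET_BITS(0x5, 2) == 1 and GET_BITS(0x5, 1) == 0. */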
+
+/*****************************************************************************/
+/* Function Definition                                                      */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+ *    Intra prediction interpolation filter for pu1_ref substitution
+ *
+ *
+ * @par Description:
+ *    Reference substitution process for samples unavailable  for prediction
+ *    Refer to section 8.4.4.2.2
+ *
+ * @param[in] pu1_top_left
+ *  UWORD8 pointer to the top-left
+ *
+ * @param[in] pu1_top
+ *  UWORD8 pointer to the top
+ *
+ * @param[in] pu1_left
+ *  UWORD8 pointer to the left
+ *
+ * @param[in] src_strd
+ *  WORD32 Source stride
+ *
+ * @param[in] nbr_flags
+ *  WORD32 neighbor availability flags
+ *
+ * @param[in] nt
+ *  WORD32 transform Block size
+ *
+ * @param[in] dst_strd
+ *  WORD32 Destination stride
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
+                                                     UWORD8 *pu1_top,
+                                                     UWORD8 *pu1_left,
+                                                     WORD32 src_strd,
+                                                     WORD32 nt,
+                                                     WORD32 nbr_flags,
+                                                     UWORD8 *pu1_dst,
+                                                     WORD32 dst_strd)
+{
+    UWORD8 pu1_ref;
+    WORD32 dc_val, i;
+    WORD32 total_samples = (4 * nt) + 1;
+    WORD32 two_nt = 2 * nt;
+    WORD32 three_nt = 3 * nt;
+    WORD32 get_bits;
+    WORD32 next;
+    WORD32 bot_left, left, top, tp_right, tp_left;
+    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+    UNUSED(dst_strd);
+    dc_val = 1 << (BIT_DEPTH - 1);
+
+    /* Neighbor Flag Structure (field widths in bits):
+           Top-Left | Top-Right | Top | Left | Bottom-Left
+               1          4        4      4        4
+     */
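+    /* e.g. the nt <= 8 path below reduces each field to its least
+       significant bit (positions 16/12/8/4/0), so a value such as
+       nbr_flags == 0x11111 makes all five neighbours available. */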
+
+    /* If no neighbor flags are present, fill the neighbor samples with DC value */
+    if(nbr_flags == 0)
+    {
+        for(i = 0; i < total_samples; i++)
+        {
+            pu1_dst[i] = dc_val;
+        }
+    }
+    else
+    {
+        /* Else fill the corresponding samples */
+        pu1_dst[two_nt] = *pu1_top_left;
+        UWORD8 *pu1_dst_tmp2 = pu1_dst;
+        UWORD8 *pu1_top_tmp = pu1_top;
+        pu1_dst_tmp2 += two_nt + 1;
+
+        for(i = 0; i < two_nt; i++)
+            pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+
+        uint8x8_t src;
+        for(i = two_nt; i > 0; i -= 8)
+        {
+            src = vld1_u8(pu1_top_tmp);
+            pu1_top_tmp += 8;
+            vst1_u8(pu1_dst_tmp2, src);
+            pu1_dst_tmp2 += 8;
+        }
+
+        if(nt <= 8)
+        {
+            /* 1 bit extraction for all the neighboring blocks */
+            tp_left = (nbr_flags & 0x10000) >> 16;
+            bot_left = nbr_flags & 0x1;
+            left = (nbr_flags & 0x10) >> 4;
+            top = (nbr_flags & 0x100) >> 8;
+            tp_right = (nbr_flags & 0x1000) >> 12;
+
+            next = 1;
+
+            /* If bottom-left is not available, do the reverse substitution process */
+            if(bot_left == 0)
+            {
+                WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };
+
+                /* Check for the 1st available sample from bottom-left*/
+                while(!a_nbr_flag[next])
+                    next++;
+
+                /* If Left, top-left are available*/
+                if(next <= 2)
+                {
+                    idx = nt * next;
+                    pu1_ref = pu1_dst[idx];
+                    for(i = 0; i < idx; i++)
+                        pu1_dst[i] = pu1_ref;
+                }
+                else /* If top, top-right are available */
+                {
+                    /* Idx is changed to copy 1 pixel value for top-left, if top-left is not available */
+                    idx = (nt * (next - 1)) + 1;
+                    pu1_ref = pu1_dst[idx];
+                    for(i = 0; i < idx; i++)
+                        pu1_dst[i] = pu1_ref;
+                }
+            }
+
+            /* Forward Substitution Process */
+            /* If left is Unavailable, copy the last bottom-left value */
+
+            if(left == 0)
+            {
+                uint8x8_t dup_pu1_dst1;
+                UWORD8 *pu1_dst_const_nt = pu1_dst;
+                pu1_dst_const_nt += nt;
+
+                if(0 == (nt & 7))
+                {
+                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
+                    for(i = nt; i > 0; i -= 8)
+                    {
+                        vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
+                        pu1_dst_const_nt += 8;
+
+                    }
+                }
+                else
+                {
+                    //uint32x2_t dup_pu1_dst4;
+                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
+                    //dup_pu1_dst4 = vdup_n_u32((uint32_t) pu1_dst[nt - 1]);
+                    for(i = nt; i > 0; i -= 4)
+                    {
+                        vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
+                        pu1_dst_const_nt += 4;
+
+                    }
+
+                }
+
+            }
+            if(tp_left == 0)
+                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
+            if(top == 0)
+            {
+
+                if(0 == (nt & 7))
+                {
+                    uint8x8_t dup_pu1_dst2;
+                    UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
+                    pu1_dst_const_two_nt_1 += (two_nt + 1);
+                    dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
+                    for(i = nt; i > 0; i -= 8)
+                    {
+                        vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
+                        pu1_dst_const_two_nt_1 += 8;
+
+                    }
+                }
+                else
+                {
+                    for(i = 0; i < nt; i++)
+                        pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
+                }
+            }
+            if(tp_right == 0)
+            {
+                uint8x8_t dup_pu1_dst3;
+                UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
+                pu1_dst_const_three_nt_1 += (three_nt + 1);
+                dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]);
+                if(0 == (nt & 7))
+                {
+                    for(i = nt; i > 0; i -= 8)
+                    {
+                        vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
+                        pu1_dst_const_three_nt_1 += 8;
+
+                    }
+                }
+                else
+                {
+                    for(i = nt; i > 0; i -= 4)
+                    {
+                        vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
+                        pu1_dst_const_three_nt_1 += 4;
+                    }
+
+                }
+
+            }
+        }
+        if(nt == 16)
+        {
+            WORD32 nbr_flags_temp = 0;
+            nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
+                            + ((nbr_flags & 0x300) >> 4)
+                            + ((nbr_flags & 0x3000) >> 6)
+                            + ((nbr_flags & 0x10000) >> 8);
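+
+            /* e.g. fully available neighbours (nbr_flags == 0x1FFFF) remap to
+               nbr_flags_temp == 0x1FF: two bits per 8-pel neighbour field plus
+               the single top-left bit at bit 8. */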
+
+            /* compute trailing zeros based on nbr_flags for the below-left substitution process; see section 8.4.4.2.2 */
+            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
+
+                if(nbr_id_from_bl == 64)
+                    nbr_id_from_bl = 32;
+
+                if(nbr_id_from_bl == 32)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags_temp >> 8) & 0x1))
+                    {
+                        nbr_id_from_bl++;
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right;  8 pels per nbr bit */
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref = pu1_dst[nbr_id_from_bl];
+                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+                    {
+                        pu1_dst[i] = pu1_ref;
+                    }
+                }
+            }
+
+            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+            while(nbr_id_from_bl < ((T16_4NT) + 1))
+            {
+                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
+                /* Divide by 8 to obtain the original index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T16_4NT / 2))
+                {
+                    get_bits = GET_BITS(nbr_flags_temp, 8);
+
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+                }
+                else
+                {
+                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        /* 8 pel substitution (other than TL) */
+                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+                        for(i = 0; i < 8; i++)
+                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
+                    }
+
+                }
+                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
+            }
+        }
+
+        if(nt == 32)
+        {
+            /* compute trailing zeros based on nbr_flags for the below-left substitution process; see section 8.4.4.2.2 */
+            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
+
+                if(nbr_id_from_bl == 64)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags >> 16) & 0x1))
+                    {
+                        /* top left not available */
+                        nbr_id_from_bl++;
+                        /* top and top right;  8 pels per nbr bit */
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref = pu1_dst[nbr_id_from_bl];
+                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+                        pu1_dst[i] = pu1_ref;
+                }
+            }
+
+            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+            while(nbr_id_from_bl < ((T32_4NT)+1))
+            {
+                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
+                /* Divide by 8 to obtain the original index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T32_4NT / 2))
+                {
+                    get_bits = GET_BITS(nbr_flags, 16);
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+                }
+                else
+                {
+                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        /* 8 pel substitution (other than TL) */
+                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+                        for(i = 0; i < 8; i++)
+                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
+                    }
+
+                }
+                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
+            }
+        }
+
+    }
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *    Intra prediction interpolation filter for ref_filtering
+ *
+ *
+ * @par Description:
+ *    Reference DC filtering for neighboring samples, dependent on TU size and
+ *    mode. Refer to section 8.4.4.2.3 in the standard.
+ *
+ * @param[in] pu1_src
+ *  UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ *  UWORD8 pointer to the destination
+ *
+ * @param[in] nt
+ *  integer Transform Block size
+ *
+ * @param[in] mode
+ *  integer intraprediction mode
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
+                                             WORD32 nt,
+                                             UWORD8 *pu1_dst,
+                                             WORD32 mode,
+                                             WORD32 strong_intra_smoothing_enable_flag)
+{
+    WORD32 filter_flag;
+    WORD32 i = 0;
+    WORD32 four_nt = 4 * nt;
+
+    WORD32 src_4nt;
+
+    /* Naming has been made as per the functionality it has, e.g. pu1_src_tmp_1 denotes pu1_src + 1 */
+    /* src_val_1 to load value from pointer pu1_src_tmp_1, add_res has the result of adding 2 values        */
+    UWORD8 *pu1_src_tmp_0 = pu1_src;
+    UWORD8 *pu1_src_tmp_1;
+    UWORD8 *pu1_src_tmp_2;
+    UWORD8 *pu1_dst_tmp_0 = pu1_dst;
+    UWORD8 *pu1_dst_tmp_1;
+
+    uint8x8_t src_val_0, src_val_2;
+    uint8x8_t src_val_1, shift_res;
+    uint8x8_t dup_const_2;
+    uint16x8_t mul_res, add_res;
+    WORD32 bi_linear_int_flag = 0;
+    WORD32 abs_cond_left_flag = 0;
+    WORD32 abs_cond_top_flag = 0;
+    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
+    shift_res = vdup_n_u8(0);
+
+    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
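+    /* gau1_intra_pred_ref_filter[mode] presumably carries one enable bit per
+       transform size, selected by CTZ(nt) - 2 (nt = 4 -> bit 0, 8 -> bit 1,
+       16 -> bit 2, 32 -> bit 3); filter_flag is nonzero when filtering
+       applies for this mode at this size. */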
+
+    if(0 == filter_flag)
+    {
+        if(pu1_src == pu1_dst)
+        {
+            return;
+        }
+        else
+        {
+            for(i = four_nt; i > 0; i -= 8)
+            {
+                src_val_0 = vld1_u8(pu1_src_tmp_0);
+                pu1_src_tmp_0 += 8;
+                vst1_u8(pu1_dst_tmp_0, src_val_0);
+                pu1_dst_tmp_0 += 8;
+            }
+            pu1_dst[four_nt] = pu1_src[four_nt];
+        }
+    }
+
+    else
+    {
+        /* If strong intra smoothing is enabled and the transform size is 32 */
+        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
+        {
+            /*Strong Intra Filtering*/
+            abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
+                            - (2 * pu1_src[3 * nt]))) < dc_val;
+            abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
+                            - (2 * pu1_src[nt]))) < dc_val;
+
+            bi_linear_int_flag = ((1 == abs_cond_left_flag)
+                            && (1 == abs_cond_top_flag));
+        }
+
+        src_4nt = pu1_src[4 * nt];
+        /* Strong filtering of reference samples */
+        if(1 == bi_linear_int_flag)
+        {
+            WORD32 two_nt = four_nt >> 1;
+
+            WORD32 pu1_src_0_val = pu1_src[0];
+            WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
+            WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];
+
+            WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
+            uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);
+
+            WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
+            uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);
+
+            const UWORD8 *const_col_i;
+            uint8x8_t const_col_i_val;
+            uint16x8_t prod_val_1;
+            uint16x8_t prod_val_2;
+            uint16x8_t prod_val_3;
+            uint16x8_t prod_val_4;
+            uint8x8_t res_val_1;
+            uint8x8_t res_val_2;
+            uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
+            uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
+            uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
+            pu1_dst_tmp_0 = pu1_dst + 1;
+            pu1_dst_tmp_1 = pu1_dst + two_nt + 1;
+
+            const_col_i = gau1_ihevc_planar_factor + 1;
+
+            for(i = two_nt; i > 0; i -= 8)
+            {
+                const_col_i_val = vld1_u8(const_col_i);
+                const_col_i += 8;
+
+                prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
+                prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);
+
+                res_val_1 = vrshrn_n_u16(prod_val_2, 6);
+                prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);
+
+                vst1_u8(pu1_dst_tmp_0, res_val_1);
+                pu1_dst_tmp_0 += 8;
+                prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);
+
+                res_val_2 = vrshrn_n_u16(prod_val_4, 6);
+                vst1_u8(pu1_dst_tmp_1, res_val_2);
+                pu1_dst_tmp_1 += 8;
+            }
+            pu1_dst[2 * nt] = pu1_src[2 * nt];
+        }
+        else
+        {
+            pu1_src_tmp_1 = pu1_src + 1;
+            pu1_src_tmp_2 = pu1_src + 2;
+            pu1_dst_tmp_0 += 1;
+
+            dup_const_2 = vdup_n_u8(2);
+
+            /* Extremities Untouched*/
+            pu1_dst[0] = pu1_src[0];
+
+            /* To handle in-place filtering (pu1_dst == pu1_src), each iteration's loads are
+             * issued before the previous iteration's result is stored */
+
+            /* Perform bilinear filtering of Reference Samples */
+            for(i = (four_nt - 1); i > 0; i -= 8)
+            {
+                src_val_0 = vld1_u8(pu1_src_tmp_0);
+                pu1_src_tmp_0 += 8;
+
+                src_val_2 = vld1_u8(pu1_src_tmp_2);
+                pu1_src_tmp_2 += 8;
+
+                src_val_1 = vld1_u8(pu1_src_tmp_1);
+                pu1_src_tmp_1 += 8;
+
+                if(i < four_nt - 1)
+                {
+                    vst1_u8(pu1_dst_tmp_0, shift_res);
+                    pu1_dst_tmp_0 += 8;
+                }
+
+                add_res = vaddl_u8(src_val_0, src_val_2);
+
+                mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
+                shift_res = vrshrn_n_u16(mul_res, 2);
+
+            }
+            vst1_u8(pu1_dst_tmp_0, shift_res);
+            pu1_dst_tmp_0 += 8;
+        }
+        pu1_dst[4 * nt] = src_4nt;
+
+    }
+
+}
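+
+/* Illustrative scalar sketch of the normal (non-strong) smoothing branch above:
+ * a [1 2 1]/4 filter over the 4*nt + 1 reference samples, the two end samples
+ * left untouched (assumption: matches section 8.4.4.2.3 of the standard; the
+ * hypothetical helper assumes pu1_dst != pu1_src). Compiled out; for reference
+ * only. */
+#if 0
+static void ref_filter_scalar_sketch(UWORD8 *pu1_src, UWORD8 *pu1_dst, WORD32 nt)
+{
+    WORD32 i;
+    pu1_dst[0] = pu1_src[0];
+    for(i = 1; i < 4 * nt; i++)
+        pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2;
+    pu1_dst[4 * nt] = pu1_src[4 * nt];
+}
+#endif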
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma planar
+*
+* @par Description:
+*    Planar intra prediction from the reference neighboring samples pointed to
+*    by 'pu1_ref' to the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intra prediction mode (unused here)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
+                                           WORD32 src_strd,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 dst_strd,
+                                           WORD32 nt,
+                                           WORD32 mode)
+{
+    /* Naming convention: (nt - 1 - col) --> const_nt_1_col ('const' refers to gau1_ihevc_planar_factor) */
+    /* const_nt_1_col values are loaded into a d register                                                 */
+    /* Likewise, pu1_ref[nt - 1] --> pu1_ref_nt_1;                                                        */
+    /* its value duplicated across a d register gives pu1_ref_nt_1_dup                                    */
+    /* The (log2nt + 1) shift is folded in when the values are assigned                                   */
+    /* In the width-multiple-of-4 case, the row loop is also unrolled by 2 and the stores are adjusted    */
+
+    WORD32 row, col = 0;
+    WORD32 log2nt_plus1 = 6;
+    WORD32 two_nt, three_nt;
+    UWORD8 *pu1_ref_two_nt_1;
+    UWORD8 *pu1_dst_tmp;
+    const UWORD8 *const_nt_1_col;
+    uint8x8_t const_nt_1_col_t;
+    const UWORD8 *const_col_1;
+    uint8x8_t const_col_1_t;
+    uint8_t const_nt_1_row;
+    uint8x8_t const_nt_1_row_dup;
+    uint8_t const_row_1;
+    uint8x8_t const_row_1_dup;
+    uint8_t const_nt = nt;
+    uint16x8_t const_nt_dup;
+    uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
+    uint8x8_t pu1_ref_nt_1_dup;
+    uint8_t pu1_ref_two_nt_1_row;
+    uint8_t pu1_ref_three_nt_1;
+    uint8x8_t pu1_ref_two_nt_1_row_dup;
+    uint8x8_t pu1_ref_two_nt_1_t;
+    uint8x8_t pu1_ref_three_nt_1_dup;
+    uint16x8_t prod_t1;
+    uint16x8_t prod_t2;
+    uint16x8_t sto_res_tmp;
+    uint8x8_t sto_res;
+    int16x8_t log2nt_dup;
+    UNUSED(src_strd);
+    UNUSED(mode);
+    log2nt_plus1 = 32 - CLZ(nt);
+    two_nt = 2 * nt;
+    three_nt = 3 * nt;
+    /* Loops are unrolled by 8, since the width is a multiple of 8           */
+    if(0 == (nt & 7))
+    {
+        pu1_dst_tmp = pu1_dst;
+        const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
+
+        const_col_1 = gau1_ihevc_planar_factor + 1;
+        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];
+
+        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
+        const_nt_dup = vdupq_n_u16(const_nt);
+
+        log2nt_dup = vdupq_n_s16(log2nt_plus1);
+        log2nt_dup = vnegq_s16(log2nt_dup);
+
+        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
+
+        for(row = 0; row < nt; row++)
+        {
+            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
+            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
+
+            const_nt_1_row = nt - 1 - row;
+            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
+
+            const_row_1 = row + 1;
+            const_row_1_dup = vdup_n_u8(const_row_1);
+
+            const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
+
+            const_col_1 = gau1_ihevc_planar_factor + 1;
+            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;
+
+            for(col = nt; col > 0; col -= 8)
+            {
+                const_nt_1_col_t = vld1_u8(const_nt_1_col);
+                const_nt_1_col -= 8;
+                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
+
+                const_col_1_t = vld1_u8(const_col_1);
+                const_col_1 += 8;
+                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
+
+                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
+                pu1_ref_two_nt_1 += 8;
+                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
+
+                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
+                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
+                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
+                prod_t1 = vaddq_u16(prod_t1, prod_t2);
+
+                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
+                sto_res = vmovn_u16(sto_res_tmp);
+                vst1_u8(pu1_dst_tmp, sto_res);
+                pu1_dst_tmp += 8;
+            }
+            pu1_dst_tmp += dst_strd - nt;
+        }
+    }
+    /* Loops are unrolled by 4, since the width is a multiple of 4           */
+    /* When the width is a multiple of 4, the height must be a multiple of 2 */
+    else
+    {
+        uint8x8_t const_row_1_dup1;
+        uint8x8_t pu1_ref_two_nt_1_t1;
+        uint8x8_t const_nt_1_col_t1;
+        uint8x8_t const_col_1_t1;
+        uint8x8_t pu1_ref_two_nt_1_row_dup1;
+        uint8x8_t const_nt_1_row_dup1;
+
+        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];
+
+        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
+        const_nt_dup = vdupq_n_u16(const_nt);
+
+        log2nt_dup = vdupq_n_s16(log2nt_plus1);
+        log2nt_dup = vnegq_s16(log2nt_dup);
+
+        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
+
+        for(row = 0; row < nt; row += 2)
+        {
+            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
+            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
+            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
+            pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
+            pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);
+
+            const_nt_1_row = nt - 1 - row;
+            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
+            const_nt_1_row = nt - 2 - row;
+            const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
+            const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);
+
+            const_row_1 = row + 1;
+            const_row_1_dup = vdup_n_u8(const_row_1);
+            const_row_1 = row + 2;
+            const_row_1_dup1 = vdup_n_u8(const_row_1);
+            const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);
+
+            const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;
+
+            const_col_1 = gau1_ihevc_planar_factor + 1;
+
+            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;
+
+            for(col = nt; col > 0; col -= 4)
+            {
+                const_nt_1_col_t = vld1_u8(const_nt_1_col);
+                const_nt_1_col -= 4;
+                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
+
+                const_col_1_t = vld1_u8(const_col_1);
+                const_col_1 += 4;
+                const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));
+
+                pu1_dst_tmp = pu1_dst;
+                const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);
+
+                const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
+                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
+
+                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
+                pu1_ref_two_nt_1 += 4;
+                const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);
+
+                pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
+                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
+
+                pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
+                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
+
+                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
+                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
+                prod_t1 = vaddq_u16(prod_t1, prod_t2);
+
+                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
+                sto_res = vmovn_u16(sto_res_tmp);
+
+                vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+                pu1_dst_tmp += dst_strd;
+
+                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+                pu1_dst += 4;
+            }
+            pu1_dst += 2 * dst_strd - nt;
+        }
+    }
+
+}
+/* INTRA_PRED_LUMA_PLANAR */
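+
+/* Illustrative scalar sketch of the planar prediction computed above
+ * (assumption: follows section 8.4.4.2.4 of the standard). Compiled out;
+ * for reference only. */
+#if 0
+static void planar_scalar_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                 WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col;
+    WORD32 log2nt_plus1 = 32 - CLZ(nt);
+    WORD32 two_nt = 2 * nt;
+    WORD32 three_nt = 3 * nt;
+
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] =
+                ((nt - 1 - col) * pu1_ref[two_nt - 1 - row]   /* left sample        */
+               + (col + 1)      * pu1_ref[three_nt + 1]       /* top-right sample   */
+               + (nt - 1 - row) * pu1_ref[two_nt + 1 + col]   /* top sample         */
+               + (row + 1)      * pu1_ref[nt - 1]             /* bottom-left sample */
+               + nt) >> log2nt_plus1;
+}
+#endif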
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma dc
+*
+* @par Description:
+*    DC-mode intra prediction from the reference neighboring samples pointed
+*    to by 'pu1_ref' to the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intra prediction mode (unused here)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
+                                       WORD32 src_strd,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 dst_strd,
+                                       WORD32 nt,
+                                       WORD32 mode)
+{
+    WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
+    WORD32 i = 0;
+    WORD32 row = 0, col = 0, col_count;
+    WORD32 log2nt_plus1 = 6;
+    WORD32 two_nt = 0;
+    uint16x8_t ref_load_q;
+    uint16x8_t three_dc_val_t;
+    uint8x8_t sto_res_tmp;
+    uint8x8_t sto_res_tmp1;
+    uint8x8_t sto_res_tmp2;
+    uint8x8_t sto_res_tmp3;
+    uint8x8_t sto_res_tmp4;
+    uint8x8_t dc_val_t;
+
+    UWORD8 *pu1_ref_tmp;
+    UWORD8 *pu1_ref_tmp1;
+    UWORD8 *pu1_dst_tmp;
+    UWORD8 *pu1_dst_tmp1;
+    UWORD8 *pu1_dst_tmp2;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    /* log2nt + 1 is taken care while assigning the values itself.          */
+    log2nt_plus1 = 32 - CLZ(nt);
+
+    /* Loops are unrolled by 8, since the width is a multiple of 8          */
+    if(0 == (nt & 7))
+    {
+        uint8x8_t ref_load1;
+        uint8x8_t ref_load2;
+        uint16x4_t acc_dc_pair1;
+        uint32x2_t acc_dc_pair2;
+        uint64x1_t acc_dc = vdup_n_u64(0);
+
+        two_nt = 2 * nt;
+        pu1_ref_tmp = pu1_ref + nt;
+        pu1_ref_tmp1 = pu1_ref + two_nt + 1;
+
+        for(i = two_nt; i > nt; i -= 8)
+        {
+            ref_load1 = vld1_u8(pu1_ref_tmp);
+            pu1_ref_tmp += 8;
+            acc_dc_pair1 = vpaddl_u8(ref_load1);
+
+            ref_load2 = vld1_u8(pu1_ref_tmp1);
+            pu1_ref_tmp1 += 8;
+
+            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
+            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
+
+            acc_dc_pair1 = vpaddl_u8(ref_load2);
+            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
+            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
+        }
+
+        dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
+        dc_val_t = vdup_n_u8(dc_val);
+        two_dc_val = 2 * dc_val;
+        three_dc_val = 3 * dc_val;
+        three_dc_val += 2;
+
+        three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
+        pu1_ref_tmp = pu1_ref + two_nt + 1 + 0;
+        pu1_dst_tmp = pu1_dst;
+
+
+        if(nt == 32)
+        {
+            for(row = 0; row < nt; row++)
+            {
+                for(col = nt; col > 0; col -= 8)
+                {
+                    vst1_u8(pu1_dst_tmp, dc_val_t);
+                    pu1_dst_tmp += 8;
+                }
+                pu1_dst_tmp += dst_strd - nt;
+            }
+        }
+        else
+
+        {
+            for(col = nt; col > 0; col -= 8)
+            {
+                ref_load1 = vld1_u8(pu1_ref_tmp);
+                pu1_ref_tmp += 8;
+                ref_load_q = vmovl_u8(ref_load1);
+                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
+                ref_load_q = vshrq_n_u16(ref_load_q, 2);
+                sto_res_tmp = vmovn_u16(ref_load_q);
+                vst1_u8(pu1_dst_tmp, sto_res_tmp);
+                pu1_dst_tmp += 8;
+            }
+
+            pu1_ref_tmp = pu1_ref + two_nt - 9;
+            pu1_dst_tmp = pu1_dst + dst_strd;
+            col_count = nt - 8;
+
+            /* All rows except the first are handled here                                      */
+            /* Both the column and row loops are unrolled by 8                                 */
+            /* The stores account for the unrolling                                            */
+            /* Except for the first column, the remaining rows (other than the first row)      */
+            /* hold a constant value, so they are filled with a duplicated constant and stored */
+            /* When the column count exceeds 8, the remaining constant values are handled in   */
+            /* the inner for loop                                                              */
+
+            for(row = nt; row > 0; row -= 8)
+            {
+                pu1_dst_tmp1 = pu1_dst_tmp + 8;
+                ref_load1 = vld1_u8(pu1_ref_tmp);
+                pu1_ref_tmp -= 8;
+                ref_load_q = vmovl_u8(ref_load1);
+                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
+                ref_load_q = vshrq_n_u16(ref_load_q, 2);
+                sto_res_tmp = vmovn_u16(ref_load_q);
+
+                sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);
+
+                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
+                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
+                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
+                pu1_dst_tmp += dst_strd;
+
+                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
+                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
+                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
+                pu1_dst_tmp += dst_strd;
+
+                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
+                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
+                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
+                pu1_dst_tmp += dst_strd;
+
+                sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
+                sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
+                vst1_u8(pu1_dst_tmp, sto_res_tmp4);
+                pu1_dst_tmp += dst_strd;
+
+                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
+                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
+                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
+                pu1_dst_tmp += dst_strd;
+
+                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
+                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
+                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
+                pu1_dst_tmp += dst_strd;
+
+                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
+                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
+                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
+                pu1_dst_tmp += dst_strd;
+                /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
+                if(row != 8)
+                    vst1_u8(pu1_dst_tmp, sto_res_tmp4);
+                pu1_dst_tmp += dst_strd;
+
+                for(col = col_count; col > 0; col -= 8)
+                {
+                    pu1_dst_tmp2 = pu1_dst_tmp1;
+                    vst1_u8(pu1_dst_tmp1, dc_val_t);
+                    pu1_dst_tmp1 += dst_strd;
+                    vst1_u8(pu1_dst_tmp1, dc_val_t);
+                    pu1_dst_tmp1 += dst_strd;
+                    vst1_u8(pu1_dst_tmp1, dc_val_t);
+                    pu1_dst_tmp1 += dst_strd;
+                    vst1_u8(pu1_dst_tmp1, dc_val_t);
+                    pu1_dst_tmp1 += dst_strd;
+                    vst1_u8(pu1_dst_tmp1, dc_val_t);
+                    pu1_dst_tmp1 += dst_strd;
+                    vst1_u8(pu1_dst_tmp1, dc_val_t);
+                    pu1_dst_tmp1 += dst_strd;
+                    vst1_u8(pu1_dst_tmp1, dc_val_t);
+                    pu1_dst_tmp1 += dst_strd;
+
+                    /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
+                    if(row != 8)
+                        vst1_u8(pu1_dst_tmp1, dc_val_t);
+                    pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
+                }
+            }
+            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
+        }
+    }
+    /* Loops are unrolled by 4, since the width is a multiple of 4          */
+    else
+    {
+        WORD32 acc_dc;
+        two_nt = 2 * nt;
+
+        acc_dc = 0;
+        pu1_ref_tmp = pu1_ref + nt + 1;
+        for(i = nt; i < two_nt; i++)
+        {
+            acc_dc += pu1_ref[i];
+            acc_dc += pu1_ref_tmp[i];
+        }
+        dc_val = (acc_dc + nt) >> (log2nt_plus1);
+        two_dc_val = 2 * dc_val;
+        three_dc_val = 3 * dc_val;
+        three_dc_val = three_dc_val + 2;
+        dc_val_t = vdup_n_u8(dc_val);
+
+        if(nt == 32)
+        {
+            pu1_dst_tmp = pu1_dst;
+            for(row = 0; row < nt; row++)
+            {
+                for(col = nt; col > 0; col -= 4)
+                {
+                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
+                    pu1_dst_tmp += 4;
+                }
+                pu1_dst_tmp += dst_strd - nt;
+            }
+        }
+        else
+
+        {
+            for(col = 1; col < nt; col++)
+            {
+                pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
+            }
+
+            pu1_dst_tmp = pu1_dst + dst_strd + 0;
+            /* Since first row is already updated before, loop count is nt-1 */
+            for(row = nt - 1; row > 0; row -= 1)
+            {
+                for(col = nt; col > 0; col -= 4)
+                {
+                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
+                    pu1_dst_tmp += 4;
+                }
+                pu1_dst_tmp += dst_strd - nt;
+            }
+
+            for(row = 1; row < nt; row++)
+            {
+                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
+            }
+            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
+        }
+    }
+}
+/* INTRA_PRED_LUMA_DC */
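+
+/* Illustrative scalar sketch of the DC prediction computed above: the DC value
+ * is the rounded mean of the nt left and nt top reference samples, and for
+ * nt < 32 the first row and column are additionally smoothed (assumption:
+ * follows section 8.4.4.2.5 of the standard). Compiled out; for reference
+ * only. */
+#if 0
+static void dc_scalar_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                             WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 i, row, col;
+    WORD32 acc = 0;
+    WORD32 two_nt = 2 * nt;
+    WORD32 log2nt_plus1 = 32 - CLZ(nt);
+    WORD32 dc;
+
+    for(i = nt; i < two_nt; i++)                  /* left reference column */
+        acc += pu1_ref[i];
+    for(i = two_nt + 1; i <= two_nt + nt; i++)    /* top reference row     */
+        acc += pu1_ref[i];
+    dc = (acc + nt) >> log2nt_plus1;
+
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] = dc;
+
+    if(nt < 32)                                   /* boundary smoothing    */
+    {
+        pu1_dst[0] = (pu1_ref[two_nt - 1] + 2 * dc + pu1_ref[two_nt + 1] + 2) >> 2;
+        for(col = 1; col < nt; col++)
+            pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + 3 * dc + 2) >> 2;
+        for(row = 1; row < nt; row++)
+            pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + 3 * dc + 2) >> 2;
+    }
+}
+#endif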
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma horizontal mode
+*
+* @par Description:
+*    Horizontal intra prediction from the reference neighboring samples pointed
+*    to by 'pu1_ref' to the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intra prediction mode (unused here)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
+                                         WORD32 src_strd,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 dst_strd,
+                                         WORD32 nt,
+                                         WORD32 mode)
+{
+
+    WORD32 row, col;
+    WORD32 two_nt;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    two_nt = 2 * nt;
+
+
+    UWORD8 *pu1_dst_tmp = pu1_dst;
+    UWORD32 pu1_val;
+    uint8x8_t pu1_val_two_nt_1_row;
+    if(nt == 32)
+    {
+        pu1_dst_tmp = pu1_dst;
+        for(row = 0; row < nt; row++)
+        {
+            pu1_val = pu1_ref[two_nt - 1 - row];
+            pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
+            for(col = nt; col > 0; col -= 8)
+            {
+                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
+                pu1_dst_tmp += 8;
+            }
+            pu1_dst_tmp += dst_strd - nt;
+        }
+    }
+    else
+
+
+    /* The row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables        */
+    /* Variables are named after the operation (instruction) they perform                       */
+    /* (e.g. shift_val holds the shifted value,                                                 */
+    /* add_sat holds the saturating-add result)                                                 */
+    /* Loops are unrolled by 4 or 8, since the input width is a multiple of 4 or 8              */
+    /* Rows and columns are unrolled by 4 when the width is a multiple of 4                     */
+    {
+        if(0 != (nt & 7))      /* cond for multiple of 4 */
+        {
+            UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
+            UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
+            UWORD8 *pu1_dst_4 = pu1_dst;
+            UWORD8 *pu1_dst_4_tmp = pu1_dst;
+
+            uint32x2_t pu1_ref_val1, pu1_ref_val2;
+            uint8x8_t dup_sub, round_val, dup_val;
+            uint16x8_t dup_add, sub_val;
+            int16x8_t shift_val, add_sat;
+
+            pu1_ref_val1 = vdup_n_u32(0);
+            pu1_ref_val2 = vdup_n_u32(0);
+
+            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
+
+            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
+
+            pu1_ref_4_two_nt_plus1 += (two_nt + 1);
+
+            pu1_ref_4_two_nt_minus_nt += (two_nt - nt);
+
+            for(row = nt; row > 0; row -= 4)
+            {
+                for(col = nt; col > 0; col -= 4)
+                {
+                    pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
+                    sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
+                    shift_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
+
+                    add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
+                    round_val = vqmovun_s16(add_sat);
+                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
+                    pu1_dst_4 += dst_strd;
+
+                    pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
+                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
+                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
+                    pu1_dst_4 += dst_strd;
+
+                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
+                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
+                    pu1_dst_4 += dst_strd;
+
+                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
+                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
+                    pu1_dst_4 += dst_strd;
+
+
+                }
+                /* worst cases */
+                pu1_ref_4_two_nt_minus_nt += 3;
+                pu1_ref_4_two_nt_plus1 += 4;
+                pu1_dst_4 = (pu1_dst_4_tmp + 4);
+            }
+
+        }
+
+        /* dup_1 - dup_8 hold values duplicated from the loaded source                                      */
+        /* Variables are named after the operation (instruction) they perform                               */
+        /* Loops are unrolled by 4 or 8, since the input width is a multiple of 4 or 8                      */
+        /* Rows and columns are unrolled by 8 when the width is a multiple of 8                             */
+
+        else
+        {
+            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
+            UWORD8 *pu1_ref_tmp_2 = pu1_ref;
+
+            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
+            UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
+            UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;
+
+            uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
+            uint16x8_t sub_res, dup_add;
+            int16x8_t shift_res, add_res;
+
+            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
+            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
+
+            pu1_ref_tmp_1 += (two_nt + 1);
+            pu1_ref_tmp_2 += (two_nt - 1);
+
+            for(col = nt; col > 0; col -= 8)
+            {
+                src_tmp = vld1_u8(pu1_ref_tmp_1);
+                pu1_ref_tmp_1 += 8;
+
+                sub_res = vsubl_u8(src_tmp, dup_sub);
+                shift_res  = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
+                add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
+                round_val = vqmovun_s16(add_res);
+                vst1_u8(pu1_dst_tmp_1, round_val);
+                pu1_dst_tmp_1 += 8;
+            }
+
+            for(row = nt; row > 0; row -= 8)
+            {
+                pu1_ref_tmp_2 -= 8;
+
+                src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
+                rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */
+
+                dup_1 = vdup_lane_u8(rev_res, 0);
+                dup_2 = vdup_lane_u8(rev_res, 1);
+                dup_3 = vdup_lane_u8(rev_res, 2);
+                dup_4 = vdup_lane_u8(rev_res, 3);
+                dup_5 = vdup_lane_u8(rev_res, 4);
+                dup_6 = vdup_lane_u8(rev_res, 5);
+                dup_7 = vdup_lane_u8(rev_res, 6);
+                dup_8 = vdup_lane_u8(rev_res, 7);
+
+                for(col = nt; col > 0; col -= 8)
+                {
+                    pu1_dst_tmp_2 = pu1_dst_tmp_3;
+
+                    vst1_u8(pu1_dst_tmp_2, dup_1);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, dup_2);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, dup_3);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, dup_4);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, dup_5);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, dup_6);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, dup_7);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
+                    if(row != 8)
+                        vst1_u8(pu1_dst_tmp_2, dup_8);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    pu1_dst_tmp_3 += 8;
+                }
+                pu1_dst_tmp_2 -= (nt - 8);
+                pu1_dst_tmp_3 = pu1_dst_tmp_2;
+            }
+        }
+    }
+}
+/* INTRA_PRED_LUMA_HORZ */
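+
+/* Illustrative scalar sketch of the horizontal prediction computed above: each
+ * row is filled with the corresponding left reference sample, and for nt < 32
+ * the first row is adjusted by half the top-edge gradient, with clipping
+ * (assumption: matches the NEON path above). Compiled out; for reference
+ * only. */
+#if 0
+static void horz_scalar_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                               WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col;
+    WORD32 two_nt = 2 * nt;
+
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row];
+
+    if(nt < 32)
+        for(col = 0; col < nt; col++)
+        {
+            WORD32 s2 = pu1_ref[two_nt - 1]
+                      + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1);
+            pu1_dst[col] = (UWORD8)(s2 < 0 ? 0 : (s2 > 255 ? 255 : s2));
+        }
+}
+#endif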
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma vertical mode
+*
+* @par Description:
+*    Vertical intra prediction from the reference neighboring samples pointed
+*    to by 'pu1_ref' to the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intra prediction mode (unused here)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
+                                        WORD32 src_strd,
+                                        UWORD8 *pu1_dst,
+                                        WORD32 dst_strd,
+                                        WORD32 nt,
+                                        WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 two_nt;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    two_nt = 2 * nt;
+
+    UWORD8 *pu1_dst_tmp = pu1_dst;
+    UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
+    uint8x8_t pu1_val_two_nt_1_col;
+    if(nt == 32)
+    {
+        pu1_dst_tmp = pu1_dst;
+        for(row = 0; row < nt; row++)
+        {
+            for(col = nt; col > 0; col -= 8)
+            {
+                pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
+                pu1_ref_tmp_1 += 8;
+                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
+                pu1_dst_tmp += 8;
+            }
+            pu1_ref_tmp_1 -= nt;
+            pu1_dst_tmp += dst_strd - nt;
+        }
+    }
+    else
+
+    {
+        /* Variables are named after the operation (instruction) they perform                               */
+        /* (e.g. shift_val which contains the shifted value,                                                */
+        /* add_sat which holds the saturating-add result)                                                   */
+        /* Loops are unrolled by 4 or 8, since the input width is a multiple of 4 or 8                      */
+        /* Rows and columns are unrolled by 4 when the width is a multiple of 4                             */
+
+        if(0 != (nt & 7))
+        {
+            WORD32 cond_4 = 0;
+            UWORD8 *pu1_ref_val1 = pu1_ref;
+            UWORD8 *pu1_ref_val2 = pu1_ref;
+            UWORD8 *pu1_ref_val3 = pu1_ref;
+
+            UWORD8 *pu1_dst_val1 = pu1_dst;
+            UWORD8 *pu1_dst_val2 = pu1_dst;
+            UWORD8 *pu1_dst_val3 = pu1_dst;
+
+            uint8x8_t dup_2_sub, round_val, vext_val;
+            uint16x8_t dup_2_add;
+            uint32x2_t src_val1, src_val2, src_val3;
+            uint16x8_t sub_val;
+            int16x8_t shift_val1, add_sat;
+            uint64x1_t shift_val2;
+
+            src_val1 = vdup_n_u32(0);
+            src_val2 = vdup_n_u32(0);
+            src_val3 = vdup_n_u32(0);
+            pu1_ref_val1 += (two_nt - nt);
+            pu1_ref_val3 += (two_nt + 2);
+            pu1_ref_val2 += (two_nt + 1);
+
+            dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
+            dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
+
+            /* loops to store the first nt sets of values in the destination */
+
+            for(row = nt; row > 0; row -= 4)
+            {
+                for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
+                {
+                    /* unrolling of: s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1) */
+                    src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
+                    sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
+                    shift_val1  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
+                    add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
+                    round_val = vqmovun_s16(add_sat);
+
+                    /* unrolling of: pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col] */
+                    src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
+                    vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
+                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
+                    pu1_dst_val1 += dst_strd;
+
+                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
+
+                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
+                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
+                    pu1_dst_val1 += dst_strd;
+
+                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
+
+                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
+                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
+                    pu1_dst_val1 += dst_strd;
+
+                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
+
+                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
+                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
+                    pu1_dst_val1 += dst_strd;
+
+                    pu1_ref_val1  -= 4;
+                }
+
+                /* loop to store next sets of eight values in the destination */
+
+                for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
+                {
+                    src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);
+
+                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
+                    pu1_dst_val2 += dst_strd;
+
+                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
+                    pu1_dst_val2 += dst_strd;
+
+                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
+                    pu1_dst_val2 += dst_strd;
+
+                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
+                    pu1_dst_val2 += dst_strd;
+                }
+                pu1_ref_val2 += 4;
+                pu1_dst_val3 += 4;
+                pu1_dst_val2 = pu1_dst_val3;
+                cond_4 = 1;
+            }
+        }
+
+        /* Rows and columns are unrolled by 8 when the width is a multiple of 8        */
+        else
+        {
+            WORD32 cond = 0, col_1;
+            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
+            UWORD8 *pu1_dst_tmp_2 = pu1_dst;
+            UWORD8 *pu1_dst_tmp_3 = pu1_dst;
+
+            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
+            UWORD8 *pu1_ref_tmp_2 = pu1_ref;
+            UWORD8 *pu1_ref_tmp_3 = pu1_ref;
+
+            uint8x8_t pu1_src_tmp1;
+            uint8x8_t pu1_src_tmp2;
+
+            uint8x8_t dup_sub;
+            uint16x8_t dup_add;
+            int16x8_t subsh_val;
+            int16x8_t addsat_val;
+            uint16x8_t sub_val;
+            uint8x8_t round_val;
+            uint8x8_t vext_t;
+            uint64x1_t shift_64;
+
+            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
+            dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
+
+            pu1_ref_tmp_1 += (two_nt);
+            pu1_ref_tmp_1 -= 8;
+            pu1_ref_tmp_2 += (two_nt + 2);
+            pu1_ref_tmp_3 += (two_nt + 1);
+
+            /* loops to store the first nt sets of values in the destination */
+
+            for(row = nt; row > 0; row -= 8)
+            {
+                for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
+                {
+                    pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);
+
+                    sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
+                    subsh_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
+                    addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
+                    round_val = vqmovun_s16(addsat_val);
+
+                    /* unrolling of: pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col] */
+
+                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
+                    vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
+                    vst1_u8(pu1_dst_tmp_1, vext_t);
+                    pu1_dst_tmp_1 += dst_strd;
+
+                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
+
+                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+                    vst1_u8(pu1_dst_tmp_1, vext_t);
+                    pu1_dst_tmp_1 += dst_strd;
+
+                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
+                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+                    vst1_u8(pu1_dst_tmp_1, vext_t);
+                    pu1_dst_tmp_1 += dst_strd;
+
+                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
+                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+                    vst1_u8(pu1_dst_tmp_1, vext_t);
+                    pu1_dst_tmp_1 += dst_strd;
+
+                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
+                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+                    vst1_u8(pu1_dst_tmp_1, vext_t);
+                    pu1_dst_tmp_1 += dst_strd;
+
+                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
+                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+                    vst1_u8(pu1_dst_tmp_1, vext_t);
+                    pu1_dst_tmp_1 += dst_strd;
+
+                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
+                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+                    vst1_u8(pu1_dst_tmp_1, vext_t);
+                    pu1_dst_tmp_1 += dst_strd;
+
+                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
+                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+                    vst1_u8(pu1_dst_tmp_1, vext_t);
+                    pu1_dst_tmp_1 += dst_strd;
+
+                    pu1_ref_tmp_1 -= 8;
+                }
+
+                /* loop to store next sets of eight values in the destination */
+
+                for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
+                {
+                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);
+
+                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+                    pu1_dst_tmp_2 += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+                    pu1_dst_tmp_2 += dst_strd;
+                }
+                pu1_ref_tmp_3 += 8;
+                pu1_dst_tmp_3 += 8;
+                pu1_dst_tmp_2 = pu1_dst_tmp_3;
+                cond = 1;
+            }
+        }
+    }
+}
+/* INTRA_PRED_LUMA_VER */
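+
+/* Illustrative scalar sketch of the vertical prediction computed above: each
+ * column is filled with the corresponding top reference sample, and for nt < 32
+ * the first column is adjusted by half the left-edge gradient, with clipping
+ * (assumption: matches the NEON path above). Compiled out; for reference
+ * only. */
+#if 0
+static void ver_scalar_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                              WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col;
+    WORD32 two_nt = 2 * nt;
+
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col];
+
+    if(nt < 32)
+        for(row = 0; row < nt; row++)
+        {
+            WORD32 s2 = pu1_ref[two_nt + 1]
+                      + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
+            pu1_dst[row * dst_strd] = (UWORD8)(s2 < 0 ? 0 : (s2 > 255 ? 255 : s2));
+        }
+}
+#endif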
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma mode2.
+*
+* @par Description:
+*    Intra prediction for mode 2 (sw angle) from the reference neighboring
+*    samples pointed to by 'pu1_ref' to the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intra prediction mode (unused here)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
+                                          WORD32 src_strd,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 dst_strd,
+                                          WORD32 nt,
+                                          WORD32 mode)
+{
+
+    WORD32 row, col;
+    WORD32 two_nt;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    /* rev_res holds the byte-reversed result, hence the name                                           */
+    /* Loops are unrolled by 4 or 8, since the input width is a multiple of 4 or 8                      */
+    /* Rows and columns are unrolled by 4 when the width is a multiple of 4                             */
+
+    if(0 != (nt & 7))
+    {
+        UWORD8 *pu1_ref_tmp = pu1_ref;
+        UWORD8 *pu1_dst_tmp = pu1_dst;
+        uint8x8_t pu1_src_val, rev_res;
+        uint64x1_t shift_res;
+
+        for(col = nt; col > 0; col -= 4)
+        {
+            for(row = nt; row > 0; row -= 4)
+            {
+                /* unrolling of all cols & rows of: pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1] */
+
+                pu1_src_val = vld1_u8(pu1_ref_tmp);
+                shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
+                rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));
+
+                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
+                pu1_dst_tmp += dst_strd;
+
+                shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
+                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
+                pu1_dst_tmp += dst_strd;
+
+                shift_res = vshr_n_u64(shift_res, 8);
+                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
+                pu1_dst_tmp += dst_strd;
+
+                shift_res = vshr_n_u64(shift_res, 8);
+                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
+                pu1_dst_tmp += dst_strd;
+            }
+        }
+    }
+
+    /* rev_val_second and rev_val_first reverse the loaded values so they come out in the right order     */
+    /* shift_64 shifts the reversed second set of values to extract the bytes that are needed             */
+    /* Rows and columns are unrolled by 8 when the width is a multiple of 8                               */
+
+    else
+    {
+        UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
+        UWORD8 *pu1_dst_tmp = pu1_dst;
+        UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
+
+        uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
+        uint64x1_t shift_val;
+
+        two_nt = 2 * nt;
+        pu1_ref_two_nt_minus2 += (two_nt);
+        pu1_ref_two_nt_minus2 -= 8;
+
+        for(col = nt; col > 0; col -= 8)
+        {
+            for(row = nt; row > 0; row -= 8)
+            {
+                pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
+                rev_val_first = vrev64_u8(pu1_src_val2);
+
+                pu1_ref_two_nt_minus2 -= 8;
+                pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
+                rev_val_second = vrev64_u8(pu1_src_val1);
+
+                vext_t = vext_u8(rev_val_first, rev_val_second, 1);
+                vst1_u8(pu1_dst_tmp, vext_t);
+                pu1_dst_tmp += dst_strd;
+
+                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
+                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+                vst1_u8(pu1_dst_tmp, vext_t);
+                pu1_dst_tmp += dst_strd;
+
+                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
+                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+                vst1_u8(pu1_dst_tmp, vext_t);
+                pu1_dst_tmp += dst_strd;
+
+                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
+                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+                vst1_u8(pu1_dst_tmp, vext_t);
+                pu1_dst_tmp += dst_strd;
+
+                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
+                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+                vst1_u8(pu1_dst_tmp, vext_t);
+                pu1_dst_tmp += dst_strd;
+
+                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
+                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+                vst1_u8(pu1_dst_tmp, vext_t);
+                pu1_dst_tmp += dst_strd;
+
+                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
+                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+                vst1_u8(pu1_dst_tmp, vext_t);
+                pu1_dst_tmp += dst_strd;
+
+                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
+                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+                vst1_u8(pu1_dst_tmp, vext_t);
+                pu1_dst_tmp += dst_strd;
+            }
+            pu1_dst_tmp_plus8 += 8;
+            pu1_dst_tmp = pu1_dst_tmp_plus8;
+            pu1_ref_two_nt_minus2 += (nt - 8);
+        }
+    }
+}
+/* INTRA_PRED_LUMA_MODE2 */
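+
+/* Illustrative scalar sketch of mode 2 as computed above (derived from the
+ * unrolling comments in the code: for mode 2 the angle is 32, so fract is
+ * always zero and idx reduces to row + 1). Compiled out; for reference
+ * only. */
+#if 0
+static void mode2_scalar_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col;
+    WORD32 two_nt = 2 * nt;
+
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 2 - row - col];
+}
+#endif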
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Intra prediction interpolation filter for luma mode 18 & mode 34.
+*
+* @par Description:
+*    Intra prediction for mode 18 (nw angle) and mode 34 (ne angle) from the
+*    reference neighboring samples pointed to by 'pu1_ref' to the TU block
+*    pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intra prediction mode (18 or 34)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 dst_strd,
+                                               WORD32 nt,
+                                               WORD32 mode)
+{
+
+    WORD32 row, col, idx;
+    WORD32 intraPredAngle = 32;
+    WORD32 two_nt;
+    UNUSED(src_strd);
+    two_nt = 2 * nt;
+
+    UWORD8 *pu1_ref_tmp = pu1_ref;
+    UWORD8 *pu1_ref_tmp1 = pu1_ref;
+    UWORD8 *pu1_dst_tmp = pu1_dst;
+    UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
+
+    uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;
+
+    /* src_tmp_1st and src_tmp_2nd load the first eight and the next eight values from the source (pu1_ref) */
+    /* vext1 - vext7 hold the vext results of the two loaded vectors, interleaved to help dual issue        */
+    /* Loops are unrolled by 4 or 8, since the input width is a multiple of 4 or 8                          */
+    /* Rows and columns are unrolled by 8 when the width is a multiple of 8                                 */
+    /* Separate loops are maintained for mode 18 and mode 34                                                */
+
+    /* cond to allow multiples of 8 */
+    if(0 == (nt & 7))
+    {
+        if(mode == 34)
+        {
+            pu1_ref_tmp += (two_nt + 2);
+
+            for(row = nt; row > 0; row -= 8)
+            {
+                for(col = nt; col > 0; col -= 8)
+                {
+                    /* Loading 1st eight values */
+                    src_tmp_1st = vld1_u8(pu1_ref_tmp);
+                    pu1_ref_tmp += 8;
+
+                    /* Loading next eight values */
+                    src_tmp_2nd = vld1_u8(pu1_ref_tmp);
+
+                    /* unrolling of: pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
+                    vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
+                    vst1_u8(pu1_dst_tmp, src_tmp_1st);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
+                    vst1_u8(pu1_dst_tmp, vext1);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
+                    vst1_u8(pu1_dst_tmp, vext2);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
+                    vst1_u8(pu1_dst_tmp, vext3);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
+                    vst1_u8(pu1_dst_tmp, vext4);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
+                    vst1_u8(pu1_dst_tmp, vext5);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
+                    vst1_u8(pu1_dst_tmp, vext6);
+                    pu1_dst_tmp += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp, vext7);
+                    pu1_dst_tmp += dst_strd;
+                }
+
+                pu1_dst_tmp_plus8 += 8;
+                pu1_dst_tmp = pu1_dst_tmp_plus8;
+                pu1_ref_tmp -= (nt - 8);
+            }
+        }
+        else /* Loop for mode 18 */
+        {
+            pu1_ref_tmp += (two_nt);
+
+            for(row = nt; row > 0; row -= 8)
+            {
+                for(col = nt; col > 0; col -= 8)
+                {
+                    /* Loading 1st eight values */
+                    src_tmp_1st = vld1_u8(pu1_ref_tmp);
+                    pu1_ref_tmp -= 8;
+
+                    /* Loading next eight values */
+                    src_tmp_2nd = vld1_u8(pu1_ref_tmp);
+
+                    /* unrolling of: pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
+                    vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
+                    vst1_u8(pu1_dst_tmp, src_tmp_1st);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
+                    vst1_u8(pu1_dst_tmp, vext1);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
+                    vst1_u8(pu1_dst_tmp, vext2);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
+                    vst1_u8(pu1_dst_tmp, vext3);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
+                    vst1_u8(pu1_dst_tmp, vext4);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
+                    vst1_u8(pu1_dst_tmp, vext5);
+                    pu1_dst_tmp += dst_strd;
+
+                    vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
+                    vst1_u8(pu1_dst_tmp, vext6);
+                    pu1_dst_tmp += dst_strd;
+
+                    vst1_u8(pu1_dst_tmp, vext7);
+                    pu1_dst_tmp += dst_strd;
+                }
+                pu1_dst_tmp_plus8 += 8;
+                pu1_dst_tmp = pu1_dst_tmp_plus8;
+                pu1_ref_tmp += (nt + 8);
+            }
+        }
+    }
+
+    /* Rows and columns are unrolled by 4 when the width is a multiple of 4 */
+
+    else /* loop for multiples of 4 */
+    {
+        uint8x8_t src_val1;
+        uint8x8_t src_val2;
+
+        if(mode == 18)
+            intraPredAngle = -32;
+        else if(mode == 34)
+            intraPredAngle = 32;
+
+        for(row = 0; row < nt; row += 2)
+        {
+            /* unrolling 2 rows */
+            idx = ((row + 1) * intraPredAngle) >> 5;
+            pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
+            src_val1 = vld1_u8(pu1_ref_tmp);
+
+            idx = ((row + 2) * intraPredAngle) >> 5;
+            pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
+            src_val2 = vld1_u8(pu1_ref_tmp1);
+
+            /* unrolling 4 col */
+            for(col = nt; col > 0; col -= 4)
+            {
+                pu1_dst_tmp = pu1_dst;
+                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
+                pu1_dst_tmp += dst_strd;
+                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
+                pu1_dst += 4;
+            }
+            pu1_dst += 2 * dst_strd - nt;
+        }
+    }
+}
+/* INTRA_PRED_LUMA_MODE_18_34 */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *    Intra prediction interpolation filter for luma mode 3 to mode 9
+ *
+ * @par Description:
+ *    Intraprediction for modes 3 to 9 (positive angle, horizontal modes) with
+ *    reference neighboring samples, located at 'pu1_ref', writing to the TU
+ *    block pointed to by 'pu1_dst'
+ *
+ * @param[in] pu1_ref
+ *  UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ *  UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ *  integer source stride
+ *
+ * @param[in] dst_strd
+ *  integer destination stride
+ *
+ * @param[in] nt
+ *  integer Transform Block size
+ *
+ * @param[in] mode
+ *  integer intraprediction mode
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
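+/* Illustrative reference sketch (comment only, not part of the build): the
+ * scalar filter the NEON loops below vectorize. With intra_pred_ang taken
+ * from gai4_ihevc_ang_table, every destination sample is a two-tap blend of
+ * adjacent reference samples weighted by a 1/32nd-pel fraction:
+ *
+ *     pos   = (col + 1) * intra_pred_ang;
+ *     fract = pos & 31;
+ *     out   = (ref_a * (32 - fract) + ref_b * fract + 16) >> 5;
+ *
+ * where ref_a and ref_b are the two reference samples selected via pos >> 5;
+ * vrshrn_n_u16(add_res, 5) supplies the "+ 16) >> 5" rounding in SIMD.
+ */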
+void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
+                                                WORD32 src_strd,
+                                                UWORD8 *pu1_dst,
+                                                WORD32 dst_strd,
+                                                WORD32 nt,
+                                                WORD32 mode)
+{
+
+    WORD32 row, col;
+    WORD32 intra_pred_ang;
+    WORD32 pos, fract = 100, fract_prev;
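+    /* fract is seeded outside the 0..31 range so the fract_prev comparison
+       further down can never advance the reference pointers on col 0 */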
+    UNUSED(src_strd);
+    if(0 == (nt & 7))
+    {
+
+        UWORD8 *pu1_ref_main_idx = pu1_ref;
+        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
+
+        UWORD8 *pu1_dst_tmp1 = pu1_dst;
+        UWORD8 *pu1_dst_tmp2 = pu1_dst;
+
+        WORD32 two_nt = 2 * nt;
+
+        pu1_ref_main_idx += two_nt;
+        pu1_ref_main_idx_1 += two_nt - 1;
+
+        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
+        uint8x8_t shift_res;
+        uint16x8_t mul_res1, mul_res2, add_res;
+
+        /* Intra Pred Angle according to the mode */
+        intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+        pu1_ref_main_idx -= 8;
+        pu1_ref_main_idx_1 -= 8;
+
+        for(col = 0; col < nt; col++)
+        {
+            fract_prev = fract;
+
+            pos = ((col + 1) * intra_pred_ang);
+            fract = pos & (31);
+
+            if(fract_prev < fract)
+            {
+                pu1_ref_main_idx += 1;
+                pu1_ref_main_idx_1 += 1;
+            }
+
+            dup_const_fract = vdup_n_u8((uint8_t)fract);
+            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+            for(row = nt; row > 0; row -= 8)
+            {
+                ref_main_idx = vld1_u8(pu1_ref_main_idx);
+                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
+
+                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
+                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
+
+                add_res = vaddq_u16(mul_res1, mul_res2);
+
+                shift_res = vrshrn_n_u16(add_res, 5);
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
+                pu1_dst_tmp1 += dst_strd;
+
+                pu1_ref_main_idx -= 8;
+                pu1_ref_main_idx_1 -= 8;
+
+            }
+            pu1_dst_tmp2 += 1;
+            pu1_dst_tmp1 = pu1_dst_tmp2;
+
+            pu1_ref_main_idx += nt;
+            pu1_ref_main_idx_1 += nt;
+
+            pu1_ref_main_idx -= 1;
+            pu1_ref_main_idx_1 -= 1;
+
+        }
+    }
+    else
+    {
+        UWORD8 *pu1_ref_tmp1 = pu1_ref;
+        UWORD8 *pu1_ref_tmp2 = pu1_ref;
+        UWORD8 *pu1_dst_tmp1 = pu1_dst;
+        UWORD8 *pu1_dst_tmp2 = pu1_dst;
+
+        pu1_ref_tmp1 += nt;
+        pu1_ref_tmp2 += (nt - 1);
+
+        uint8x8_t dup_fract, dup_32_fract, shift_res;
+        uint16x8_t mul_res1, mul_res2, add_res;
+        uint32x2_t  pu1_ref_val1, pu1_ref_val2;
+
+        pu1_ref_val1 = vdup_n_u32(0);
+        pu1_ref_val2 = vdup_n_u32(0);
+
+        /* Intra Pred Angle according to the mode */
+        intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+
+        for(col = 0; col < nt; col++)
+        {
+            fract_prev = fract;
+            pos = ((col + 1) * intra_pred_ang);
+            fract = pos & (31);
+            if(fract_prev < fract)
+            {
+                pu1_ref_tmp1 += 1;
+                pu1_ref_tmp2 += 1;
+            }
+            dup_fract = vdup_n_u8((uint8_t)fract);
+            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+            for(row = nt; row > 0; row -= 4)
+            {
+                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
+                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
+
+                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
+                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
+
+                add_res = vaddq_u16(mul_res1, mul_res2);
+
+                shift_res = vrshrn_n_u16(add_res, 5);
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
+
+            }
+            pu1_ref_tmp1 -= 1;
+            pu1_ref_tmp2 -= 1;
+
+            pu1_dst_tmp2 += 1;
+            pu1_dst_tmp1 = pu1_dst_tmp2;
+
+        }
+
+
+    }
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *   Intra prediction interpolation filter for luma mode 11 to mode 17
+ *
+ * @par Description:
+ *    Intraprediction for modes 11 to 17 (negative angle, horizontal modes)
+ *    with reference neighboring samples, located at 'pu1_ref', writing to
+ *    the TU block pointed to by 'pu1_dst'
+ *
+ * @param[in] pu1_ref
+ *  UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ *  UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ *  integer source stride
+ *
+ * @param[in] dst_strd
+ *  integer destination stride
+ *
+ * @param[in] nt
+ *  integer Transform Block size
+ *
+ * @param[in] mode
+ *  integer intraprediction mode
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
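+/* Illustrative sketch (comment only): for these negative-angle horizontal
+ * modes the left reference is first copied, reversed, into ref_temp, and is
+ * then extended below index 0 by projecting top samples through the inverse
+ * angle, exactly as the loop further down does:
+ *
+ *     for(k = -1; k > ref_idx; k--)
+ *         ref_main[k] = pu1_ref[two_nt + ((inv_ang_sum += inv_ang) >> 8)];
+ *
+ * after which the usual (32 - fract)/fract two-tap filter produces columns.
+ */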
+void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
+                                                  WORD32 src_strd,
+                                                  UWORD8 *pu1_dst,
+                                                  WORD32 dst_strd,
+                                                  WORD32 nt,
+                                                  WORD32 mode)
+{
+
+    WORD32 row, col, k;
+    WORD32 two_nt;
+    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+    WORD32 pos, fract = 1000, fract_prev;
+    WORD32  ref_idx;
+
+    UWORD8 *ref_main;
+    UWORD8 *ref_main_tmp;
+
+    UWORD8 *pu1_ref_tmp1 = pu1_ref;
+    UWORD8 *pu1_ref_tmp2 = pu1_ref;
+    UWORD8 *pu1_dst_tmp1 = pu1_dst;
+    UWORD8 *pu1_dst_tmp2 = pu1_dst;
+
+    UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
+
+    uint16x8_t mul_res1, mul_res2, add_res;
+    uint8x8_t dup_const_fract, dup_const_32_fract;
+    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
+    uint8x8_t ref_left_t;
+    uint32x2_t  ref_left_tmp;
+    UNUSED(src_strd);
+    ref_left_tmp = vdup_n_u32(0);
+
+    inv_ang_sum = 128;
+    two_nt = 2 * nt;
+
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+
+    pu1_ref_tmp1 += two_nt;
+
+    ref_main = ref_temp + (nt - 1);
+    ref_main_tmp = ref_main;
+
+    if(0 == (nt & 7))
+    {
+        pu1_ref_tmp2 += (two_nt - 7);
+
+        for(k = nt - 1; k >= 0; k -= 8)
+        {
+
+            ref_left_t = vld1_u8(pu1_ref_tmp2);
+
+            ref_left_t = vrev64_u8(ref_left_t);
+            vst1_u8(ref_main_tmp, ref_left_t);
+            ref_main_tmp += 8;
+            pu1_ref_tmp2 -= 8;
+
+        }
+
+    }
+    else
+    {
+        uint8x8_t rev_val;
+        pu1_ref_tmp2 += (two_nt - (nt - 1));
+
+        for(k = nt - 1; k >= 0; k -= 8)
+        {
+
+            ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);
+
+            rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
+            vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
+
+        }
+
+    }
+
+    ref_main[nt] = pu1_ref[two_nt - nt];
+
+    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+
+    ref_idx = (nt * intra_pred_ang) >> 5;
+
+    /* SIMD Optimization can be done using look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+    for(k = -1; k > ref_idx; k--)
+    {
+        inv_ang_sum += inv_ang;
+        ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
+    }
+
+    UWORD8 *ref_main_tmp1 = ref_main;
+    UWORD8 *ref_main_tmp2 = ref_main;
+
+    ref_main_tmp2 += 1;
+
+    if(0 == (nt & 7))
+    {
+        /* For angles other than 45 degrees, interpolate between the 2 neighboring */
+        /* samples, weighted by distance, to obtain the destination sample */
+        for(col = 0; col < nt; col++)
+        {
+
+            fract_prev = fract;
+            pos = ((col + 1) * intra_pred_ang);
+            fract = pos & (31);
+
+            if(fract_prev < fract)
+            {
+                ref_main_tmp1 -= 1;
+                ref_main_tmp2 -= 1;
+            }
+
+            dup_const_fract = vdup_n_u8((uint8_t)fract);
+            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+            // Do linear filtering
+            for(row = nt; row > 0; row -= 8)
+            {
+                ref_main_idx = vld1_u8(ref_main_tmp1);
+
+                ref_main_idx_1 = vld1_u8(ref_main_tmp2);
+
+                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
+                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
+
+                add_res = vaddq_u16(mul_res1, mul_res2);
+
+                shift_res = vrshrn_n_u16(add_res, 5);
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
+                pu1_dst_tmp1 += dst_strd;
+
+                ref_main_tmp1 += 8;
+                ref_main_tmp2 += 8;
+            }
+
+            ref_main_tmp1 -= nt;
+            ref_main_tmp2 -= nt;
+
+            pu1_dst_tmp2 += 1;
+            pu1_dst_tmp1 = pu1_dst_tmp2;
+        }
+    }
+    else
+    {
+        uint32x2_t ref_main_idx1, ref_main_idx2;
+
+        ref_main_idx1 = vdup_n_u32(0);
+        ref_main_idx2 = vdup_n_u32(0);
+
+        for(col = 0; col < nt; col++)
+        {
+            fract_prev = fract;
+            pos = ((col + 1) * intra_pred_ang);
+            fract = pos & (31);
+
+            if(fract_prev < fract)
+            {
+                ref_main_tmp1 -= 1;
+                ref_main_tmp2 -= 1;
+            }
+
+            dup_const_fract = vdup_n_u8((uint8_t)fract);
+            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+            for(row = nt; row > 0; row -= 4)
+            {
+
+                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
+                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
+
+                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
+                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
+
+                add_res = vaddq_u16(mul_res1, mul_res2);
+
+                shift_res = vrshrn_n_u16(add_res, 5);
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
+                pu1_dst_tmp1 += dst_strd;
+
+                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
+                pu1_dst_tmp1 += dst_strd;
+
+            }
+
+            pu1_dst_tmp2 += 1;
+            pu1_dst_tmp1 = pu1_dst_tmp2;
+
+        }
+
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *   Intra prediction interpolation filter for luma mode 19 to mode 25
+ *
+ * @par Description:
+ *    Intraprediction for modes 19 to 25 (negative angle, vertical modes) with
+ *    reference neighboring samples, located at 'pu1_ref', writing to the TU
+ *    block pointed to by 'pu1_dst'
+ *
+ * @param[in] pu1_ref
+ *  UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ *  UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ *  integer source stride
+ *
+ * @param[in] dst_strd
+ *  integer destination stride
+ *
+ * @param[in] nt
+ *  integer Transform Block size
+ *
+ * @param[in] mode
+ *  integer intraprediction mode
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
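+/* Illustrative sketch (comment only): modes 19 to 25 are the vertical mirror
+ * of modes 11 to 17; the top reference row becomes ref_main and is extended
+ * to the left by projecting the left column through the inverse angle:
+ *
+ *     ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];    // k = -1, -2, ..
+ *
+ * rows are then produced a full row at a time with the same two-tap filter.
+ */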
+void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
+                                                  WORD32 src_strd,
+                                                  UWORD8 *pu1_dst,
+                                                  WORD32 dst_strd,
+                                                  WORD32 nt,
+                                                  WORD32 mode)
+{
+
+    WORD32 row, col, k;
+    WORD32 two_nt, intra_pred_ang;
+    WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;
+    WORD32 ref_idx;
+    UWORD8 *ref_main;
+    UWORD8 *ref_main_tmp;
+    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];
+
+    UWORD8 *pu1_ref_tmp1 = pu1_ref;
+    UWORD8 *pu1_ref_tmp2 = pu1_ref;
+    UWORD8 *pu1_dst_tmp1 = pu1_dst;
+
+    uint16x8_t mul_res1, mul_res2, add_res;
+    uint8x8_t dup_const_fract, dup_const_32_fract;
+    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
+    uint8x8_t ref_above_t;
+    uint32x2_t ref_above_tmp;
+    UNUSED(src_strd);
+    ref_above_tmp = vdup_n_u32(0);
+
+    two_nt = 2 * nt;
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+    inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
+
+    /* Intermediate reference samples for negative angle modes */
+    /* This has to be removed during optimization */
+    pu1_ref_tmp1 += two_nt;
+
+
+    ref_main = ref_temp + (nt - 1);
+    ref_main_tmp = ref_main;
+
+    if(0 == (nt & 7))
+    {
+        pu1_ref_tmp2 += (two_nt - 7);
+        for(k = nt - 1; k >= 0; k -= 8)
+        {
+
+            ref_above_t = vld1_u8(pu1_ref_tmp1);
+            vst1_u8(ref_main_tmp, ref_above_t);
+            ref_main_tmp += 8;
+            pu1_ref_tmp1 += 8;
+
+        }
+
+    }
+    else
+    {
+        pu1_ref_tmp2 += (two_nt - (nt - 1));
+
+        for(k = nt - 1; k >= 0; k -= 4)
+        {
+
+            ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
+            vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
+
+        }
+
+    }
+
+    ref_main[nt] = pu1_ref[two_nt + nt];
+
+    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+
+    ref_idx = (nt * intra_pred_ang) >> 5;
+    inv_ang_sum = 128;
+
+    /* SIMD Optimization can be done using look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+    for(k = -1; k > ref_idx; k--)
+    {
+        inv_ang_sum += inv_ang;
+        ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
+    }
+
+    UWORD8 *ref_main_tmp1 = ref_main;
+    UWORD8 *ref_main_tmp2 = ref_main;
+
+    ref_main_tmp2 += 1;
+
+    if(0 == (nt & 7))
+    {
+        /* For angles other than 45 degrees, interpolate between the 2 neighboring */
+        /* samples, weighted by distance, to obtain the destination sample */
+        for(row = 0; row < nt; row++)
+        {
+
+            fract_prev = fract;
+            pos = ((row + 1) * intra_pred_ang);
+            fract = pos & (31);
+
+            if(fract_prev < fract)
+            {
+                ref_main_tmp1 -= 1;
+                ref_main_tmp2 -= 1;
+            }
+
+            dup_const_fract = vdup_n_u8((uint8_t)fract);
+            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+            // Do linear filtering
+            for(col = nt; col > 0; col -= 8)
+            {
+                ref_main_idx = vld1_u8(ref_main_tmp1);
+
+                ref_main_idx_1 = vld1_u8(ref_main_tmp2);
+
+                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
+                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
+
+                add_res = vaddq_u16(mul_res1, mul_res2);
+
+                shift_res = vrshrn_n_u16(add_res, 5);
+
+                vst1_u8(pu1_dst_tmp1, shift_res);
+                pu1_dst_tmp1 += 8;
+
+                ref_main_tmp1 += 8;
+                ref_main_tmp2 += 8;
+            }
+
+            ref_main_tmp1 -= nt;
+            ref_main_tmp2 -= nt;
+
+            pu1_dst_tmp1 += (dst_strd - nt);
+        }
+    }
+    else
+    {
+        uint32x2_t ref_main_idx1, ref_main_idx2;
+
+        ref_main_idx1 = vdup_n_u32(0);
+        ref_main_idx2 = vdup_n_u32(0);
+
+        for(row = 0; row < nt; row++)
+        {
+            fract_prev = fract;
+            pos = ((row + 1) * intra_pred_ang);
+            fract = pos & (31);
+
+            if(fract_prev < fract)
+            {
+                ref_main_tmp1 -= 1;
+                ref_main_tmp2 -= 1;
+            }
+
+            dup_const_fract = vdup_n_u8((uint8_t)fract);
+            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+            for(col = nt; col > 0; col -= 4)
+            {
+
+                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
+                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
+
+                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
+                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
+
+                add_res = vaddq_u16(mul_res1, mul_res2);
+
+                shift_res = vrshrn_n_u16(add_res, 5);
+
+                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
+                pu1_dst_tmp1 += 4;
+
+            }
+            pu1_dst_tmp1 += (dst_strd - nt);
+        }
+
+    }
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *    Intra prediction interpolation filter for luma mode 27 to mode 33
+ *
+ * @par Description:
+ *    Intraprediction for modes 27 to 33 (positive angle, vertical modes) with
+ *    reference neighboring samples, located at 'pu1_ref', writing to the TU
+ *    block pointed to by 'pu1_dst'
+ *
+ * @param[in] pu1_ref
+ *  UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ *  UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ *  integer source stride
+ *
+ * @param[in] dst_strd
+ *  integer destination stride
+ *
+ * @param[in] nt
+ *  integer Transform Block size
+ *
+ * @param[in] mode
+ *  integer intraprediction mode
+ *
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
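+/* Illustrative sketch (comment only): positive vertical angles need no
+ * reference extension; samples come straight from pu1_ref[two_nt + 1 + idx].
+ * Because all angles here are below 32, the integer index idx grows exactly
+ * when the 5-bit fraction wraps, which is what the fract_prev > fract test
+ * below detects before nudging the reference pointers forward by one.
+ */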
+void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
+                                                  WORD32 src_strd,
+                                                  UWORD8 *pu1_dst,
+                                                  WORD32 dst_strd,
+                                                  WORD32 nt,
+                                                  WORD32 mode)
+{
+
+    WORD32 row, col;
+    WORD32 intra_pred_ang;
+    WORD32 pos, fract = 0, fract_prev;
+
+    WORD32 two_nt = 2 * nt;
+    UNUSED(src_strd);
+    if(0 == (nt & 7))
+    {
+
+        UWORD8 *pu1_ref_main_idx = pu1_ref;
+        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
+
+        UWORD8 *pu1_dst_tmp1 = pu1_dst;
+        pu1_ref_main_idx += (two_nt + 1);
+        pu1_ref_main_idx_1 += (two_nt + 2);
+
+        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
+        uint8x8_t shift_res;
+        uint16x8_t mul_res1, mul_res2, add_res;
+
+        /* Intra Pred Angle according to the mode */
+        intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+        for(row = 0; row < nt; row++)
+        {
+            fract_prev = fract;
+
+            pos = ((row + 1) * intra_pred_ang);
+            fract = pos & (31);
+
+            if(fract_prev > fract)
+            {
+                pu1_ref_main_idx += 1;
+                pu1_ref_main_idx_1 += 1;
+            }
+
+            dup_const_fract = vdup_n_u8((uint8_t)fract);
+            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+            for(col = nt; col > 0; col -= 8)
+            {
+                ref_main_idx = vld1_u8(pu1_ref_main_idx);
+                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
+
+                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
+                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
+
+                add_res = vaddq_u16(mul_res1, mul_res2);
+
+                shift_res = vrshrn_n_u16(add_res, 5);
+
+                vst1_u8(pu1_dst_tmp1, shift_res);
+                pu1_dst_tmp1 += 8;
+
+                pu1_ref_main_idx += 8;
+                pu1_ref_main_idx_1 += 8;
+            }
+
+            pu1_ref_main_idx -= nt;
+            pu1_ref_main_idx_1 -= nt;
+
+            pu1_dst_tmp1 += (dst_strd - nt);
+        }
+
+    }
+    else
+    {
+        UWORD8 *pu1_ref_tmp1 = pu1_ref;
+        UWORD8 *pu1_ref_tmp2 = pu1_ref;
+        UWORD8 *pu1_dst_tmp1 = pu1_dst;
+
+        pu1_ref_tmp1 += (two_nt + 1);
+        pu1_ref_tmp2 += (two_nt + 2);
+
+        uint8x8_t dup_fract, dup_32_fract, shift_res;
+        uint16x8_t mul_res1, mul_res2, add_res;
+        uint32x2_t  pu1_ref_val1, pu1_ref_val2;
+
+        pu1_ref_val1 = vdup_n_u32(0);
+        pu1_ref_val2 = vdup_n_u32(0);
+
+        /* Intra Pred Angle according to the mode */
+        intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+        for(row = 0; row < nt; row++)
+        {
+            fract_prev = fract;
+            pos = ((row + 1) * intra_pred_ang);
+            fract = pos & (31);
+            if(fract_prev > fract)
+            {
+                pu1_ref_tmp1 += 1;
+                pu1_ref_tmp2 += 1;
+            }
+            dup_fract = vdup_n_u8((uint8_t)fract);
+            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+            for(col = nt; col > 0; col -= 4)
+            {
+                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
+                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
+
+                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
+                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
+
+                add_res = vaddq_u16(mul_res1, mul_res2);
+
+                shift_res = vrshrn_n_u16(add_res, 5);
+
+                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
+                pu1_dst_tmp1 += 4;
+
+            }
+
+            pu1_dst_tmp1 += (dst_strd - nt);
+
+        }
+
+
+    }
+
+}
diff --git a/common/arm/ihevc_intra_pred_luma_dc.s b/common/arm/ihevc_intra_pred_luma_dc.s
new file mode 100644
index 0000000..f380d94
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_dc.s
@@ -0,0 +1,508 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_luma_dc.s
+@*
+@* @brief
+@*  contains function definitions for intra prediction dc filtering.
+@* functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    luma intraprediction filter for dc input
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
+@                              word32 src_strd,
+@                              uword8 *pu1_dst,
+@                              word32 dst_strd,
+@                              word32 nt,
+@                              word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+@   pi1_coeff
+
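+@**************dc prediction sketch*******************************************
+@ illustrative only (c-style pseudocode in comments, not assembled): the
+@ arithmetic implemented below, with two_nt = 2 * nt and log2nt = log2(nt):
+@
+@   dc_val = (sum(ref[nt..2nt-1]) + sum(ref[2nt+1..3nt]) + nt) >> (log2nt + 1)
+@   dst[0]               = (ref[2nt-1] + 2*dc_val + ref[2nt+1] + 2) >> 2
+@   row 0,  x = 1..nt-1: dst[x]          = (ref[2nt+1+x] + 3*dc_val + 2) >> 2
+@   col 0,  y = 1..nt-1: dst[y*dst_strd] = (ref[2nt-1-y] + 3*dc_val + 2) >> 2
+@   all other samples are dc_val; for nt == 32 no filtering is applied
+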
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_dc_a9q
+
+.type ihevc_intra_pred_luma_dc_a9q, %function
+
+ihevc_intra_pred_luma_dc_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+
+@********** testing
+    @mov        r6, #128
+    @b      prologue_cpy_32
+@********** testing
+
+    mov         r11, #2                     @mov #2 to r11 (to be used to add to 2dc_val & 3dc_val)
+    mov         r9, #0
+    vmov        d17, r11, r9
+
+    clz         r5, r4
+
+    add         r6, r0, r4                  @&src[nt]
+    rsb         r5, r5, #32                 @log2nt
+    add         r7, r0, r4, lsl #1          @&src[2nt]
+
+    add         r8, r7, #1                  @&src[2nt+1]
+    mvn         r5, r5
+    add         r5, r5, #1
+    vdup.32     d8, r5
+
+    ldrb        r14, [r8]
+    vshl.i64    d8, d8, #32
+
+    sub         r9, r7, #1                  @&src[2nt-1]
+    vshr.s64    d8, d8, #32
+
+    mov         r7, r8                      @r7 also stores 2nt+1
+
+    ldrb        r12, [r9]
+    add         r14, r14, r12               @src[2nt+1] + src[2nt-1]
+    add         r14, r14, r11               @src[2nt+1] + src[2nt-1] + 2
+
+    cmp         r4, #4
+    beq         dc_4
+
+    mov         r10, r4                     @nt
+
+add_loop:
+    vld1.s8     d0, [r6]!                   @load from src[nt]
+    mov         r5, #0                      @
+    vld1.s8     d1, [r8]!                   @load from src[2nt+1]
+
+    vpaddl.u8   d2, d0
+
+    vmov        d6, r4, r5                  @store nt to accumulate
+    vpaddl.u8   d3, d1
+
+    vld1.s8     d0, [r6]!                   @load from src[nt] (extra load for 8)
+
+    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (extra load for 8)
+    vadd.u16    d4, d2, d3
+
+
+    vpaddl.u16  d5, d4
+
+
+    vpadal.u32  d6, d5                      @accumulate all inp into d6 (end for nt==8)
+
+    subs        r10, #8
+    beq         epil_add_loop
+
+core_loop_add:
+    vpaddl.u8   d2, d0
+    subs        r10, #8
+    vpaddl.u8   d3, d1
+
+
+
+    vadd.u16    d4, d2, d3
+    vld1.s8     d0, [r6]!                   @load from src[nt] (extra load for 16)
+
+    vpaddl.u16  d5, d4
+    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (extra load for 16)
+
+    vpadal.u32  d6, d5                      @accumulate all inp into d6
+    bne         core_loop_add
+
+epil_add_loop:
+
+    vshl.s64    d9, d6, d8                  @(dc_val) shr by log2nt+1
+    cmp         r4, #32
+
+    vmov        d28, r14, r5                @src[2nt+1]+2+src[2nt-1] moved to d28
+    moveq       r6, #128
+
+    vdup.8      d16, d9[0]                  @dc_val
+    vshl.s64    d13, d9, #1                 @2*dc
+
+    beq         prologue_cpy_32
+
+    vadd.i64    d14, d13, d28               @src[2nt+1]+2+src[2nt-1]+2dc_val
+    movne       r6, #0                      @nt
+
+    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
+    movne       r10, r4
+
+    vadd.i64    d11, d13, d9                @3*dc
+    sub         r12, r3, r3, lsl #3         @-7*strd
+
+    vadd.i64    d11, d11, d17               @3*dc + 2
+    add         r12, r12, #8                @offset after one 8x8 block (-7*strd + 8)
+
+    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
+    sub         r0, r3, r4                  @strd - nt
+
+prologue_col:
+    @0th column and 0-7 rows done here
+    @r8 and r9 (2nt+1+col 2nt-1-row)
+
+    mov         r8, r7                      @&src[2nt+1]
+
+    add         r0, r0, #8                  @strd - nt + 8
+    vld1.s8     d0, [r8]!                   @col 1::7 load (prol)
+    sub         r9, r9, #7                  @&src[2nt-1-row]
+
+    vld1.s8     d1, [r9]                    @row 7::1 (0 also) load (prol)
+    sub         r9, r9, #8
+
+    vmovl.u8    q10, d0
+
+    vld1.s8     d6, [r8]                    @col 8::15 load (prol extra)
+    vadd.i16    q10, q10, q12               @col 1::7 add 3dc+2 (prol)
+
+    vmovl.u8    q11, d1
+    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)
+
+    vmovl.u8    q13, d6
+    vadd.i16    q11, q11, q12               @row 1::7 add 3dc+2 (prol)
+
+    vmov.i64    d19, #0x00000000000000ff    @
+    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)
+
+    vbsl        d19, d15, d2                @first row with dst[0]
+    vadd.i16    q13, q13, q12               @col 8::15 add 3dc+2 (prol extra)
+
+    vrev64.8    d3, d3
+
+    vst1.8      d19, [r2], r3               @store row 0 (prol)
+    vshr.s64    d3, d3, #8                  @row 0 shift (prol) (first value to be ignored)
+
+    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)
+
+loop_again_col_row:
+
+    vbsl        d20, d3, d16                @row 1  (prol)
+
+    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)
+    vshr.s64    d3, d3, #8                  @row 1 shift (prol)
+
+    vst1.8      d20, [r2], r3               @store row 1 (prol)
+    vqshrun.s16 d4, q13, #2                 @columns shr2 movn (prol extra)
+
+
+    vbsl        d21, d3, d16                @row 2 (prol)
+
+    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)
+    vshr.s64    d3, d3, #8                  @row 2 shift (prol)
+
+    vst1.8      d21, [r2], r3               @store row 2 (prol)
+
+
+    vbsl        d20, d3, d16                @row 3  (prol)
+
+    vmov.i64    d21, #0x00000000000000ff    @byte mask row 4 (prol)
+    vshr.s64    d3, d3, #8                  @row 3 shift (prol)
+
+    vst1.8      d20, [r2], r3               @store row 3 (prol)
+
+
+    vbsl        d21, d3, d16                @row 4 (prol)
+
+    vmov.i64    d20, #0x00000000000000ff    @byte mask row 5 (prol)
+    vshr.s64    d3, d3, #8                  @row 4 shift (prol)
+
+    vst1.8      d21, [r2], r3               @store row 4 (prol)
+
+
+    vbsl        d20, d3, d16                @row 5 (prol)
+
+    vmov.i64    d21, #0x00000000000000ff    @byte mask row 6 (prol)
+    vshr.s64    d3, d3, #8                  @row 5 shift (prol)
+
+    vst1.8      d20, [r2], r3               @store row 5 (prol)
+
+    vld1.s8     d1, [r9]                    @row 8::15 load (prol extra)
+
+    vbsl        d21, d3, d16                @row 6 (prol)
+
+    vmovl.u8    q11, d1
+
+    vmov.i64    d20, #0x00000000000000ff    @byte mask row 7 (prol)
+    vshr.s64    d3, d3, #8                  @row 6 shift (prol)
+
+    vst1.8      d21, [r2], r3               @store row 6 (prol)
+
+    vbsl        d20, d3, d16                @row 7 (prol)
+    vadd.i16    q11, q11, q12               @row 8::15 add 3dc+2 (prol extra)
+
+    vshr.s64    d3, d3, #8                  @row 7 shift (prol)
+    vst1.8      d20, [r2], r12              @store row 7 (prol)
+
+    subs        r10, r10, #8                @counter for cols
+
+    beq         end_func
+    blt         copy_16
+
+
+    vmov.i64    d20, #0x00000000000000ff    @byte mask row 9 (prol)
+    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)
+
+    vrev64.8    d3, d3
+
+    vst1.8      d4, [r2], r3                @store 2nd col (for 16x16)
+
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r0               @go to next row for 16
+
+
+    vbsl        d20, d3, d16                @row 9  (prol)
+    subs        r10, r10, #8
+
+    vst1.8      d20, [r2], r3               @store row 9 (prol)
+    vshr.s64    d3, d3, #8                  @row 9 shift (prol)
+
+    vmov.i64    d20, #0x00000000000000ff    @byte mask row 9 (prol)
+
+    b           loop_again_col_row
+
+
+copy_16:
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2], r3
+    vst1.8      d16, [r2]
+
+    b           end_func
+
+prologue_cpy_32:
+    mov         r9, #128
+    @sub        r7, r3, #-24
+    add         r5, r2, r3
+    add         r8, r5, r3
+    add         r10, r8, r3
+    vdup.8      q10, d16[0]
+    lsl         r6, r3, #2
+    add         r6, r6, #0xfffffff0
+
+    vst1.8      {d20,d21}, [r2]!
+    vst1.8      {d20,d21}, [r5]!
+    vst1.8      {d20,d21}, [r8]!
+    vst1.8      {d20,d21}, [r10]!
+
+    vst1.8      {d20,d21}, [r2], r6
+    vst1.8      {d20,d21}, [r5], r6
+    vst1.8      {d20,d21}, [r8], r6
+    vst1.8      {d20,d21}, [r10], r6
+
+    sub         r9, r9, #32                 @32x32 prol/epil counter dec
+
+kernel_copy:
+    vst1.8      {d20,d21}, [r2]!
+    vst1.8      {d20,d21}, [r5]!
+    vst1.8      {d20,d21}, [r8]!
+    vst1.8      {d20,d21}, [r10]!
+
+    vst1.8      {d20,d21}, [r2], r6
+    vst1.8      {d20,d21}, [r5], r6
+    vst1.8      {d20,d21}, [r8], r6
+    vst1.8      {d20,d21}, [r10], r6
+
+    subs        r9, r9, #32
+
+    vst1.8      {d20,d21}, [r2]!
+    vst1.8      {d20,d21}, [r5]!
+    vst1.8      {d20,d21}, [r8]!
+    vst1.8      {d20,d21}, [r10]!
+
+    vst1.8      {d20,d21}, [r2], r6
+    vst1.8      {d20,d21}, [r5], r6
+    vst1.8      {d20,d21}, [r8], r6
+    vst1.8      {d20,d21}, [r10], r6
+
+    bne         kernel_copy
+
+epilogue_copy:
+    vst1.8      {d20,d21}, [r2]!
+    vst1.8      {d20,d21}, [r5]!
+    vst1.8      {d20,d21}, [r8]!
+    vst1.8      {d20,d21}, [r10]!
+
+    vst1.8      {d20,d21}, [r2]
+    vst1.8      {d20,d21}, [r5]
+    vst1.8      {d20,d21}, [r8]
+    vst1.8      {d20,d21}, [r10]
+
+    b           end_func
+
+
+dc_4:
+    vld1.s8     d0, [r6]!                   @load from src[nt]
+    vld1.s8     d1, [r8]!                   @load from src[2nt+1]
+
+    vpaddl.u8   d2, d0
+    mov         r5, #0                      @
+    vmov        d6, r4, r5                  @store nt to accumulate
+    vpaddl.u8   d3, d1
+
+    vadd.u16    d4, d2, d3
+
+
+    vpaddl.u16  d5, d4
+    vmov.i64    d30, #0x00000000ffffffff
+
+    vand        d5, d5, d30
+
+    vmov        d28, r14, r5                @src[2nt+1]+2+src[2nt-1] moved to d28
+    vadd.i64    d6, d6, d5                  @accumulate all inp into d6 (end for nt==8)
+
+    vshl.s64    d9, d6, d8                  @(dc_val) shr by log2nt+1
+    mov         r8, r7                      @&src[2nt+1]
+
+    vshl.s64    d13, d9, #1                 @2*dc
+    sub         r9, r9, #3                  @&src[2nt-1-row]
+
+    vdup.8      d16, d9[0]                  @dc_val
+    vadd.i64    d14, d13, d28               @src[2nt+1]+2+src[2nt-1]+2dc_val
+
+    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
+    sub         r12, r3, r3, lsl #2         @-3*strd
+    vadd.i64    d11, d13, d9                @3*dc
+
+    vadd.i64    d11, d11, d17               @3*dc + 2
+    add         r12, r12, #4                @offset after one 4x4 block (-3*strd + 4)
+
+    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
+    sub         r0, r3, r4                  @strd - nt
+
+
+    vld1.s8     d0, [r8]                    @col 1::3 load (prol)
+    vld1.s8     d1, [r9]                    @row 3::1 (0 also) load (prol)
+
+    vmovl.u8    q10, d0
+
+    vmovl.u8    q11, d1
+    vadd.i16    q10, q10, q12               @col 1::7 add 3dc+2 (prol)
+
+    vadd.i16    q11, q11, q12               @row 1::7 add 3dc+2 (prol)
+
+    vmov.i64    d19, #0x00000000000000ff    @
+    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)
+
+    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)
+    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)
+
+
+    vbsl        d19, d15, d2                @first row with dst[0]
+
+    vrev64.8    d3, d3
+
+    vst1.32     d19[0], [r2], r3            @store row 0 (prol)
+    vshr.s64    d3, d3, #40                 @row 0 shift (prol) (first value to be ignored)
+
+    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)
+
+    vbsl        d20, d3, d16                @row 1  (prol)
+    vshr.s64    d3, d3, #8                  @row 1 shift (prol)
+
+    vst1.32     d20[0], [r2], r3            @store row 1 (prol)
+
+    vbsl        d21, d3, d16                @row 2 (prol)
+
+    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)
+
+    vshr.s64    d3, d3, #8                  @row 2 shift (prol)
+    vst1.32     d21[0], [r2], r3            @store row 2 (prol)
+
+    vbsl        d20, d3, d16                @row 3  (prol)
+    vst1.32     d20[0], [r2]                @store row 3 (prol)
+
+epilogue_end:
+end_func:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_horz.s b/common/arm/ihevc_intra_pred_luma_horz.s
new file mode 100644
index 0000000..581b673
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_horz.s
@@ -0,0 +1,339 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_luma_horz.s
+@*
+@* @brief
+@*  contains function definition for intra prediction  interpolation filters
+@*
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*  - ihevc_intra_pred_luma_horz()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*     intra prediction interpolation filter for luma horizontal mode.
+@*
+@* @par description:
+@*      horizontal intraprediction (mode 10) with reference samples located
+@*      at 'pu1_ref', writing to the tu block pointed to by 'pu1_dst'; refer
+@*      to section 8.4.4.2.6 in the standard (special case)
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  integer transform block size
+@*
+@* @param[in] mode
+@*  integer intraprediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
+@                                word32 src_strd,
+@                                uword8 *pu1_dst,
+@                                word32 dst_strd,
+@                                word32 nt,
+@                                word32 mode)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 =>  src_strd
+@r2 => *pu1_dst
+@r3 =>  dst_strd
+
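+@**************mode 10 sketch*************************************************
+@ illustrative only (c-style pseudocode in comments, not assembled): each row
+@ is filled with its left reference sample, and for nt < 32 the first row is
+@ gradient-filtered from the top row (section 8.4.4.2.6):
+@
+@   dst[row*dst_strd + col] = ref[two_nt - 1 - row]
+@   row 0 (nt < 32): dst[col] = clip_u8(ref[two_nt - 1]
+@                               + ((ref[two_nt + 1 + col] - ref[two_nt]) >> 1))
+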
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_horz_a9q
+
+.type ihevc_intra_pred_luma_horz_a9q, %function
+
+ihevc_intra_pred_luma_horz_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    @ldr        r5,[sp,#44]                     @loads mode
+
+    lsl         r6,r4,#1                    @two_nt
+
+    add         r12,r0,r6                   @*pu1_ref[two_nt]
+    cmp         r4,#4                       @if nt == 4
+    beq         core_loop_4
+
+    cmp         r4,#8                       @if nt == 8
+    beq         core_loop_8
+
+    cmp         r4,#16                      @if nt == 16
+    beq         core_loop_16
+    sub         r12,r12,#16                 @move to 16th value pointer
+    add         r9,r2,#16
+
+core_loop_32:
+    vld1.8      {q0},[r12]                  @load 16 values. d1[7] will have the 1st value.
+
+    vdup.8      q1,d1[7]                    @duplicate the i value.
+
+    vdup.8      q2,d1[6]                    @duplicate the ii value.
+    vdup.8      q3,d1[5]                    @duplicate the iii value.
+    vst1.8      {q1},[r2],r3                @store in 1st row 0-16 columns
+    vst1.8      {q1},[r9],r3                @store in 1st row 16-32 columns
+
+    vdup.8      q4,d1[4]
+    vst1.8      {q2},[r2],r3
+    vst1.8      {q2},[r9],r3
+
+    vdup.8      q1,d1[3]
+    vst1.8      {q3},[r2],r3
+    vst1.8      {q3},[r9],r3
+
+    vdup.8      q2,d1[2]
+    vst1.8      {q4},[r2],r3
+    vst1.8      {q4},[r9],r3
+
+    vdup.8      q3,d1[1]
+    vst1.8      {q1},[r2],r3
+    vst1.8      {q1},[r9],r3
+
+    vdup.8      q4,d1[0]
+    vst1.8      {q2},[r2],r3
+    vst1.8      {q2},[r9],r3
+
+    vdup.8      q1,d0[7]
+    vst1.8      {q3},[r2],r3
+    vst1.8      {q3},[r9],r3
+
+    vdup.8      q2,d0[6]
+    vst1.8      {q4},[r2],r3
+    vst1.8      {q4},[r9],r3
+
+    vdup.8      q3,d0[5]
+    vst1.8      {q1},[r2],r3
+    vst1.8      {q1},[r9],r3
+
+    vdup.8      q4,d0[4]
+    vst1.8      {q2},[r2],r3
+    vst1.8      {q2},[r9],r3
+
+    vdup.8      q1,d0[3]
+    vst1.8      {q3},[r2],r3
+    vst1.8      {q3},[r9],r3
+
+    vdup.8      q2,d0[2]
+    vst1.8      {q4},[r2],r3
+    vst1.8      {q4},[r9],r3
+
+    vdup.8      q3,d0[1]
+    vst1.8      {q1},[r2],r3
+    vst1.8      {q1},[r9],r3
+    sub         r12,r12,#16                 @move to 16th value pointer
+
+    vdup.8      q4,d0[0]
+    vst1.8      {q2},[r2],r3
+    vst1.8      {q2},[r9],r3
+
+    subs        r4,r4,#16                   @decrement the loop count by 16
+    vst1.8      {q3},[r2],r3
+    vst1.8      {q3},[r9],r3
+
+    vst1.8      {q4},[r2],r3
+    vst1.8      {q4},[r9],r3
+    bgt         core_loop_32
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+    b           end_func
+
+core_loop_16:
+    ldrb        lr,[r12],#1                 @pu1_ref[two_nt]
+    vld1.8      {q15},[r12]                 @pu1_ref[two_nt + 1 + col]
+
+    vdup.8      d28,lr
+    sub         r12,r12,#17
+    vld1.8      {q0},[r12]
+    vdup.8      d26,d1[7]
+    vmovl.u8    q13,d26
+
+    vdup.8      q1,d1[6]
+    vsubl.u8    q12,d30,d28
+
+    vdup.8      q2,d1[5]
+    vshr.s16    q12,q12,#1
+
+    vdup.8      q3,d1[4]
+    vqadd.s16   q11,q13,q12
+
+    vdup.8      q4,d1[3]
+    vqmovun.s16 d22,q11
+
+    vst1.8      {d22},[r2]!
+
+    vdup.8      q5,d1[2]
+    vsubl.u8    q12,d31,d28
+
+    vdup.8      q6,d1[1]
+    vshr.s16    q12,q12,#1
+
+    vdup.8      q7,d1[0]
+    vqadd.s16   q11,q13,q12
+
+    vdup.8      q8,d0[7]
+    vqmovun.s16 d22,q11
+
+    vst1.8      {d22},[r2],r3
+    sub         r2,r2,#8
+
+    vst1.8      {q1},[r2],r3
+
+    vst1.8      {q2},[r2],r3
+    vst1.8      {q3},[r2],r3
+    vst1.8      {q4},[r2],r3
+
+    vdup.8      q1,d0[6]
+    vst1.8      {q5},[r2],r3
+
+    vdup.8      q2,d0[5]
+    vst1.8      {q6},[r2],r3
+
+    vdup.8      q3,d0[4]
+    vst1.8      {q7},[r2],r3
+
+    vdup.8      q4,d0[3]
+    vst1.8      {q8},[r2],r3
+
+    vdup.8      q5,d0[2]
+    vst1.8      {q1},[r2],r3
+
+    vdup.8      q6,d0[1]
+    vst1.8      {q2},[r2],r3
+
+    vdup.8      q7,d0[0]
+    vst1.8      {q3},[r2],r3
+
+    vst1.8      {q4},[r2],r3
+    vst1.8      {q5},[r2],r3
+    vst1.8      {q6},[r2],r3
+    vst1.8      {q7},[r2],r3
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+    b           end_func
+
+
+core_loop_8:
+    ldrb        lr,[r12]                    @pu1_ref[two_nt]
+    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
+    vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col]
+
+    sub         r12,r12,#9
+    vld1.8      {d0},[r12]
+    vdup.8      d26,d0[7]
+    vdup.8      d28,lr
+
+    vdup.8      d3,d0[6]
+    vmovl.u8    q13,d26
+
+    vdup.8      d4,d0[5]
+    vsubl.u8    q12,d30,d28
+
+    vdup.8      d5,d0[4]
+    vshr.s16    q12,q12,#1
+
+    vdup.8      d6,d0[3]
+    vqadd.s16   q11,q13,q12
+
+    vdup.8      d7,d0[2]
+    vqmovun.s16 d22,q11
+
+    vst1.8      {d22},[r2],r3
+    vst1.8      {d3},[r2],r3
+
+    vdup.8      d8,d0[1]
+    vst1.8      {d4},[r2],r3
+    vst1.8      {d5},[r2],r3
+
+    vdup.8      d9,d0[0]
+    vst1.8      {d6},[r2],r3
+    vst1.8      {d7},[r2],r3
+
+    vst1.8      {d8},[r2],r3
+    vst1.8      {d9},[r2],r3
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+    b           end_func
+
+
+core_loop_4:
+    ldrb        lr,[r12]                    @pu1_ref[two_nt]
+    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
+    vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col]
+
+    sub         r12,r12,#5
+    vld1.8      {d0},[r12]
+    vdup.8      d28,lr
+    vdup.8      d26,d0[3]
+    vmovl.u8    q13,d26
+
+    vdup.8      d3,d0[2]
+    vsubl.u8    q12,d30,d28
+
+    vdup.8      d4,d0[1]
+    vshr.s16    q12,q12,#1
+
+    vdup.8      d5,d0[0]
+    vqadd.s16   q11,q13,q12
+
+    vqmovun.s16 d22,q11
+
+    vst1.32     {d22[0]},[r2],r3
+    vst1.32     {d3[0]},[r2],r3
+    vst1.32     {d4[0]},[r2],r3
+    vst1.32     {d5[0]},[r2],r3
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+end_func:
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_mode2.s b/common/arm/ihevc_intra_pred_luma_mode2.s
new file mode 100644
index 0000000..cf7999b
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_mode2.s
@@ -0,0 +1,270 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_luma_mode2.s
+@*
+@* @brief
+@*  contains function definitions for intra prediction mode 2.
+@* functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    luma intraprediction for mode 2
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_mode2(uword8 *pu1_ref,
+@                                 word32 src_strd,
+@                                 uword8 *pu1_dst,
+@                                 word32 dst_strd,
+@                                 word32 nt,
+@                                 word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+@   pi1_coeff
+
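+@**************mode 2 sketch**************************************************
+@ illustrative only (c-style pseudocode in comments, not assembled): mode 2 is
+@ the pure 45-degree down-left diagonal, so every output is a direct copy:
+@
+@   dst[row*dst_strd + col] = ref[two_nt - 2 - row - col]
+@
+@ the loads below walk the reference backwards in 8-byte groups and vrev64.8
+@ reverses each group so whole rows can be stored with vst1.8
+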
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode2_a9q
+
+.type ihevc_intra_pred_luma_mode2_a9q, %function
+
+ihevc_intra_pred_luma_mode2_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    mov         r8,#-2
+
+    cmp         r4,#4
+    beq         mode2_4
+
+    add         r0,r0,r4,lsl #1
+
+    sub         r0,r0,#9                    @src[1]
+    add         r10,r0,#-1
+
+prologue_cpy_32:
+
+    vld1.8      {d0},[r0],r8
+    mov         r11,r4
+
+    vld1.8      {d1},[r10],r8
+    mov         r6, r2
+
+    vld1.8      {d2},[r0],r8
+    vld1.8      {d3},[r10],r8
+    lsr         r1, r4, #3
+
+    vld1.8      {d4},[r0],r8
+    vld1.8      {d5},[r10],r8
+    vld1.8      {d6},[r0],r8
+    mul         r1, r4, r1
+
+    vld1.8      {d7},[r10],r8
+    add         r7,r6,r3
+
+    vrev64.8    d8,d0
+    vrev64.8    d9,d1
+    lsl         r5, r3, #2
+
+    vrev64.8    d10,d2
+    vrev64.8    d11,d3
+    add         r9,r7,r3
+
+    vrev64.8    d12,d4
+    subs        r1,r1,#8
+
+    vrev64.8    d13,d5
+    vrev64.8    d14,d6
+    vrev64.8    d15,d7
+    add         r14,r9,r3
+
+    beq         epilogue_mode2
+
+    sub         r12,r4,#8
+
+kernel_mode2:
+
+    vst1.8      {d8},[r6],r5
+    vst1.8      {d9},[r7],r5
+    subs        r11,r11,#8
+
+    vst1.8      {d10},[r9],r5
+    addgt       r2,r2,#8
+
+    vst1.8      {d11},[r14],r5
+    vst1.8      {d12},[r6],r5
+    movle       r11,r4
+
+    vst1.8      {d13},[r7],r5
+    vst1.8      {d14},[r9],r5
+    addle       r2, r2, r3, lsl #2
+
+    vst1.8      {d15},[r14],r5
+    vld1.8      {d0},[r0],r8
+    sub         r14,r4,#8
+
+    vld1.8      {d1},[r10],r8
+    vld1.8      {d2},[r0],r8
+    addle       r2, r2, #8
+
+    vld1.8      {d3},[r10],r8
+    vld1.8      {d4},[r0],r8
+    suble       r2, r6, r14
+
+    vld1.8      {d5},[r10],r8
+    subs        r12,r12,#8
+
+    vld1.8      {d6},[r0],r8
+    mov         r6, r2
+
+    vld1.8      {d7},[r10],r8
+    addle       r0, r0, r4
+
+    vrev64.8    d8,d0
+    add         r7, r6, r3
+
+    vrev64.8    d9,d1
+    suble       r0, r0, #8
+
+    vrev64.8    d10,d2
+    movle       r12,r4
+
+    vrev64.8    d11,d3
+    add         r9, r7, r3
+
+    vrev64.8    d12,d4
+    add         r10,r0,#-1
+
+    vrev64.8    d13,d5
+    subs        r1, r1, #8
+
+    vrev64.8    d14,d6
+    add         r14, r9, r3
+
+    vrev64.8    d15,d7
+
+    bne         kernel_mode2
+
+epilogue_mode2:
+
+    vst1.8      {d8},[r6],r5
+    vst1.8      {d9},[r7],r5
+    vst1.8      {d10},[r9],r5
+    vst1.8      {d11},[r14],r5
+    vst1.8      {d12},[r6],r5
+    vst1.8      {d13},[r7],r5
+    vst1.8      {d14},[r9],r5
+    vst1.8      {d15},[r14],r5
+
+    b           end_func
+
+mode2_4:
+
+    mov         r8,#-2
+    sub         r0,r0,#1
+    add         r10,r0,#-1
+
+    vld1.8      {d0},[r0],r8
+    add         r5,r2,r3
+    vld1.8      {d2},[r10],r8
+    add         r6,r5,r3
+    vld1.8      {d4},[r0]
+    add         r7,r6,r3
+    vld1.8      {d6},[r10]
+
+    vrev64.8    d1,d0
+    vrev64.8    d3,d2
+
+
+
+    vst1.32     {d1[0]},[r2]
+    vrev64.8    d5,d4
+    vst1.32     {d3[0]},[r5]
+    vrev64.8    d7,d6
+    vst1.32     {d5[0]},[r6]
+    vst1.32     {d7[0]},[r7]
+
+end_func:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_mode_18_34.s b/common/arm/ihevc_intra_pred_luma_mode_18_34.s
new file mode 100644
index 0000000..438c0f5
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_mode_18_34.s
@@ -0,0 +1,273 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_luma_mode_18_34.s
+@*
+@* @brief
+@*  contains function definitions for intra prediction for modes 18 and 34.
+@* functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    luma intraprediction for modes 18 and 34 (45 degree diagonal copy)
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  intra prediction mode (18 or 34)
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_mode_18_34(uword8 *pu1_ref,
+@                                      word32 src_strd,
+@                                      uword8 *pu1_dst,
+@                                      word32 dst_strd,
+@                                      word32 nt,
+@                                      word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+
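+@ modes 18 and 34 are exact 45 degree diagonals, so every predicted row is a
+@ plain copy of reference samples shifted by one per row and no fractional
+@ interpolation is needed. a rough c sketch of the behaviour implemented
+@ below (a hedged illustration, not the decoder's reference code):
+@
+@     UWORD8 *ref = pu1_ref + 2 * nt;      /* top-left reference sample */
+@     for(row = 0; row < nt; row++)
+@         memcpy(pu1_dst + row * dst_strd,
+@                (mode == 34) ? ref + 2 + row   /* walk top refs forward   */
+@                             : ref - row,      /* walk left refs backward */
+@                nt);
+@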
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode_18_34_a9q
+
+.type ihevc_intra_pred_luma_mode_18_34_a9q, %function
+
+ihevc_intra_pred_luma_mode_18_34_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+
+    ldr         r4,[sp,#40]
+    ldr         r5,[sp,#44]
+
+    cmp         r4,#4
+    beq         mode2_4
+
+    mov         r11,r4
+    mov         r12,r4
+    sub         r14,r4,#8
+
+    add         r0,r0,r4,lsl #1
+
+    cmp         r5,#0x22
+    mov         r10,r2
+
+    add         r0,r0,#2
+    subne       r0,r0,#2
+    moveq       r6,#1
+    movne       r6,#-1
+    mov         r8,r0
+
+prologue_cpy_32:
+
+    vld1.8      {d0},[r8],r6
+    lsr         r1, r4, #3
+    vld1.8      {d1},[r8],r6
+    mul         r1, r4, r1
+    vld1.8      {d2},[r8],r6
+    vld1.8      {d3},[r8],r6
+    subs        r1,r1,#8
+    vld1.8      {d4},[r8],r6
+    vld1.8      {d5},[r8],r6
+    vld1.8      {d6},[r8],r6
+
+    vld1.8      {d7},[r8],r6
+
+
+    beq         epilogue_mode2
+    sub         r11,r11,#8
+
+    cmp         r5,#0x22
+    addne       r0,r0,#8
+    movne       r8,r0
+    bne         kernel_mode18
+    @add        r8,r0,#8
+
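+@ the fall-through kernel below serves mode 34 (r6 = +1, reference walked
+@ forward); mode 18 branches to kernel_mode18, which is identical except for
+@ how the tile pointers are rewound between 8x8 strips
+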
+kernel_mode2:
+    vst1.8      {d0},[r10],r3
+    vst1.8      {d1},[r10],r3
+    subs        r12,r12,#8
+    vst1.8      {d2},[r10],r3
+    addne       r2,r2,#8
+    vst1.8      {d3},[r10],r3
+
+    vld1.8      {d0},[r8],r6
+    vst1.8      {d4},[r10],r3
+
+    vst1.8      {d5},[r10],r3
+    vld1.8      {d1},[r8],r6
+    vst1.8      {d6},[r10],r3
+    vld1.8      {d2},[r8],r6
+    vst1.8      {d7},[r10],r3
+
+    vld1.8      {d3},[r8],r6
+    subeq       r2,r10,r14
+    vld1.8      {d4},[r8],r6
+    mov         r10,r2
+    vld1.8      {d5},[r8],r6
+    moveq       r12,r4
+    vld1.8      {d6},[r8],r6
+    subs        r11,r11,#8
+
+    vld1.8      {d7},[r8],r6
+
+    addeq       r0,r0,#8
+    moveq       r11,r4
+    moveq       r8,r0
+
+    subs        r1, r1, #8
+
+    bne         kernel_mode2
+
+    b           epilogue_mode2
+
+kernel_mode18:
+    vst1.8      {d0},[r10],r3
+    vst1.8      {d1},[r10],r3
+    subs        r12,r12,#8
+    vst1.8      {d2},[r10],r3
+    addne       r2,r2,#8
+    vst1.8      {d3},[r10],r3
+
+    vld1.8      {d0},[r8],r6
+    vst1.8      {d4},[r10],r3
+
+    vst1.8      {d5},[r10],r3
+    vld1.8      {d1},[r8],r6
+
+    vst1.8      {d6},[r10],r3
+    vld1.8      {d2},[r8],r6
+    vst1.8      {d7},[r10],r3
+
+    vld1.8      {d3},[r8],r6
+    subeq       r2,r10,r14
+    vld1.8      {d4},[r8],r6
+    mov         r10,r2
+    vld1.8      {d5},[r8],r6
+    moveq       r12,r4
+    vld1.8      {d6},[r8],r6
+    subs        r11,r11,#8
+    vld1.8      {d7},[r8],r6
+
+    addne       r0,r0,#8
+    moveq       r11,r4
+    subeq       r0,r8,r14
+    subs        r1, r1, #8
+    mov         r8,r0
+
+    bne         kernel_mode18
+
+
+epilogue_mode2:
+
+    vst1.8      {d0},[r10],r3
+    vst1.8      {d1},[r10],r3
+    vst1.8      {d2},[r10],r3
+    vst1.8      {d3},[r10],r3
+    vst1.8      {d4},[r10],r3
+    vst1.8      {d5},[r10],r3
+    vst1.8      {d6},[r10],r3
+    vst1.8      {d7},[r10],r3
+
+    b           end_func
+
+mode2_4:
+
+    add         r0,r0,#10
+    cmp         r5,#0x22
+    subne       r0,r0,#2
+
+    moveq       r8,#1
+    movne       r8,#-1
+
+    vld1.8      {d0},[r0],r8
+    vst1.32     {d0[0]},[r2],r3
+
+    vld1.8      {d0},[r0],r8
+    vst1.32     {d0[0]},[r2],r3
+
+    vld1.8      {d0},[r0],r8
+    vst1.32     {d0[0]},[r2],r3
+
+    vld1.8      {d0},[r0],r8
+    vst1.32     {d0[0]},[r2],r3
+
+end_func:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
new file mode 100644
index 0000000..595d82a
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
@@ -0,0 +1,540 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_luma_mode_27_to_33.s
+@*
+@* @brief
+@*  contains function definition for intra prediction  interpolation filters
+@*
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*  - ihevc_intra_pred_luma_mode_27_to_33()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    intra prediction interpolation filter for luma mode 27 to mode 33
+@*
+@* @par description:
+@*    intraprediction for modes 27 to 33 (positive angle, vertical modes) with
+@*    reference neighboring samples location pointed by 'pu1_ref' to the tu
+@*    block location pointed by 'pu1_dst'
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  integer transform block size
+@*
+@* @param[in] mode
+@*  integer intraprediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_mode_27_to_33(uword8 *pu1_ref,
+@                                       word32 src_strd,
+@                                       uword8 *pu1_dst,
+@                                       word32 dst_strd,
+@                                       word32 nt,
+@                                       word32 mode)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 =>  src_strd
+@r2 => *pu1_dst
+@r3 =>  dst_strd
+
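+@ the angle is applied in 5.5 fixed point: pos = (row + 1) * intra_pred_ang,
+@ idx = pos >> 5 picks the reference sample pair and fract = pos & 31 blends
+@ them. a rough c sketch of the per-sample filter coded below (hedged
+@ illustration; names follow the comments, not an exact api):
+@
+@     UWORD8 *ref_main = pu1_ref + 2 * nt;
+@     for(row = 0; row < nt; row++) {
+@         WORD32 pos   = (row + 1) * intra_pred_ang;
+@         WORD32 idx   = pos >> 5;
+@         WORD32 fract = pos & 31;
+@         for(col = 0; col < nt; col++)
+@             pu1_dst[row * dst_strd + col] = (UWORD8)
+@                 (((32 - fract) * ref_main[col + idx + 1]
+@                   + fract      * ref_main[col + idx + 2] + 16) >> 5);
+@     }
+@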
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode_27_to_33_a9q
+.extern gai4_ihevc_ang_table
+.extern gau1_ihevc_planar_factor
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor - ulbl2 - 8
+
+
+.type ihevc_intra_pred_luma_mode_27_to_33_a9q, %function
+
+ihevc_intra_pred_luma_mode_27_to_33_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    ldr         r5,[sp,#44]                 @loads mode
+    ldr         r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
+ulbl1:
+    add         r6,r6,pc
+
+    lsl         r7,r4,#1                    @two_nt
+
+    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
+    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
+    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
+ulbl2:
+    add         r1,r1,pc
+    add         r6,r1,#1
+
+    tst         r4,#7
+    add         r8,r0,r7                    @pu1_ref + two_nt
+    mov         lr,#0                       @row
+    mov         r12,r4
+    bne         core_loop_4
+
+core_loop_8:
+    add         r8,r8,#1                    @pu1_ref_main_idx += (two_nt + 1)
+    vdup.8      d0,r9                       @intra_pred_ang
+    mov         r12,r4,lsr #3               @divide by 8
+
+    vmov.i8     d1,#32
+    mul         r7,r4,r12
+
+    vmov.i16    q3,#31
+    @lsl            r12,r3,#3
+
+    mov         r1,r8
+    @sub            r12,r12,r4
+    mov         r5,r4
+    mov         r11,#1
+
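+@ the prologue/kernel pair below software-pipelines eight rows: the packed
+@ idx bytes in d5 are moved to lr and peeled off 8 bits at a time to form
+@ per-row reference addresses while the neon side interpolates earlier rows
+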
+prologue:
+    vld1.8      {d3},[r6]                   @loads the row value
+    vmull.u8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+    vmovn.i16   d4,q2
+    vshrn.u16   d5,q1,#5                    @idx = pos >> 5
+
+    vdup.8      d31,d4[0]
+    add         r0,r2,r3
+
+    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register
+
+    vdup.8      d29,d4[1]                   @(ii)
+    and         r9,lr,#0xff                 @(i row) get the last byte
+
+    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]
+
+    asr         lr,lr,#8                    @(ii)shift by 8
+    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
+    and         r9,lr,#0xff                 @(ii)get the last byte
+
+    asr         lr,lr,#8                    @(iii)
+    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
+    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]
+
+    and         r9,lr,#0xff                 @(iii)
+    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+    asr         lr,lr,#8                    @(iv)
+
+    vdup.8      d27,d4[2]                   @(iii)
+    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
+    and         r9,lr,#0xff                 @(iv)
+
+    vdup.8      d25,d4[3]                   @(iv)
+    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
+    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
+    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
+    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
+
+    vdup.8      d31,d4[4]                   @(v)
+    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vmov.u32    lr,d5[1]                    @extract idx to the r register
+    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d10},[r2]!                 @(i row)
+    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    and         r9,lr,#0xff                 @(v)
+    vdup.8      d29,d4[5]                   @(vi)
+    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
+    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
+
+    asr         lr,lr,#8                    @(vi)
+    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    and         r9,lr,#0xff                 @(vi)
+
+    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
+    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d14},[r0],r3               @(ii)
+    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
+    vdup.8      d27,d4[6]                   @(vii)
+    asr         lr,lr,#8                    @(vii)
+
+    and         r9,lr,#0xff                 @(vii)
+    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d18},[r0],r3               @(iii)
+    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    asr         lr,lr,#8                    @(viii)
+    vdup.8      d25,d4[7]                   @(viii)
+    and         r9,lr,#0xff                 @(viii)
+
+    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
+    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)
+
+    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
+    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
+    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subs        r4,r4,#8
+
+    vst1.8      {d22},[r0],r3               @(iv)
+    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
+    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
+    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    addgt       r8,r8,#8
+    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subgt       r7,r7,#8
+
+    vst1.8      {d10},[r0],r3               @(v)
+    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+    beq         epilogue
+
+    vld1.8      {d5},[r6]                   @loads the row value
+    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+    vmovn.i16   d4,q2
+    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
+    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
+    and         r9,lr,#0xff                 @(i)
+    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+    asr         lr,lr,#8                    @(ii)
+    vdup.8      d31,d4[0]
+    subs        r4,r4,#8
+
+    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
+    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
+    and         r9,lr,#0xff                 @(ii)
+    addle       r6,r6,#8                    @increment the row value
+
+    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
+    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d5},[r6]                   @loads the row value
+    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    asr         lr,lr,#8                    @(iii)
+
+    vdup.8      d29,d4[1]                   @(ii)
+    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+    and         r9,lr,#0xff                 @(iii)
+
+    vst1.8      {d14},[r0],r3               @(vi)
+    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)
+    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
+    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+    asr         lr,lr,#8                    @(iv)
+
+    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
+    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+    and         r9,lr,#0xff                 @(iv)
+
+    vmov.u32    lr,d3[1]                    @extract idx to the r register
+    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vdup.8      d27,d4[2]                   @(iii)
+    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
+    movle       r4,r5                       @reload nt
+
+    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
+    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]
+
+    vst1.8      {d18},[r0],r3               @(vii)
+    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
+    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vdup.8      d25,d4[3]                   @(iv)
+    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
+
+    vst1.8      {d22},[r0]                  @(viii)
+    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)
+
+    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
+    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         r0,r2,r3
+
+    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
+    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    and         r9,lr,#0xff                 @(v)
+
+    vdup.8      d31,d4[4]                   @(v)
+    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
+
+    vst1.8      {d10},[r2]!                 @(i)
+    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
+    asr         lr,lr,#8                    @(vi)
+
+    vdup.8      d29,d4[5]                   @(vi)
+    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    and         r9,lr,#0xff                 @(vi)
+
+    vdup.8      d27,d4[6]                   @(vii)
+    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
+
+    vdup.8      d25,d4[7]                   @(viii)
+    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+    asr         lr,lr,#8                    @(vii)
+
+    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
+    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
+    and         r9,lr,#0xff                 @(vii)
+
+    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
+    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
+    asr         lr,lr,#8                    @(viii)
+
+    vst1.8      {d14},[r0],r3               @(ii)
+    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
+    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
+    and         r9,lr,#0xff                 @(viii)
+
+    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
+    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
+    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
+
+    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
+    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)
+
+    vst1.8      {d18},[r0],r3               @(iii)
+    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+    movle       r8,r1                       @reload the source to pu1_src+2nt
+
+    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
+    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    addgt       r8,r8,#8                    @advance the source to the next set of 8 columns in the same row
+
+    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
+    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
+    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
+    lslle       r12,r3,#3
+
+    vst1.8      {d22},[r0],r3               @(iv)
+    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    suble       r12,r12,r5
+
+    vst1.8      {d10},[r0],r3               @(v)
+    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    addle       r2,r2,r12                   @increment the dst pointer to 8*dst_strd - nt
+
+    vmovn.i16   d4,q2
+    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+    and         r9,lr,#0xff                 @(i)
+
+    subs        r7,r7,#8
+    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]
+
+    bne         kernel_8_rows
+
+epilogue:
+    vst1.8      {d14},[r0],r3               @(vi)
+    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
+    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.8      {d18},[r0],r3               @(vii)
+    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    vst1.8      {d22},[r0],r3               @(viii)
+    b           end_loops
+
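+@ 4x4 path: fract is recomputed per row with scalar code, and the reference
+@ pointers advance by one whenever fract wraps below its previous value
+@ (fract_prev > fract implies idx grew by one)
+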
+core_loop_4:
+    add         r10,r8,#1                   @pu1_ref_main_idx += (two_nt + 1)
+    add         r11,r8,#2                   @pu1_ref_main_idx_1 += (two_nt + 2)
+    mov         r8,#0
+
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    and         r5,r5,#31                   @fract = pos & (31)
+    cmp         lr,r5                       @if(fract_prev > fract)
+    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
+    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1
+    vdup.8      d0,r5                       @dup_const_fract
+    rsb         r4,r5,#32
+    vdup.8      d1,r4                       @dup_const_32_fract
+
+@inner_loop_4
+    vld1.32     {d2[0]},[r10]               @ref_main_idx
+    add         r8,r8,#1
+    mov         lr,r5                       @fract_prev = fract
+
+    vld1.32     {d3[0]},[r11]               @ref_main_idx_1
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    and         r5,r5,#31                   @fract = pos & (31)
+    cmp         lr,r5                       @if(fract_prev > fract)
+    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
+    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1
+
+    vdup.8      d6,r5                       @dup_const_fract
+    vmull.u8    q2,d2,d1                    @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    rsb         r4,r5,#32
+    vdup.8      d7,r4                       @dup_const_32_fract
+    vmlal.u8    q2,d3,d0                    @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.32     {d8[0]},[r10]               @ref_main_idx
+    add         r8,r8,#1
+
+    vld1.32     {d9[0]},[r11]               @ref_main_idx_1
+    vrshrn.i16  d4,q2,#5                    @shift_res = vrshrn_n_u16(add_res, 5)
+
+    mov         lr,r5                       @fract_prev = fract
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    and         r5,r5,#31                   @fract = pos & (31)
+    cmp         lr,r5                       @if(fract_prev > fract)
+    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
+    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1
+
+    vdup.8      d12,r5                      @dup_const_fract
+    vmull.u8    q5,d8,d7                    @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    rsb         r4,r5,#32
+    vdup.8      d13,r4                      @dup_const_32_fract
+    vmlal.u8    q5,d9,d6                    @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.32     {d14[0]},[r10]              @ref_main_idx
+    add         r8,r8,#1
+
+    vst1.32     {d4[0]},[r2],r3
+    vrshrn.i16  d10,q5,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
+
+    vld1.32     {d15[0]},[r11]              @ref_main_idx_1
+    mov         lr,r5                       @fract_prev = fract
+    add         r5,r8,#1                    @row + 1
+    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
+    and         r5,r5,#31                   @fract = pos & (31)
+    cmp         lr,r5                       @if(fract_prev > fract)
+    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
+    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1
+
+    vdup.8      d18,r5                      @dup_const_fract
+    vmull.u8    q8,d14,d13                  @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    rsb         r4,r5,#32
+    vdup.8      d19,r4                      @dup_const_32_fract
+    vmlal.u8    q8,d15,d12                  @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vld1.32     {d20[0]},[r10]              @ref_main_idx
+
+    vst1.32     {d10[0]},[r2],r3
+    vrshrn.i16  d16,q8,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
+    vld1.32     {d21[0]},[r11]              @ref_main_idx_1
+
+    vmull.u8    q11,d20,d19                 @vmull_u8(ref_main_idx, dup_const_32_fract)
+    vmlal.u8    q11,d21,d18                 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    vst1.32     {d16[0]},[r2],r3
+    vrshrn.i16  d22,q11,#5                  @shift_res = vrshrn_n_u16(add_res, 5)
+
+    vst1.32     {d22[0]},[r2],r3
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s b/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
new file mode 100644
index 0000000..a8e93c8
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
@@ -0,0 +1,573 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_luma_mode_3_to_9.s
+@*
+@* @brief
+@*  contains function definitions for intra prediction for modes 3 to 9.
+@* functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    luma intraprediction filter for modes 3 to 9
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  intra prediction mode (3 to 9)
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_mode_3_to_9(uword8* pu1_ref,
+@                               word32 src_strd,
+@                               uword8* pu1_dst,
+@                               word32 dst_strd,
+@                               word32 nt,
+@                               word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+
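+@ modes 3 to 9 are the horizontal counterparts of 27 to 33: the same two-tap
+@ (32 - fract)/fract blend with +16 rounding and >> 5, but pos advances per
+@ column and the reference walk runs down the left samples at decreasing
+@ addresses, which is why the kernels step their vtbl indices by repeated
+@ subtraction. a rough c sketch (hedged illustration, not reference code):
+@
+@     for(col = 0; col < nt; col++) {
+@         WORD32 pos   = (col + 1) * intra_pred_ang;
+@         WORD32 idx   = pos >> 5;
+@         WORD32 fract = pos & 31;
+@         for(row = 0; row < nt; row++)
+@             pu1_dst[row * dst_strd + col] = (UWORD8)
+@                 (((32 - fract) * pu1_ref[2 * nt - (row + idx + 1)]
+@                   + fract      * pu1_ref[2 * nt - (row + idx + 2)] + 16) >> 5);
+@     }
+@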
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode_3_to_9_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_luma
+.extern idx_neg_idx_3_9
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl2 - 8
+
+idx_neg_idx_3_9_addr_1:
+.long idx_neg_idx_3_9 - ulbl3_1 - 8
+
+idx_neg_idx_3_9_addr_2:
+.long idx_neg_idx_3_9 - ulbl3_2 - 8
+
+col_for_intra_luma_addr_1:
+.long col_for_intra_luma - ulbl4_1 - 8
+
+col_for_intra_luma_addr_2:
+.long col_for_intra_luma - ulbl4_2 - 8
+
+col_for_intra_luma_addr_3:
+.long col_for_intra_luma - ulbl4_3 - 8
+
+.type ihevc_intra_pred_luma_mode_3_to_9_a9q, %function
+
+ihevc_intra_pred_luma_mode_3_to_9_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    ldr         r7, gai4_ihevc_ang_table_addr
+ulbl1:
+    add         r7,r7,pc
+
+    ldr         r5,[sp,#44]                 @mode (3 to 9)
+    ldr         r8, gai4_ihevc_inv_ang_table_addr
+ulbl2:
+    add         r8,r8,pc
+
+    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
+    ldr         r7, [r7]                    @intra_pred_ang
+    vdup.8      d30, r7                     @intra_pred_ang
+
+    ldr         r14, col_for_intra_luma_addr_1
+ulbl4_1:
+    add         r14,r14,pc
+    cmp         r4, #4
+
+    beq         sz_4_proc
+    b           prologue_8_16_32
+
+prologue_8_16_32:
+    lsr         r10, r4, #3
+    vld1.8      d31, [r14]!
+    mul         r10, r4, r10                @block counter (dec by #8)
+
+    mov         r11, r4                     @col counter to be inc/dec by #8
+    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
+
+    sub         r7, r5, #3
+    vmov.i8     d2,#1                       @contains #1 for adding to get ref_main_idx + 1
+    ldr         r12, idx_neg_idx_3_9_addr_1 @load least idx table
+ulbl3_1:
+    add         r12,r12,pc
+
+    vmov.i8     d3, #2
+
+    add         r12, r12, r7, lsl #4
+    mov         r8, r12
+
+    mov         r7, #8
+    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3
+
+    ldr         r9, [r8]
+    add         r1, r0, r4, lsl #1          @pu1_ref + 2nt
+
+    vmovn.s16   d6, q11
+    vdup.8      d26, r9                     @least idx added to final idx values
+    sub         r1, r1, #9                  @pu1_ref + 2nt - (8 + 1): base of the first 8-row strip
+
+    sub         r6, r1, r9
+
+    vld1.8      {d0,d1}, [r6]               @loads the 16 ref bytes used by the vtbl lookups (from least idx)
+    vshr.s16    q11, q11, #5
+
+    vmov.i8     d29, #31                    @contains #31 for vand operation
+
+    vmov.i8     d28, #32
+
+    vqmovn.s16  d8, q11
+
+    vand        d6, d6, d29                 @fract values in d1/ idx values in d0
+
+    mov         r0, #1
+
+    vmov.i8     d27, #7                     @row 0 to 7
+
+    vsub.s8     d8, d8, d2                  @ref_main_idx (sub row)
+    vsub.s8     d8, d26, d8                 @ref_main_idx (row 0)
+    vadd.s8     d8, d8, d27                 @to compensate the pu1_src idx incremented by 8
+    vsub.s8     d9, d8, d2                  @ref_main_idx + 1 (row 0)
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
+    vsub.s8     d7, d28, d6                 @32-fract
+
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
+    vsub.s8     d4, d8, d2                  @ref_main_idx (row 1)
+    vsub.s8     d5, d9, d2                  @ref_main_idx + 1 (row 1)
+
+    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
+    vmull.u8    q12, d12, d7                @mul (row 0)
+    vmlal.u8    q12, d13, d6                @mul (row 0)
+
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
+    vsub.s8     d8, d8, d3                  @ref_main_idx (row 2)
+    vsub.s8     d9, d9, d3                  @ref_main_idx + 1 (row 2)
+
+    vrshrn.i16  d24, q12, #5                @round shft (row 0)
+
+    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
+    vmull.u8    q11, d16, d7                @mul (row 1)
+    vmlal.u8    q11, d17, d6                @mul (row 1)
+
+    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
+    vsub.s8     d4, d4, d3                  @ref_main_idx (row 3)
+    vsub.s8     d5, d5, d3                  @ref_main_idx + 1 (row 3)
+
+    vst1.8      d24, [r2], r3               @st (row 0)
+    vrshrn.i16  d22, q11, #5                @round shft (row 1)
+
+    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
+    vmull.u8    q10, d14, d7                @mul (row 2)
+    vmlal.u8    q10, d15, d6                @mul (row 2)
+
+    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
+    vsub.s8     d8, d8, d3                  @ref_main_idx (row 4)
+    vsub.s8     d9, d9, d3                  @ref_main_idx + 1 (row 4)
+
+    vst1.8      d22, [r2], r3               @st (row 1)
+    vrshrn.i16  d20, q10, #5                @round shft (row 2)
+
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
+    vmull.u8    q9, d10, d7                 @mul (row 3)
+    vmlal.u8    q9, d11, d6                 @mul (row 3)
+
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
+    vsub.s8     d4, d4, d3                  @ref_main_idx (row 5)
+    vsub.s8     d5, d5, d3                  @ref_main_idx + 1 (row 5)
+
+    vst1.8      d20, [r2], r3               @st (row 2)
+    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
+
+    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 5)
+    vmull.u8    q12, d12, d7                @mul (row 4)
+    vmlal.u8    q12, d13, d6                @mul (row 4)
+
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
+    vsub.s8     d8, d8, d3                  @ref_main_idx (row 6)
+    vsub.s8     d9, d9, d3                  @ref_main_idx + 1 (row 6)
+
+    vst1.8      d18, [r2], r3               @st (row 3)
+    vrshrn.i16  d24, q12, #5                @round shft (row 4)
+
+    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
+    vmull.u8    q11, d16, d7                @mul (row 5)
+    vmlal.u8    q11, d17, d6                @mul (row 5)
+
+    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
+    vsub.s8     d4, d4, d3                  @ref_main_idx (row 7)
+    vsub.s8     d5, d5, d3                  @ref_main_idx + 1 (row 7)
+
+    vst1.8      d24, [r2], r3               @st (row 4)
+    vrshrn.i16  d22, q11, #5                @round shft (row 5)
+
+    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vst1.8      d22, [r2], r3               @st (row 5)
+    vrshrn.i16  d20, q10, #5                @round shft (row 6)
+    vrshrn.i16  d18, q9, #5                 @round shft (row 7)
+
+    vst1.8      d20, [r2], r3               @st (row 6)
+
+    subs        r10, r10, #8                @subtract 8 and go to end if 8x8
+
+    vst1.8      d18, [r2], r3               @st (row 7)
+
+    beq         end_func
+
+    subs        r11, r11, #8
+    addgt       r8, r8, #4
+    addgt       r2, r2, r7
+    movle       r8, r12
+    suble       r2, r2, r4
+    addle       r2, r2, #8
+    movle       r11, r4
+    ldrle       r14, col_for_intra_luma_addr_2
+ulbl4_2:
+    addle       r14,r14,pc
+    addle       r0, r0, #8
+
+    mov         r5,r2
+    vld1.8      d31, [r14]!
+    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
+    vmovn.s16   d10, q6
+    vshr.s16    q6, q6, #5
+    vqmovn.s16  d11, q6
+    ldr         r9, [r8]
+    add         r9, r0, r9
+    sub         r9, r9, #1
+    vdup.8      d26, r9
+    vmov.i8     d16,#8
+
+    sub         r4,r4,#8
+
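+@ the 16 reference bytes needed by the current strip sit in d0/d1 and every
+@ row's sample pair is fetched with vtbl lookups; the index vectors are
+@ stepped by repeated vsub as the rows advance
+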
+kernel_8_16_32:
+
+    vsub.s8     d8, d26, d11                @ref_main_idx
+    vmov        d26,d10
+
+    subs        r11, r11, #8
+    sub         r6, r1, r9
+    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
+    vadd.s8     d8, d8, d16                 @to compensate the pu1_src idx incremented by 8
+
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx - 1 (row 7)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    vsub.s8     d9, d8, d2                  @ref_main_idx - 1
+    addle       r0, r0, #8
+    addgt       r8, r8, #4
+    vld1.8      {d0,d1}, [r6]               @loads the 16 ref bytes used by the vtbl lookups (from least idx)
+
+    vst1.8      d24, [r5], r3               @st (row 4)
+    vrshrn.i16  d22, q11, #5                @round shft (row 5)
+
+    ldrle       r14, col_for_intra_luma_addr_3
+ulbl4_3:
+    addle       r14,r14,pc
+
+    movle       r8, r12
+    vdup.8      d27, r0                     @row value inc or reset accordingly
+
+    vsub.s8     d4, d8, d2                  @ref_main_idx (row 1)
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
+    vsub.s8     d5, d9, d2                  @ref_main_idx - 1 (row 1)
+
+
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vld1.8      d31, [r14]!
+    vand        d6, d29, d26                @fract values in d1/ idx values in d0
+
+    vst1.8      d22, [r5], r3               @(from previous loop)st (row 5)
+    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)
+
+    vsub.s8     d8, d8, d3                  @ref_main_idx (row 2)
+    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 1)
+    vsub.s8     d9, d9, d3                  @ref_main_idx - 1 (row 2)
+
+    addle       r11, r4, #8
+    ldr         r9, [r8]
+    vsub.s8     d7, d28, d6                 @32-fract
+
+    vmull.u8    q12, d12, d7                @mul (row 0)
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
+    vmlal.u8    q12, d13, d6                @mul (row 0)
+
+    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
+    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)
+
+    vsub.s8     d4, d4, d3                  @ref_main_idx (row 3)
+    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
+    vsub.s8     d5, d5, d3                  @ref_main_idx - 1 (row 3)
+
+    vmull.u8    q11, d10, d7                @mul (row 1)
+    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
+    vmlal.u8    q11, d17, d6                @mul (row 1)
+
+    vrshrn.i16  d24, q12, #5                @round shft (row 0)
+    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)
+
+    vsub.s8     d8, d8, d3                  @ref_main_idx (row 4)
+    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
+    vsub.s8     d9, d9, d3                  @ref_main_idx - 1 (row 4)
+
+    vmull.u8    q10, d14, d7                @mul (row 2)
+    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
+    vmlal.u8    q10, d15, d6                @mul (row 2)
+
+    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
+    add         r5,r2,r3,lsl#2
+    add         r9, r0, r9
+
+    vst1.8      d24, [r2], r3               @st (row 0)
+    vrshrn.i16  d22, q11, #5                @round shft (row 1)
+
+    vsub.s8     d4, d4, d3                  @ref_main_idx (row 5)
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
+    vsub.s8     d5, d5, d3                  @ref_main_idx - 1 (row 5)
+
+    vmull.u8    q9, d10, d7                 @mul (row 3)
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
+    vmlal.u8    q9, d11, d6                 @mul (row 3)
+
+    vst1.8      d22, [r2], r3               @st (row 1)
+    vrshrn.i16  d20, q10, #5                @round shft (row 2)
+
+    vmovn.s16   d10, q7
+    vshr.s16    q7, q7, #5
+
+    vsub.s8     d8, d8, d3                  @ref_main_idx (row 6)
+    vtbl.8      d21, {d0,d1}, d4            @load from ref_main_idx (row 5)
+    vsub.s8     d9, d9, d3                  @ref_main_idx - 1 (row 6)
+
+    vmull.u8    q12, d12, d7                @mul (row 4)
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
+    vmlal.u8    q12, d13, d6                @mul (row 4)
+
+    vst1.8      d20, [r2], r3               @st (row 2)
+    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
+
+    sub         r9, r9, #1
+    vqmovn.s16  d11, q7
+
+    vsub.s8     d4, d4, d3                  @ref_main_idx (row 7)
+    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
+    vsub.s8     d5, d5, d3                  @ref_main_idx - 1 (row 7)
+
+    vmull.u8    q11, d21, d7                @mul (row 5)
+    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
+    vmlal.u8    q11, d17, d6                @mul (row 5)
+
+    vadd.s8     d11, d27, d11               @ref_main_idx (add row)
+    vdup.8      d26, r9
+
+    vst1.8      d18, [r2], r3               @st (row 3)
+    vrshrn.i16  d24, q12, #5                @round shft (row 4)
+
+    add         r2,r3, lsl #2
+    vsub.s8     d11, d11, d2                @ref_main_idx -1 (sub 1)
+    addgt       r2, r7, r2
+
+    suble       r2, r2, r4
+
+    subs        r10, r10, #8                @subtract 8 and go to end if 8x8
+
+    bne         kernel_8_16_32
+
+epil_8_16_32:
+    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
+
+    vmull.u8    q10, d14, d7                @mul (row 6)
+    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
+    vmlal.u8    q10, d15, d6                @mul (row 6)
+
+    vst1.8      d24, [r5], r3               @st (row 4)
+    vrshrn.i16  d24, q11, #5                @round shft (row 5)
+
+    vmull.u8    q9, d10, d7                 @mul (row 7)
+    vmlal.u8    q9, d11, d6                 @mul (row 7)
+
+    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
+    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)
+
+    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
+    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)
+
+    vst1.8      d18, [r5], r3               @st (row 7)
+
+    b           end_func
+
+sz_4_proc:
+    vld1.8      d31, [r14]
+    vmov.i8     d2, #1                      @contains #1 for adding to get ref_main_idx - 1
+
+    vmov.i8     d3, #2
+    ldr         r12, idx_neg_idx_3_9_addr_2 @load least idx table
+ulbl3_2:
+    add         r12,r12,pc
+
+    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
+    sub         r7, r5, #3
+
+    add         r12, r12, r7, lsl #4
+    mov         r8, r12
+
+    ldr         r9, [r8]
+
+    vdup.8      d26, r9                     @least idx added to final idx values
+    add         r6, r0, r4, lsl #1          @pu1_ref + 2nt
+
+    vmovn.s16   d6, q11
+    sub         r6, r6, #9                  @pu1_ref + 2nt - (8 + 1): base of the 4x4 strip
+    sub         r6, r6, r9
+
+    vld1.8      {d0,d1}, [r6]               @loads the 16 ref bytes used by the vtbl lookups (from least idx)
+
+    vmov.i8     d29, #31                    @contains #31 for vand operation
+
+    vmov.i8     d28, #32
+
+    vshr.s16    q11, q11, #5
+    vqmovn.s16  d8, q11
+
+    vand        d6, d6, d29                 @fract values in d1/ idx values in d0
+    vsub.s8     d7, d28, d6                 @32-fract
+
+    vmov.i8     d27, #7                     @row 0 to 7(row-1)
+    vsub.s8     d8, d8, d2                  @ref_main_idx (add 1)
+    vsub.s8     d8, d26, d8                 @ref_main_idx
+    vadd.s8     d8, d8, d27                 @to compensate the pu1_src idx incremented by 8
+    vsub.s8     d9, d8, d2                  @ref_main_idx - 1
+
+    vsub.s8     d4, d8, d2                  @row 1 ref_main_idx
+    vsub.s8     d5, d9, d2
+
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
+
+
+    vmull.u8    q12, d12, d7                @mul (row 0)
+    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
+    vmlal.u8    q12, d13, d6                @mul (row 0)
+
+    vsub.s8     d8, d8, d3                  @idx (row 2)
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
+    vsub.s8     d9, d9, d3                  @idx+1 (row 2)
+
+    vmull.u8    q11, d16, d7                @mul (row 1)
+    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 2)
+    vmlal.u8    q11, d17, d6                @mul (row 1)
+
+    vrshrn.i16  d24, q12, #5                @round shift (row 0)
+
+    vsub.s8     d4, d4, d3                  @idx (row 3)
+    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
+    vsub.s8     d5, d5, d3                  @idx+1 (row 3)
+
+    vmull.u8    q10, d12, d7                @mul (row 2)
+    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 3)
+    vmlal.u8    q10, d13, d6                @mul (row 2)
+
+    vst1.32     d24[0], [r2], r3            @st row 0
+    vrshrn.i16  d22, q11, #5                @round shift (row 1)
+
+    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
+
+    vmull.u8    q9, d16, d7                 @mul (row 3)
+    vmlal.u8    q9, d17, d6                 @mul (row 3)
+
+    vst1.32     d22[0], [r2], r3            @st row 1
+    vrshrn.i16  d20, q10, #5                @round shift (row 2)
+
+    vst1.32     d20[0], [r2], r3            @st row 2
+
+    vrshrn.i16  d18, q9, #5                 @round shift (row 3)
+
+    vst1.32     d18[0], [r2], r3            @st (row 3)
+
+end_func:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_planar.s b/common/arm/ihevc_intra_pred_luma_planar.s
new file mode 100644
index 0000000..666798e
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_planar.s
@@ -0,0 +1,557 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_luma_planar.s
+@*
+@* @brief
+@*  contains function definitions for planar intra prediction.
+@* functions are coded using neon intrinsics and can be compiled using rvct
+@*
+@* @author
+@*  akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    luma intraprediction filter for planar input
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] pi1_coeff
+@*  word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  intra prediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
+@                                  word32 src_strd,
+@                                  uword8* pu1_dst,
+@                                  word32 dst_strd,
+@                                  word32 nt,
+@                                  word32 mode,
+@                                  word32 pi1_coeff)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
+@   pi1_coeff
+
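+@ planar prediction blends the top row and left column against the
+@ bottom-left and top-right corner samples, with a +nt rounding term and a
+@ shift of log2(nt) + 1. a rough c sketch of the per-pixel sum accumulated
+@ below (hedged illustration; clz() stands for count-leading-zeros):
+@
+@     WORD32 shift = 32 - clz(nt);                      /* log2(nt) + 1 */
+@     for(row = 0; row < nt; row++)
+@         for(col = 0; col < nt; col++)
+@             pu1_dst[row * dst_strd + col] = (UWORD8)
+@                 (((nt - 1 - col) * pu1_ref[2 * nt - 1 - row]   /* left   */
+@                   + (col + 1)      * pu1_ref[3 * nt + 1]       /* top-rt */
+@                   + (nt - 1 - row) * pu1_ref[2 * nt + 1 + col] /* top    */
+@                   + (row + 1)      * pu1_ref[nt - 1]           /* bot-lt */
+@                   + nt) >> shift);
+@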
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_planar_a9q
+.extern gau1_ihevc_planar_factor
+.extern gau1_ihevc_planar_factor_1
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor - ulbl1 - 8
+
+gau1_ihevc_planar_factor_1_addr:
+.long gau1_ihevc_planar_factor_1 - ulbl2 - 8
+
+
+.type ihevc_intra_pred_luma_planar_a9q, %function
+
+ihevc_intra_pred_luma_planar_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+    ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
+ulbl1:
+    add         r11,r11,pc
+
+    clz         r5, r4
+    rsb         r5, r5, #32
+    vdup.16     q7, r5
+    vneg.s16    q7, q7                      @shr value (so vneg)
+    vdup.8      d2, r4                      @nt
+    vdup.s16    q8, r4                      @nt
+
+    sub         r6, r4, #1                  @nt-1
+    add         r6, r6, r0
+    ldr         r7, [r6]
+    vdup.s8     d0, r7                      @src[nt-1]
+
+    add         r6, r4, r4,lsl #1           @3nt
+    add         r6, r6, #1                  @3nt + 1
+    add         r6, r6, r0
+    ldr         r7, [r6]
+    vdup.s8     d1, r7                      @src[3nt+1]
+
+    add         r6, r4, r4                  @2nt
+    add         r14, r6, #1                 @2nt+1
+    sub         r6, r6, #1                  @2nt-1
+    add         r6, r6, r0                  @&src[2nt-1]
+    add         r14, r14, r0                @&src[2nt+1]
+
+    mov         r8, #1                      @row+1 (row is first 0)
+    sub         r9, r4, r8                  @nt-1-row (row is first 0)
+
+    vdup.s8     d5, r8                      @row + 1
+    vdup.s8     d6, r9                      @nt - 1 - row
+    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
+
+    add         r12, r11, #1                @coeffs (to be reloaded after every row)
+    mov         r1, r4                      @nt (row counter) (dec after every row)
+    mov         r5, r2                      @dst (to be reloaded after every row and inc by dst_strd)
+    mov         r10, #8                     @increment for the coeffs
+    mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)
+
+    cmp         r4, #4
+    beq         tf_sz_4
+
+@@ ========== ***************** =====================
+prolog:
+tf_sz_8_16_32:
+
+    mov         r7, r4                      @column counter (set to no of cols)
+    mov         r9, r4, lsr #3              @divide nt by 8
+    mul         r7, r7, r9                  @loop count = nt * (nt >> 3) groups of 8
+    ldr         r5, gau1_ihevc_planar_factor_1_addr @loads table of coeffs
+ulbl2:
+    add         r5,r5,pc
+    sub         r6, r6, #7
+    mov         r8, r2
+    lsl         r9, r3, #3                  @8*stride
+    rsb         r9, r9, #8                  @8 - 8*stride
+    mov         r10, r4                     @nt
+    sub         r10, r10, #8                @nt - 8
+
+col_loop_8_16_32:
+
+    vld1.s8     d8, [r12]                   @(1-8)load 8 coeffs [col+1]
+    vdup.16     q6, r4                      @(1)
+    vld1.s8     d4, [r6]                    @(1-8)src[2nt-1-row]
+    vsub.s8     d9, d2, d8                  @(1-8)[nt-1-col]
+
+
+    vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]
+
+    vld1.s8     d3, [r14]                   @(1-8)load 8 src[2nt+1+col]
+    vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]
+
+    vdup.s8     d20, d4[7]                  @(1)
+    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]
+
+    vdup.s8     d21, d4[6]                  @(2)
+    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]
+
+    vdup.16     q15, r4                     @(2)
+    vadd.s8     d5, d5, d7                  @(1)
+
+    vsub.s8     d6, d6, d7                  @(1)
+
+    vdup.s8     d22, d4[5]                  @(3)
+    vmlal.u8    q15, d5, d0                 @(2)
+
+    vdup.16     q14, r4                     @(3)
+    vmlal.u8    q15, d8, d1                 @(2)
+
+    vmlal.u8    q15, d6, d3                 @(2)
+    vmlal.u8    q15, d9, d21                @(2)
+
+    vshl.s16    q6, q6, q7                  @(1)shr
+
+    vadd.s8     d5, d5, d7                  @(2)
+    vsub.s8     d6, d6, d7                  @(2)
+
+    vmovn.i16   d12, q6                     @(1)
+    vmlal.u8    q14, d5, d0                 @(3)
+
+    vdup.8      d23, d4[4]                  @(4)
+    vmlal.u8    q14, d8, d1                 @(3)
+
+    vdup.16     q5, r4                      @(4)
+    vmlal.u8    q14, d6, d3                 @(3)
+
+    vst1.s8     d12, [r2], r3               @(1)str 8 values
+    vmlal.u8    q14, d9, d22                @(3)
+
+    vshl.s16    q15, q15, q7                @(2)shr
+
+    vadd.s8     d5, d5, d7                  @(3)
+    vsub.s8     d6, d6, d7                  @(3)
+
+    vmovn.i16   d30, q15                    @(2)
+    vmlal.u8    q5, d5, d0                  @(4)
+
+    vdup.8      d20, d4[3]                  @(5)
+    vmlal.u8    q5, d8, d1                  @(4)
+
+    vdup.16     q8, r4                      @(5)
+    vmlal.u8    q5, d6, d3                  @(4)
+
+    vst1.s8     d30, [r2], r3               @(2)str 8 values
+    vmlal.u8    q5, d9, d23                 @(4)
+
+    vshl.s16    q14, q14, q7                @(3)shr
+
+    vadd.s8     d5, d5, d7                  @(4)
+    vsub.s8     d6, d6, d7                  @(4)
+
+    vmovn.i16   d28, q14                    @(3)
+    vmlal.u8    q8, d5, d0                  @(5)
+
+    vdup.8      d21, d4[2]                  @(6)
+    vmlal.u8    q8, d8, d1                  @(5)
+
+    vdup.16     q9, r4                      @(6)
+    vmlal.u8    q8, d6, d3                  @(5)
+
+    vst1.s8     d28, [r2], r3               @(3)str 8 values
+    vmlal.u8    q8, d9, d20                 @(5)
+
+    vshl.s16    q5, q5, q7                  @(4)shr
+    vadd.s8     d5, d5, d7                  @(5)
+    vsub.s8     d6, d6, d7                  @(5)
+
+    vmovn.i16   d10, q5                     @(4)
+    vmlal.u8    q9, d5, d0                  @(6)
+
+    vdup.8      d22, d4[1]                  @(7)
+    vmlal.u8    q9, d8, d1                  @(6)
+
+    vdup.16     q13, r4                     @(7)
+    vmlal.u8    q9, d6, d3                  @(6)
+
+    vst1.s8     d10, [r2], r3               @(4)str 8 values
+    vmlal.u8    q9, d9, d21                 @(6)
+
+    vshl.s16    q8, q8, q7                  @(5)shr
+
+    vadd.s8     d5, d5, d7                  @(6)
+    vsub.s8     d6, d6, d7                  @(6)
+
+    vmovn.i16   d16, q8                     @(5)
+    vmlal.u8    q13, d5, d0                 @(7)
+
+    vdup.8      d23, d4[0]                  @(8)
+    vmlal.u8    q13, d8, d1                 @(7)
+
+    vdup.16     q12, r4                     @(8)
+    vmlal.u8    q13, d6, d3                 @(7)
+
+    vst1.s8     d16, [r2], r3               @(5)str 8 values
+    vmlal.u8    q13, d9, d22                @(7)
+
+    vshl.s16    q9, q9, q7                  @(6)shr
+
+    vadd.s8     d5, d5, d7                  @(7)
+    vsub.s8     d6, d6, d7                  @(7)
+
+    vmovn.i16   d18, q9                     @(6)
+    vmlal.u8    q12, d5, d0                 @(8)
+
+
+    vmlal.u8    q12, d8, d1                 @(8)
+
+    vmlal.u8    q12, d6, d3                 @(8)
+
+    vst1.s8     d18, [r2], r3               @(6)str 8 values
+    vmlal.u8    q12, d9, d23                @(8)
+
+    vshl.s16    q13, q13, q7                @(7)shr
+
+    subs        r7, r7, #8
+
+    beq         epilog
+
+    subs        r1, r1, #8                  @row counter
+    addgt       r12, r12, #8                @col inc
+    addgt       r14, r14, #8                @also for col inc
+    movle       r1, r4                      @nt reloaded (refresh the value)
+    addle       r12, r11, #1                @r12 reset
+
+    movle       r14, r0                     @r14 reset
+    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
+
+    suble       r6, r6, #8                  @for next set of rows
+    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
+
+    addle       r5, r5, #8
+    vdup.16     q6, r4                      @(1n)(1)
+
+    vld1.s8     d5, [r5]
+
+    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
+    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]
+
+    vdup.s8     d20, d4[7]                  @(1n)(1)
+    vsub.s8     d6, d2, d5
+
+    beq         epilog
+
+kernel_plnr:
+
+    cmp         r1, #0                      @ (cond loop)
+    vshl.s16    q12, q12, q7                @(8)shr
+
+    vmovn.i16   d26, q13                    @(7)
+    vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]
+
+    vmovn.i16   d24, q12                    @(8)
+    vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]
+
+    vdup.s8     d21, d4[6]                  @(2)
+    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]
+
+    vdup.16     q15, r4                     @(2)
+    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]
+
+    vst1.s8     d26, [r2], r3               @(7)str 8 values
+    vadd.s8     d5, d5, d7                  @(1)
+
+    vst1.s8     d24, [r2], r3               @(8)str 8 values
+    vsub.s8     d6, d6, d7                  @(1)
+
+    addgt       r2, r2, r9                  @since more cols to fill, dst + 8 - 8*strd (cond loop)
+    vmlal.u8    q15, d5, d0                 @(2)
+
+    suble       r2, r2, r10                 @else go to next set of rows, dst - (nt-8) (cond loop)
+    vmlal.u8    q15, d8, d1                 @(2)
+
+    vdup.s8     d22, d4[5]                  @(3)
+    vmlal.u8    q15, d6, d3                 @(2)
+
+    vdup.16     q14, r4                     @(3)
+    vmlal.u8    q15, d9, d21                @(2)
+
+    vshl.s16    q6, q6, q7                  @(1)shr
+
+    vadd.s8     d5, d5, d7                  @(2)
+    movle       r1, r4                      @nt reloaded (refresh the value)    (cond loop)
+
+    vsub.s8     d6, d6, d7                  @(2)
+    subs        r1, r1, #8                  @row counter (loop)
+
+    vmovn.i16   d12, q6                     @(1)
+    vmlal.u8    q14, d5, d0                 @(3)
+
+    vdup.8      d23, d4[4]                  @(4)
+    vmlal.u8    q14, d8, d1                 @(3)
+
+    vdup.16     q5, r4                      @(4)
+    vmlal.u8    q14, d6, d3                 @(3)
+
+    vst1.s8     d12, [r2], r3               @(1)str 8 values
+    vmlal.u8    q14, d9, d22                @(3)
+
+    vshl.s16    q15, q15, q7                @(2)shr
+
+    vadd.s8     d5, d5, d7                  @(3)
+
+    vsub.s8     d6, d6, d7                  @(3)
+
+    vmovn.i16   d30, q15                    @(2)
+    vmlal.u8    q5, d5, d0                  @(4)
+
+    vdup.8      d20, d4[3]                  @(5)
+    vmlal.u8    q5, d8, d1                  @(4)
+
+    vdup.16     q8, r4                      @(5)
+    vmlal.u8    q5, d6, d3                  @(4)
+
+    vst1.s8     d30, [r2], r3               @(2)str 8 values
+    vmlal.u8    q5, d9, d23                 @(4)
+
+    vshl.s16    q14, q14, q7                @(3)shr
+
+    vadd.s8     d5, d5, d7                  @(4)
+
+    vsub.s8     d6, d6, d7                  @(4)
+
+    vmovn.i16   d28, q14                    @(3)
+    vmlal.u8    q8, d5, d0                  @(5)
+
+    vdup.8      d21, d4[2]                  @(6)
+    vmlal.u8    q8, d8, d1                  @(5)
+
+    vdup.16     q9, r4                      @(6)
+    vmlal.u8    q8, d6, d3                  @(5)
+
+    vst1.s8     d28, [r2], r3               @(3)str 8 values
+    vmlal.u8    q8, d9, d20                 @(5)
+
+    addle       r12, r11, #1                @r12 reset (cond loop)
+    vshl.s16    q5, q5, q7                  @(4)shr
+
+    addgt       r12, r12, #8                @col inc (cond loop)
+    vadd.s8     d5, d5, d7                  @(5)
+
+    addgt       r14, r14, #8                @also for col inc (cond loop)
+    vsub.s8     d6, d6, d7                  @(5)
+
+    vmovn.i16   d10, q5                     @(4)
+    vmlal.u8    q9, d5, d0                  @(6)
+
+    vdup.8      d22, d4[1]                  @(7)
+    vmlal.u8    q9, d8, d1                  @(6)
+
+    vdup.16     q13, r4                     @(7)
+    vmlal.u8    q9, d6, d3                  @(6)
+
+    vst1.s8     d10, [r2], r3               @(4)str 8 values
+    vmlal.u8    q9, d9, d21                 @(6)
+
+    movle       r14, r0                     @r14 reset (cond loop)
+    vshl.s16    q8, q8, q7                  @(5)shr
+
+    suble       r6, r6, #8                  @for next set of rows (cond loop)
+    vadd.s8     d5, d5, d7                  @(6)
+
+    addle       r5, r5, #8                  @ (cond loop)
+    vsub.s8     d6, d6, d7                  @(6)
+
+    vmovn.i16   d16, q8                     @(5)
+    vmlal.u8    q13, d5, d0                 @(7)
+
+    vdup.8      d23, d4[0]                  @(8)
+    vmlal.u8    q13, d8, d1                 @(7)
+
+    vdup.16     q12, r4                     @(8)
+    vmlal.u8    q13, d6, d3                 @(7)
+
+    vst1.s8     d16, [r2], r3               @(5)str 8 values
+    vmlal.u8    q13, d9, d22                @(7)
+
+    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
+    vshl.s16    q9, q9, q7                  @(6)shr
+
+    vadd.s8     d5, d5, d7                  @(7)
+
+    vsub.s8     d6, d6, d7                  @(7)
+
+    vmovn.i16   d18, q9                     @(6)
+    vmlal.u8    q12, d5, d0                 @(8)
+
+    vld1.s8     d5, [r5]                    @(row+1 value)
+    vmlal.u8    q12, d8, d1                 @(8)
+
+    vdup.s8     d20, d4[7]                  @(1n)(1)
+    vmlal.u8    q12, d6, d3                 @(8)
+
+    vst1.s8     d18, [r2], r3               @(6)str 8 values
+    vmlal.u8    q12, d9, d23                @(8)
+
+    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
+    vsub.s8     d6, d2, d5                  @(nt-1-row) value
+
+    subs        r7, r7, #8                  @col counter
+
+    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
+    vshl.s16    q13, q13, q7                @(7)shr
+
+    vdup.16     q6, r4                      @(1n)(1)
+    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]
+
+    bne         kernel_plnr
+
+epilog:
+
+    vmovn.i16   d26, q13                    @(7)
+    vst1.s8     d26, [r2], r3               @(7)str 8 values
+
+    vshl.s16    q12, q12, q7                @(8)shr
+    vmovn.i16   d24, q12                    @(8)
+    vst1.s8     d24, [r2], r3               @(8)str 8 values
+
+@@ ========== ***************** =====================
+
+    beq         end_loop
+
+tf_sz_4:
+    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
+    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
+loop_sz_4:
+    mov         r10, #4                     @reduce inc to #4 for 4x4
+    ldr         r7, [r6], #-1               @src[2nt-1-row] (dec to take into account row)
+    vdup.s8     d4, r7                      @src[2nt-1-row]
+
+    vsub.s8     d9, d2, d8                  @[nt-1-col]
+
+    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
+    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
+    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
+    vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
+@   vadd.i16    q6, q6, q8          @add (nt)
+@   vshl.s16    q6, q6, q7          @shr
+@   vmovn.i16   d12, q6
+    vrshrn.s16  d12,q6,#3
+    vst1.s32    {d12[0]}, [r2], r3
+
+    vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
+    vsub.s8     d6, d6, d7                  @[nt-1-row]--
+    subs        r1, r1, #1
+
+    bne         loop_sz_4
+
+end_loop:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_vert.s b/common/arm/ihevc_intra_pred_luma_vert.s
new file mode 100644
index 0000000..5eeaeb3
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_vert.s
@@ -0,0 +1,421 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_intra_pred_luma_vert.s
+@*
+@* @brief
+@*  contains function definitions for vertical intra prediction.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@*  akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*    luma intra prediction for the vertical mode
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@*  uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] nt
+@*  size of transform block
+@*
+@* @param[in] mode
+@*  type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_ver(uword8* pu1_ref,
+@                               word32 src_strd,
+@                               uword8* pu1_dst,
+@                               word32 dst_strd,
+@                               word32 nt,
+@                               word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@   nt
+@   mode
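+
+@ Reference model (a hedged C sketch reconstructed from the code below; not
+@ the compiled path). Vertical prediction copies the top reference row into
+@ every output row; for nt < 32 the first column is additionally filtered
+@ against the left neighbours with clipping. CLIP_U8 here just denotes
+@ clipping to the 8-bit range; the name is illustrative.
+@
+@   for(row = 0; row < nt; row++)
+@       for(col = 0; col < nt; col++)
+@           pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt + 1 + col];
+@   if(nt < 32)
+@       for(row = 0; row < nt; row++)
+@           pu1_dst[row * dst_strd] = CLIP_U8(pu1_ref[2 * nt + 1] +
+@               ((pu1_ref[2 * nt - 1 - row] - pu1_ref[2 * nt]) >> 1));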
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_ver_a9q
+
+.type ihevc_intra_pred_luma_ver_a9q, %function
+
+ihevc_intra_pred_luma_ver_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @loads nt
+
+    lsl         r5, r4, #1                  @2nt
+
+    cmp         r4, #16
+    beq         blk_16
+    blt         blk_4_8
+
+    add         r5, r5, #1                  @2nt+1
+    add         r6, r0, r5                  @&src[2nt+1]
+
+copy_32:
+    add         r5, r2, r3
+    vld1.8      {d20,d21}, [r6]!            @16 loads (col 0:15)
+    add         r8, r5, r3
+
+    add         r10, r8, r3
+    vld1.8      {d22,d23}, [r6]             @16 loads (col 16:31)
+    lsl         r11, r3, #2
+
+    add         r11, r11, #0xfffffff0       @r11 = 4*stride - 16 (net advance per pointer: 4*stride)
+    vst1.8      {d20,d21}, [r2]!
+    vst1.8      {d20,d21}, [r5]!
+    vst1.8      {d20,d21}, [r8]!
+    vst1.8      {d20,d21}, [r10]!
+
+    vst1.8      {d22,d23}, [r2], r11
+    vst1.8      {d22,d23}, [r5], r11
+    vst1.8      {d22,d23}, [r8], r11
+    vst1.8      {d22,d23}, [r10], r11
+
+    subs        r4, r4, #8
+
+kernel_copy_32:
+    vst1.8      {d20,d21}, [r2]!
+    vst1.8      {d20,d21}, [r5]!
+    vst1.8      {d20,d21}, [r8]!
+    vst1.8      {d20,d21}, [r10]!
+
+    vst1.8      {d22,d23}, [r2], r11
+    vst1.8      {d22,d23}, [r5], r11
+    vst1.8      {d22,d23}, [r8], r11
+    vst1.8      {d22,d23}, [r10], r11
+
+    subs        r4, r4, #8
+
+    vst1.8      {d20,d21}, [r2]!
+    vst1.8      {d20,d21}, [r5]!
+    vst1.8      {d20,d21}, [r8]!
+    vst1.8      {d20,d21}, [r10]!
+
+    vst1.8      {d22,d23}, [r2], r11
+    vst1.8      {d22,d23}, [r5], r11
+    vst1.8      {d22,d23}, [r8], r11
+    vst1.8      {d22,d23}, [r10], r11
+
+    bne         kernel_copy_32
+
+    vst1.8      {d20,d21}, [r2]!
+    vst1.8      {d20,d21}, [r5]!
+    vst1.8      {d20,d21}, [r8]!
+    vst1.8      {d20,d21}, [r10]!
+
+    vst1.8      {d22,d23}, [r2], r11
+    vst1.8      {d22,d23}, [r5], r11
+    vst1.8      {d22,d23}, [r8], r11
+    vst1.8      {d22,d23}, [r10], r11
+
+    b           end_func
+
+blk_16:
+    add         r6, r0, r5                  @&src[2nt]
+
+    ldrb        r11, [r6], #1               @src[2nt]
+
+    vdup.8      q11, r11                    @src[2nt]
+    ldrb        r12, [r6]                   @src[2nt+1]
+
+    vld1.8      {d16,d17}, [r6]             @ld for repl to cols src[2nt+1+col(0:15)] (0 ignored for stores)
+    add         r6, r6, #0xffffffef         @subtract 17 to take it to src[2nt-1-row(15)]
+
+    vdup.8      q12, r12                    @src[2nt+1]
+    vdup.16     q15, r12
+    lsl         r5, r3, #3                  @8*stride
+
+    vld1.8      {d26,d27}, [r6]!            @load src[2nt-1-row](rows 0:15)
+    add         r5, r2, r5                  @r5 -> dst + 8*dst_strd (rows 8:15)
+
+    vmov.i64    d18, #0x00000000000000ff
+    vhsub.u8    q13, q13, q11               @(src[2nt-1-row] - src[2nt])>>1
+    @vsubl.u8   q0, d26, d22
+    @vsubl.u8   q14, d27, d22
+
+    @vshr.s16   q0, q0, #1
+    @vshr.s16   q14, q14, #1
+
+    vmov.i64    d19, d17
+    @vaddl.s8   q0, d24, d26
+    vmovl.s8    q0, d26
+    vmovl.s8    q14, d27
+    vqadd.s16   q0, q0, q15
+    vqadd.s16   q14, q14, q15
+
+    vmov.i64    d10, #0x00000000000000ff
+    @vaddl.s8   q1, d25, d27
+
+    vqmovun.s16 d25, q0
+    vqmovun.s16 d24, q14
+    @vmovn.u16  d25, q0
+    @vmovn.u16  d24, q1
+
+
+    vrev64.8    q12, q12
+
+    vmov.i64    d11, d17
+
+    vbsl        d18, d24, d16               @only select row values from q12(predpixel)
+    vbsl        d10, d25, d16
+
+    vmov.i64    d8, #0x00000000000000ff
+    vmov.i64    d9, d17
+
+    vmov.i64    d6, #0x00000000000000ff
+    vmov.i64    d7, d17
+
+    vst1.8      {d18,d19}, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vst1.8      {d10,d11}, [r5], r3
+    vshr.s64    d25, d25, #8
+
+
+    vbsl        d8, d24, d16
+    vbsl        d6, d25, d16
+
+    vst1.8      {d8,d9}, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vst1.8      {d6,d7}, [r5], r3
+    vshr.s64    d25, d25, #8
+
+    subs        r4, #8
+
+    vmov.i64    d18, #0x00000000000000ff
+    @vmov.i64   d19, d17
+
+    vmov.i64    d10, #0x00000000000000ff
+    @vmov.i64   d11, d17
+
+
+loop_16:
+
+
+    vmov.i64    d8, #0x00000000000000ff
+
+    vmov.i64    d6, #0x00000000000000ff
+
+    vbsl        d18, d24, d16               @only select row values from q12(predpixel)
+    vbsl        d10, d25, d16
+
+    vst1.8      {d18,d19}, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vst1.8      {d10,d11}, [r5], r3
+    vshr.s64    d25, d25, #8
+
+    vmov.i64    d18, #0x00000000000000ff
+
+    vmov.i64    d10, #0x00000000000000ff
+
+    vbsl        d8, d24, d16
+    vbsl        d6, d25, d16
+
+    vst1.8      {d8,d9}, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vst1.8      {d6,d7}, [r5], r3
+    vshr.s64    d25, d25, #8
+
+    subs        r4, r4, #4
+
+    bne         loop_16
+
+    vmov.i64    d8, #0x00000000000000ff
+
+    vmov.i64    d6, #0x00000000000000ff
+
+    vbsl        d18, d24, d16               @only select row values from q12(predpixel)
+    vbsl        d10, d25, d16
+
+    vst1.8      {d18,d19}, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vst1.8      {d10,d11}, [r5], r3
+    vshr.s64    d25, d25, #8
+
+    vbsl        d8, d24, d16
+    vbsl        d6, d25, d16
+
+    vst1.8      {d8,d9}, [r2], r3
+
+    vst1.8      {d6,d7}, [r5], r3
+
+    b           end_func
+
+
+blk_4_8:
+    vmov.i64    d11, #0x00000000000000ff
+    add         r6, r0, r5                  @&src[2nt]
+
+    vmov.i64    d10, #0x00000000000000ff
+    ldrb        r11, [r6], #1               @src[2nt]
+
+    vdup.8      d22, r11                    @src[2nt]
+    ldrb        r12, [r6]                   @src[2nt+1]
+
+    vld1.8      d16, [r6]                   @ld for repl to cols src[2nt+1+col(0:3 or 0:7)](0 ignored for st)
+    add         r6, r6, #0xfffffff7         @subtract 9 to take it to src[2nt-1-row(7)]
+
+    vdup.8      d24, r12                    @src[2nt+1]
+    vdup.16     q15, r12
+
+    vld1.8      d26, [r6]!                  @load src[2nt-1-row](rows 0:7)
+
+    vmov.i64    d18, #0x00000000000000ff
+    vhsub.u8    d26, d26, d22               @(src[2nt-1-row] - src[2nt])>>1
+    @vsubl.u8   q13, d26, d22
+
+    @vshr.s16   q13, q13, #1
+
+    vmov.i64    d19, #0x00000000000000ff
+    vmovl.s8    q13, d26
+    @vaddl.s8   q0, d24, d26
+    vqadd.s16   q0, q13, q15
+
+    vqmovun.s16 d24, q0
+    @vmovn.s16  d24, q0
+
+    vrev64.8    d24, d24
+
+    cmp         r4, #4
+    beq         blk_4
+
+    vbsl        d18, d24, d16               @only select row values from q12(predpixel)
+
+    vst1.8      d18, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vmov.i64    d18, #0x00000000000000ff
+
+    vbsl        d19, d24, d16
+
+    vst1.8      d19, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vmov.i64    d19, #0x00000000000000ff
+
+    vbsl        d10, d24, d16
+
+    vst1.8      d10, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vmov.i64    d10, #0x00000000000000ff
+
+    vbsl        d11, d24, d16
+
+    vst1.8      d11, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vmov.i64    d11, #0x00000000000000ff
+
+    vbsl        d18, d24, d16               @only select row values from q12(predpixel)
+
+    vst1.8      d18, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vbsl        d19, d24, d16
+
+    vst1.8      d19, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vbsl        d10, d24, d16
+
+    vst1.8      d10, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vbsl        d11, d24, d16
+
+    vst1.8      d11, [r2], r3
+    vshr.s64    d24, d24, #8
+
+    b           end_func
+
+
+blk_4:
+    vbsl        d18, d24, d16               @only select row values from q12(predpixel)
+
+    vst1.32     d18[0], [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vbsl        d19, d24, d16
+
+    vst1.32     d19[0], [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vbsl        d10, d24, d16
+
+    vst1.32     d10[0], [r2], r3
+    vshr.s64    d24, d24, #8
+
+    vbsl        d11, d24, d16
+    vst1.32     d11[0], [r2], r3
+
+
+end_func:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
diff --git a/common/arm/ihevc_intra_ref_substitution_a9q.c b/common/arm/ihevc_intra_ref_substitution_a9q.c
new file mode 100644
index 0000000..e100893
--- /dev/null
+++ b/common/arm/ihevc_intra_ref_substitution_a9q.c
@@ -0,0 +1,777 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_intra_ref_substitution_a9q.c
+*
+* @brief
+*  Contains ref substitution functions
+*
+* @author
+*  Naveen
+*
+* @par List of Functions:
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_chroma_intra_pred.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_defs.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_macros.h"
+
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+#define T16C_4NT 64
+#define T8C_4NT 32
+/****************************************************************************/
+/* Function Macros                                                          */
+/****************************************************************************/
+
+#define GET_BIT(y,x) (((y) >> (x)) & 1)
+#define GET_BITS(y,x) (((y) >> (x)) & 1)
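+
+/* Illustrative sketch of how the nbr_flags word used throughout this file is
+ * laid out (see the "Neighbor Flag Structure" comments below). This helper is
+ * hypothetical and not called by the decoder; it only documents the fields. */
+static WORD32 ihevc_unpack_nbr_flags_sketch(WORD32 nbr_flags)
+{
+    WORD32 tp_left  = GET_BIT(nbr_flags, 16);    /* 1 bit : top-left     */
+    WORD32 tp_right = (nbr_flags >> 12) & 0xF;   /* 4 bits: top-right    */
+    WORD32 top      = (nbr_flags >> 8) & 0xF;    /* 4 bits: top          */
+    WORD32 left     = (nbr_flags >> 4) & 0xF;    /* 4 bits: left         */
+    WORD32 bot_left = nbr_flags & 0xF;           /* 4 bits: bottom-left  */
+
+    /* nonzero iff every neighbour group is at least partly available */
+    return tp_left && tp_right && top && left && bot_left;
+}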
+/**
+*******************************************************************************
+*
+* @brief
+*  Reference substitution process for samples unavailable  for prediction
+* Refer to section 8.4.4.2.2
+*
+* @par Description:
+*
+*
+* @param[in] pu1_top_left
+*  UWORD8 pointer to the top-left
+*
+* @param[in] pu1_top
+*  UWORD8 pointer to the top
+*
+* @param[in] pu1_left
+*  UWORD8 pointer to the left
+*
+* @param[in] src_strd
+*  WORD32 Source stride
+*
+* @param[in] nbr_flags
+*  WORD32 neighbor availability flags
+*
+* @param[in] nt
+*  WORD32 transform Block size
+*
+* @param[in] dst_strd
+*  WORD32 Destination stride
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_ref_substitution_a9q(UWORD8 *pu1_top_left,
+                                                  UWORD8 *pu1_top,
+                                                  UWORD8 *pu1_left,
+                                                  WORD32 src_strd,
+                                                  WORD32 nt,
+                                                  WORD32 nbr_flags,
+                                                  UWORD8 *pu1_dst,
+                                                  WORD32 dst_strd)
+{
+    UWORD8 pu1_ref_u, pu1_ref_v;
+    WORD32 dc_val, i, j;
+    WORD32 total_samples = (4 * nt) + 1;
+    WORD32 get_bits;
+    WORD32 next;
+    WORD32 bot_left, left, top, tp_right, tp_left;
+    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+    WORD32 a_nbr_flag[5];
+    UNUSED(dst_strd);
+    /* Neighbor Flag Structure*/
+    /* WORD32 nbr_flags MSB-->LSB   TOP LEFT | TOP-RIGHT |  TOP   | LEFT    | BOTTOM LEFT*/
+    /*                              (1 bit)     (4 bits)  (4 bits) (4 bits)  (4 bits)  */
+
+    if(nbr_flags == 0)
+    {
+/* If no neighbor flags are present, fill the neighbor samples with DC value */
+        /*dc_val = 1 << (BIT_DEPTH - 1);*/
+        dc_val = 1 << (8 - 1);
+        for(i = 0; i < (2 * total_samples); i++)
+        {
+            pu1_dst[i] = dc_val;
+        }
+    }
+    else
+    {
+        /* Else fill the corresponding samples */
+
+        /* Check for the neighbors' availability */
+        tp_left     = (nbr_flags & 0x10000);
+        tp_right    = (nbr_flags & 0x0f000);
+        top         = (nbr_flags & 0x00f00);
+        left        = (nbr_flags & 0x000f0);
+        bot_left    = (nbr_flags & 0x0000f);
+
+        /* Fill nbrs depending on availability */
+        /* Top -Left nbrs  */
+        if(0 != tp_left)
+        {
+            pu1_dst[(4 * nt)] = *pu1_top_left; // U top-left sample
+            pu1_dst[(4 * nt) + 1] = *(pu1_top_left + 1); // V top-left sample
+        }
+        /* Left nbrs  */
+        if(0 != left)
+        {
+            for(i = 0, j = 0; i < (2 * nt); i += 2)
+            {
+                pu1_dst[(4 * nt) - 2 - i] = pu1_left[j * src_strd]; // U left samples
+                pu1_dst[(4 * nt) - 1 - i] = pu1_left[(j * src_strd) + 1]; // V left samples
+                j++;
+            }
+        }
+        /* Bottom - Left nbrs  */
+        if(0 != bot_left)
+        {
+            for(i = (2 * nt), j = nt; i < (4 * nt); i += 2)
+            {
+                pu1_dst[(4 * nt) - 2 - i] = pu1_left[j * src_strd]; // U left samples
+                pu1_dst[(4 * nt) - 1 - i] = pu1_left[(j * src_strd) + 1]; // V left samples
+                j++;
+            }
+        }
+        /* Top nbrs  */
+        if(0 != top)
+        {
+            ihevc_memcpy_mul_8_a9q(&pu1_dst[(4 * nt) + 2], pu1_top, 2 * nt);
+            // U-V interleaved Top-top right samples
+        }
+
+        /* Top - Right nbrs  */
+        if(0 != tp_right)
+        {
+            ihevc_memcpy_mul_8_a9q(&pu1_dst[(4 * nt) + 2 + 2 * nt], pu1_top + 2 * nt, 2 * nt);
+            // U-V interleaved Top-top right samples
+        }
+
+        if(nt == 4)
+        {
+            /* 1 bit extraction for all the neighboring blocks */
+            tp_left = (nbr_flags & 0x10000) >> 16;
+            bot_left = (nbr_flags & 0x8) >> 3;
+            left = (nbr_flags & 0x80) >> 7;
+            top = (nbr_flags & 0x100) >> 8;
+            tp_right = (nbr_flags & 0x1000) >> 12;
+
+            next = 1;
+            a_nbr_flag[0] = bot_left;
+            a_nbr_flag[1] = left;
+            a_nbr_flag[2] = tp_left;
+            a_nbr_flag[3] = top;
+            a_nbr_flag[4] = tp_right;
+
+            /* If bottom-left is not available, do the reverse substitution process */
+            if(bot_left == 0)
+            {
+                /* Check for the 1st available sample from bottom-left*/
+                while(!a_nbr_flag[next])
+                    next++;
+
+                /* If Left, top-left are available*/
+                if(next <= 2)
+                {
+                    UWORD16 *pu2_dst;
+                    idx = (nt * next);
+                    pu2_dst = (UWORD16 *)&pu1_dst[2 * idx];
+                    ihevc_memset_16bit_a9q((UWORD16 *)pu1_dst, pu2_dst[0], idx);
+                }
+                else /* If top, top-right are available */
+                {
+                    UWORD16 *pu2_dst;
+                    /* Idx is changed to copy 1 pixel value for top-left, if top-left is not available */
+                    idx = (nt * (next - 1)) + 1;
+                    pu2_dst = (UWORD16 *)&pu1_dst[2 * idx];
+                    ihevc_memset_16bit_a9q((UWORD16 *)pu1_dst, pu2_dst[0], idx);
+                }
+            }
+
+            if(left == 0)
+            {
+                UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(2 * nt) - 2];
+                ihevc_memset_16bit_a9q((UWORD16 *)&pu1_dst[(2 * nt)], pu2_dst[0], nt);
+
+
+            }
+            if(tp_left == 0)
+            {
+                pu1_dst[4 * nt] = pu1_dst[(4 * nt) - 2];
+                pu1_dst[(4 * nt) + 1] = pu1_dst[(4 * nt) - 1];
+            }
+            if(top == 0)
+            {
+                UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(4 * nt)];
+                ihevc_memset_16bit_a9q((UWORD16 *)&pu1_dst[(4 * nt) + 2], pu2_dst[0], nt);
+
+
+            }
+            if(tp_right == 0)
+            {
+                UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(6 * nt)];
+                ihevc_memset_16bit_a9q((UWORD16 *)&pu1_dst[(6 * nt) + 2], pu2_dst[0], nt);
+
+
+            }
+        }
+        else if(nt == 8)
+        {
+            WORD32 nbr_flags_temp = 0;
+            nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
+                            + ((nbr_flags & 0x300) >> 4)
+                            + ((nbr_flags & 0x3000) >> 6)
+                            + ((nbr_flags & 0x10000) >> 8);
+
+            /* compute trailing zeros based on nbr_flag for the substitution process of bottom-left; see section 8.4.4.2.2 */
+            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 4; /* for bottom left and left */
+                if(nbr_id_from_bl == 32)
+                    nbr_id_from_bl = 16;
+                if(nbr_id_from_bl == 16)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags_temp >> 8) & 0x1))
+                    {
+                        nbr_id_from_bl++;
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 4; /* top and top right;  8 pels per nbr bit */
+
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref_u = pu1_dst[2 * nbr_id_from_bl];
+                    pu1_ref_v = pu1_dst[(2 * nbr_id_from_bl) + 1];
+                    for(i = 2 * (nbr_id_from_bl - 1); i >= 0; i -= 2)
+                    {
+                        pu1_dst[i] = pu1_ref_u;
+                        pu1_dst[i + 1] = pu1_ref_v;
+                    }
+                }
+            }
+
+            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+            while(nbr_id_from_bl < ((T8C_4NT)+1))
+            {
+                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
+                /* Divide by 4 to obtain the original index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T8C_4NT / 2))
+                {
+                    get_bits = GET_BIT(nbr_flags_temp, 8);
+
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                    {
+                        pu1_dst[2 * nbr_id_from_bl] = pu1_dst[(2 * nbr_id_from_bl) - 2];
+                        pu1_dst[(2 * nbr_id_from_bl) + 1] = pu1_dst[(2 * nbr_id_from_bl) - 1];
+                    }
+                }
+                else
+                {
+                    get_bits = GET_BIT(nbr_flags_temp, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        UWORD16 *pu2_dst;
+                        /* 8 pel substitution (other than TL) */
+                        pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
+                        ihevc_memset_16bit_a9q((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+                    }
+
+                }
+                nbr_id_from_bl += (nbr_id_from_bl == (T8C_4NT / 2)) ? 1 : 4;
+            }
+
+        }
+        else if(nt == 16)
+        {
+            /* compute trailing zeros based on nbr_flag for the substitution process of bottom-left; see section 8.4.4.2.2 */
+            /* as each bit in nbr flags corresponds to 4 pels for bot_left, left, top and topright but 1 pel for topleft */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 4; /* for bottom left and left */
+
+                if(nbr_id_from_bl == 32)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags >> 16) & 0x1))
+                    {
+                        /* top left not available */
+                        nbr_id_from_bl++;
+                        /* top and top right;  4 pels per nbr bit */
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 4;
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref_u = pu1_dst[2 * nbr_id_from_bl];
+                    pu1_ref_v = pu1_dst[2 * nbr_id_from_bl + 1];
+                    for(i = (2 * (nbr_id_from_bl - 1)); i >= 0; i -= 2)
+                    {
+                        pu1_dst[i] = pu1_ref_u;
+                        pu1_dst[i + 1] = pu1_ref_v;
+                    }
+                }
+            }
+
+            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+            while(nbr_id_from_bl < ((T16C_4NT)+1))
+            {
+                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
+                /* Divide by 4 to obtain the original index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T16C_4NT / 2))
+                {
+                    get_bits = GET_BIT(nbr_flags, 16);
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                    {
+                        pu1_dst[2 * nbr_id_from_bl] = pu1_dst[(2 * nbr_id_from_bl) - 2];
+                        pu1_dst[(2 * nbr_id_from_bl) + 1] = pu1_dst[(2 * nbr_id_from_bl) - 1];
+                    }
+                }
+                else
+                {
+                    get_bits = GET_BIT(nbr_flags, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        UWORD16 *pu2_dst;
+                        /* 4 pel substitution (other than TL) */
+                        pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
+                        ihevc_memset_16bit_a9q((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+                    }
+
+                }
+                nbr_id_from_bl += (nbr_id_from_bl == (T16C_4NT / 2)) ? 1 : 4;
+            }
+        }
+    }
+}
+
+
+void ihevc_intra_pred_luma_ref_substitution_a9q(UWORD8 *pu1_top_left,
+                                                UWORD8 *pu1_top,
+                                                UWORD8 *pu1_left,
+                                                WORD32 src_strd,
+                                                WORD32 nt,
+                                                WORD32 nbr_flags,
+                                                UWORD8 *pu1_dst,
+                                                WORD32 dst_strd)
+{
+    UWORD8 pu1_ref;
+    WORD32 dc_val, i;
+    WORD32 total_samples = (4 * nt) + 1;
+    WORD32 two_nt = 2 * nt;
+
+    WORD32 three_nt = 3 * nt;
+    WORD32 get_bits;
+    WORD32 next;
+    WORD32 bot_left, left, top, tp_right, tp_left;
+
+    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+    UNUSED(dst_strd);
+    /*dc_val = 1 << (BIT_DEPTH - 1);*/
+    dc_val = 1 << (8 - 1);
+
+
+    /* Neighbor Flag Structure*/
+    /* MSB ---> LSB */
+    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
+              1         4         4     4         4
+     */
+    /* If no neighbor flags are present, fill the neighbor samples with DC value */
+    if(nbr_flags == 0)
+    {
+        for(i = 0; i < total_samples; i++)
+        {
+            pu1_dst[i] = dc_val;
+        }
+    }
+    else
+    {
+        if(nt <= 8)
+        {
+            /* 1 bit extraction for all the neighboring blocks */
+            tp_left = (nbr_flags & 0x10000) >> 16;
+            bot_left = (nbr_flags & 0x8) >> 3;
+            left = (nbr_flags & 0x80) >> 7;
+            top = (nbr_flags & 0x100) >> 8;
+            tp_right = (nbr_flags & 0x1000) >> 12;
+
+            /* Else fill the corresponding samples */
+            if(tp_left)
+                pu1_dst[two_nt] = *pu1_top_left;
+            else
+                pu1_dst[two_nt] = 0;
+
+
+            if(left)
+            {
+                for(i = 0; i < nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset_a9q(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+            }
+
+
+            if(bot_left)
+            {
+                for(i = nt; i < two_nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset_a9q(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+            }
+
+
+            if(top)
+            {
+                ihevc_memcpy_a9q(&pu1_dst[two_nt + 1], pu1_top, nt);
+            }
+            else
+            {
+                ihevc_memset_a9q(&pu1_dst[two_nt + 1], 0, nt);
+            }
+
+            if(tp_right)
+            {
+                ihevc_memcpy_a9q(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+            }
+            else
+            {
+                ihevc_memset_a9q(&pu1_dst[two_nt + 1 + nt], 0, nt);
+            }
+            next = 1;
+
+            /* If bottom-left is not available, do the reverse substitution process */
+            if(bot_left == 0)
+            {
+                WORD32 a_nbr_flag[5];
+                a_nbr_flag[0] = bot_left;
+                a_nbr_flag[1] = left;
+                a_nbr_flag[2] = tp_left;
+                a_nbr_flag[3] = top;
+                a_nbr_flag[4] = tp_right;
+
+                /* Check for the 1st available sample from bottom-left*/
+                while(!a_nbr_flag[next])
+                    next++;
+
+                /* If Left, top-left are available*/
+                if(next <= 2)
+                {
+                    idx = nt * next;
+                    pu1_ref = pu1_dst[idx];
+                    for(i = 0; i < idx; i++)
+                        pu1_dst[i] = pu1_ref;
+                }
+                else /* If top, top-right are available */
+                {
+                    /* Idx is changed to copy 1 pixel value for top-left, if top-left is not available */
+                    idx = (nt * (next - 1)) + 1;
+                    pu1_ref = pu1_dst[idx];
+                    for(i = 0; i < idx; i++)
+                        pu1_dst[i] = pu1_ref;
+                }
+            }
+
+            /* Forward Substitution Process */
+            /* If left is Unavailable, copy the last bottom-left value */
+            if(left == 0)
+            {
+                ihevc_memset_a9q(&pu1_dst[nt], pu1_dst[nt - 1], nt);
+
+            }
+            /* If top-left is Unavailable, copy the last left value */
+            if(tp_left == 0)
+                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
+            /* If top is Unavailable, copy the last top-left value */
+            if(top == 0)
+            {
+                ihevc_memset_a9q(&pu1_dst[two_nt + 1], pu1_dst[two_nt], nt);
+            }
+            /* If top-right is Unavailable, copy the last top value */
+            if(tp_right == 0)
+            {
+                ihevc_memset_a9q(&pu1_dst[three_nt + 1], pu1_dst[three_nt], nt);
+
+            }
+        }
+
+        if(nt == 16)
+        {
+            WORD32 nbr_flags_temp = 0;
+            nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
+                            + ((nbr_flags & 0x300) >> 4)
+                            + ((nbr_flags & 0x3000) >> 6)
+                            + ((nbr_flags & 0x10000) >> 8);
+
+            /* Else fill the corresponding samples */
+            if(nbr_flags & 0x10000)
+                pu1_dst[two_nt] = *pu1_top_left;
+            else
+                pu1_dst[two_nt] = 0;
+
+            if(nbr_flags & 0xC0)
+            {
+                for(i = 0; i < nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset_mul_8_a9q(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+            }
+
+            if(nbr_flags & 0xC)
+            {
+                for(i = nt; i < two_nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset_mul_8_a9q(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+            }
+
+
+            if(nbr_flags & 0x300)
+            {
+                ihevc_memcpy_mul_8_a9q(&pu1_dst[two_nt + 1], pu1_top, nt);
+            }
+            else
+            {
+                ihevc_memset_mul_8_a9q(&pu1_dst[two_nt + 1], 0, nt);
+            }
+
+            if(nbr_flags & 0x3000)
+            {
+                ihevc_memcpy_mul_8_a9q(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+            }
+            else
+            {
+                ihevc_memset_mul_8_a9q(&pu1_dst[two_nt + 1 + nt], 0, nt);
+            }
+            /* compute trailing zeros based on nbr_flag for the substitution process of bottom-left; see section 8.4.4.2.2 */
+            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
+
+                if(nbr_id_from_bl == 64)
+                    nbr_id_from_bl = 32;
+
+                if(nbr_id_from_bl == 32)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags_temp >> 8) & 0x1))
+                    {
+                        nbr_id_from_bl++;
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right;  8 pels per nbr bit */
+                        //nbr_id_from_bl += idx * 8;
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref = pu1_dst[nbr_id_from_bl];
+                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+                    {
+                        pu1_dst[i] = pu1_ref;
+                    }
+                }
+            }
+
+            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+            while(nbr_id_from_bl < ((T16_4NT) + 1))
+            {
+                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
+                /* Divide by 8 to obtain the original index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T16_4NT / 2))
+                {
+                    get_bits = GET_BITS(nbr_flags_temp, 8);
+
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+                }
+                else
+                {
+                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        /* 8 pel substitution (other than TL) */
+                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+                        ihevc_memset_mul_8_a9q(pu1_dst + nbr_id_from_bl, pu1_ref, 8);
+
+
+                    }
+
+                }
+                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
+            }
+
+
+        }
+
+        if(nt == 32)
+        {
+            /* Else fill the corresponding samples */
+            if(nbr_flags & 0x10000)
+                pu1_dst[two_nt] = *pu1_top_left;
+            else
+                pu1_dst[two_nt] = 0;
+
+            if(nbr_flags & 0xF0)
+            {
+                for(i = 0; i < nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset_mul_8_a9q(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+            }
+
+            if(nbr_flags & 0xF)
+            {
+                for(i = nt; i < two_nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset_mul_8_a9q(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+            }
+
+
+            if(nbr_flags & 0xF00)
+            {
+                ihevc_memcpy_mul_8_a9q(&pu1_dst[two_nt + 1], pu1_top, nt);
+            }
+            else
+            {
+                ihevc_memset_mul_8_a9q(&pu1_dst[two_nt + 1], 0, nt);
+            }
+
+            if(nbr_flags & 0xF000)
+            {
+                ihevc_memcpy_mul_8_a9q(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+            }
+            else
+            {
+                ihevc_memset_mul_8_a9q(&pu1_dst[two_nt + 1 + nt], 0, nt);
+            }
+            /* compute trailing zeros based on nbr_flag for the substitution process of bottom-left; see section 8.4.4.2.2 */
+            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
+
+                if(nbr_id_from_bl == 64)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags >> 16) & 0x1))
+                    {
+                        /* top left not available */
+                        nbr_id_from_bl++;
+                        /* top and top right;  8 pels per nbr bit */
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref = pu1_dst[nbr_id_from_bl];
+                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+                        pu1_dst[i] = pu1_ref;
+                }
+            }
+
+            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+            while(nbr_id_from_bl < ((T32_4NT) + 1))
+            {
+                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
+                /* Divide by 8 to obtain the original index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T32_4NT / 2))
+                {
+                    get_bits = GET_BITS(nbr_flags, 16);
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+                }
+                else
+                {
+                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        /* 8 pel substitution (other than TL) */
+                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+                        ihevc_memset_mul_8_a9q(&pu1_dst[nbr_id_from_bl], pu1_ref, 8);
+
+                    }
+
+                }
+                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
+            }
+        }
+
+    }
+}
diff --git a/common/arm/ihevc_itrans_recon_16x16.s b/common/arm/ihevc_itrans_recon_16x16.s
new file mode 100644
index 0000000..82055ad
--- /dev/null
+++ b/common/arm/ihevc_itrans_recon_16x16.s
@@ -0,0 +1,1141 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ *  ihevc_itrans_recon_16x16.s
+@ *
+@ * @brief
+@ *  contains function definitions for single stage  inverse transform
+@ *
+@ * @author
+@ * anand s
+@ *
+@ * @par list of functions:
+@ *  - ihevc_itrans_recon_16x16()
+@ *
+@ * @remarks
+@ *  none
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ *  this function performs inverse transform and reconstruction for 16x16
+@ * input block
+@ *
+@ * @par description:
+@ *  performs inverse transform and adds the prediction  data and clips output
+@ * to 8 bit
+@ *
+@ * @param[in] pi2_src
+@ *  input 16x16 coefficients
+@ *
+@ * @param[in] pi2_tmp
+@ *  temporary 16x16 buffer for storing inverse
+@ *  transform 1st stage output
+@ *
+@ * @param[in] pu1_pred
+@ *  prediction 16x16 block
+@ *
+@ * @param[out] pu1_dst
+@ *  output 16x16 block
+@ *
+@ * @param[in] src_strd
+@ *  input stride
+@ *
+@ * @param[in] pred_strd
+@ *  prediction stride
+@ *
+@ * @param[in] dst_strd
+@ *  output stride
+@ *
+@ * @param[in] shift
+@ *  output shift
+@ *
+@ * @param[in] r12
+@ *  bitmask of zero columns in pi2_src
+@ *
+@ * @returns  void
+@ *
+@ * @remarks
+@ *  none
+@ *
+@ *******************************************************************************
+@ */
+
+@void ihevc_itrans_recon_16x16(word16 *pi2_src,
+@                            word16 *pi2_tmp,
+@                            uword8 *pu1_pred,
+@                            uword8 *pu1_dst,
+@                            word32 src_strd,
+@                            word32 pred_strd,
+@                            word32 dst_strd,
+@                            word32 r12,            /* zero_cols */
+@                            word32 r11             /* zero_rows */ )
+
+@**************variables vs registers*************************
+@   r0 => *pi2_src
+@   r1 => *pi2_tmp
+@   r2 => *pu1_pred
+@   r3 => *pu1_dst
+@   src_strd
+@   pred_strd
+@   dst_strd
+@   r12
+@   r11
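+
+@ Note (a hedged reading of the checks below): r12 (zero_cols) and r11
+@ (zero_rows) are 16-bit masks in which a set bit i marks column/row i of
+@ pi2_src as all-zero, letting whole groups of columns or rows be skipped.
+@ For example, r12 >= 0xfff0 means columns 4..15 are zero, so only the
+@ first four columns feed the first stage (zero_12cols_decision).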
+
+.text
+.align 4
+
+
+
+
+
+
+.set shift_stage1_idct ,   7
+.set shift_stage2_idct ,   12
+@#define zero_cols       r12
+@#define zero_rows       r11
+.globl ihevc_itrans_recon_16x16_a9q
+
+.extern g_ai2_ihevc_trans_16_transpose
+
+g_ai2_ihevc_trans_16_transpose_addr:
+.long g_ai2_ihevc_trans_16_transpose - ulbl1 - 8
+
+.type ihevc_itrans_recon_16x16_a9q, %function
+
+ihevc_itrans_recon_16x16_a9q:
+
+    stmfd       sp!,{r4-r12,lr}
+@   add             sp,sp,#40
+
+
+
+@   ldr         r8,[sp,#4]  @ prediction stride
+@   ldr         r7,[sp,#8]  @ destination stride
+    ldr         r6,[sp,#40]                 @ src stride
+    ldr         r12,[sp,#52]
+    ldr         r11,[sp,#56]
+
+
+
+    ldr         r14,g_ai2_ihevc_trans_16_transpose_addr
+ulbl1:
+    add         r14,r14,pc
+    vld1.16     {d0,d1,d2,d3},[r14]         @//d0-d3 hold the transform constant data
+    movw        r7,#0xffff
+    and         r12,r12,r7
+    and         r11,r11,r7
+    mov         r6,r6,lsl #1                @ x sizeof(word16)
+    add         r9,r0,r6, lsl #1            @ 2 rows
+
+    add         r10,r6,r6, lsl #1           @ 3 rows
+    add         r5,r6,r6,lsl #2
+    movw        r7,#0xfff0
+
+    cmp         r12,r7
+    bge         zero_12cols_decision
+
+    cmp         r12,#0xff00
+    bge         zero_8cols_decision
+
+
+
+
+    mov         r14,#4
+    cmp         r11,r7
+    rsbge       r10,r6,#0
+
+    cmp         r11,#0xff00
+    movge       r8,r5
+    rsbge       r8,r8,#0
+    movlt       r8,r10
+    add         r5,r5,r6,lsl #3
+    rsb         r5,r5,#0
+
+    b           first_stage_top_four_bottom_four
+
+zero_12cols_decision:
+    mov         r14,#1
+    cmp         r11,#0xff00
+    movge       r8,r5
+    movlt       r8,r10
+    add         r5,r5,r6,lsl #3
+    rsb         r5,r5,#0
+
+    b           first_stage_top_four_bottom_four
+
+zero_8cols_decision:
+    mov         r14,#2
+    mov         r8,r5
+    rsb         r8,r8,#0
+    cmp         r11,#0xff00
+    movlt       r8,r10
+    add         r5,r5,r6,lsl #3
+    rsb         r5,r5,#0
+    cmp         r11,r7
+    rsbge       r10,r6,#0
+
+
+    b           first_stage_top_four_bottom_four
+
+
+@d0[0]= 64      d2[0]=64
+@d0[1]= 90      d2[1]=57
+@d0[2]= 89      d2[2]=50
+@d0[3]= 87      d2[3]=43
+@d1[0]= 83      d3[0]=36
+@d1[1]= 80      d3[1]=25
+@d1[2]= 75      d3[2]=18
+@d1[3]= 70      d3[3]=9
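+
+@ the lane map above lists the 16-point HEVC transform constants loaded
+@ from g_ai2_ihevc_trans_16_transpose; reading the vmlal/vmlsl chains
+@ below, the first odd-part accumulator (q12) works out to
+@
+@   b0 = 90*y1 + 87*y3 + 80*y5 + 70*y7 + 57*y9 + 43*y11 + 25*y13 + 9*y15
+@
+@ with b1..b3 built from the same eight constants in permuted and partly
+@ negated order (the vmlsl instructions).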
+
+
+
+first_stage:
+    add         r0,r0,#8
+    add         r9,r9,#8
+
+first_stage_top_four_bottom_four:
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d11,[r9],r6
+    vld1.16     d6,[r0],r10
+    vld1.16     d7,[r9],r10
+    cmp         r11,r7
+    bge         skip_load4rows
+
+    vld1.16     d4,[r0],r6
+    vld1.16     d5,[r9],r6
+    vld1.16     d8,[r0],r8
+    vld1.16     d9,[r9],r8
+
+@ registers used: q0,q1,q3,q5,q2,q4
+
+@ d10 = r0
+@ d6  = r1
+@ d11 = r2
+@ d7  = r3
+@ (rN in these register maps denotes transform row n, not the ARM core register)
+
+skip_load4rows:
+    vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d7,d2[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d7,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d7,d2[3]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+    vmull.s16   q6,d10,d0[0]
+    vmlal.s16   q6,d11,d0[2]
+    vmull.s16   q7,d10,d0[0]
+    vmlal.s16   q7,d11,d1[2]
+    vmull.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d2[2]
+    vmull.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d3[2]
+
+    bge         skip_last12rows_kernel1
+
+
+    vmlal.s16   q12,d8,d1[1]
+    vmlal.s16   q13,d8,d3[3]
+    vmlsl.s16   q14,d8,d1[3]
+    vmlsl.s16   q15,d8,d0[3]
+
+
+    vmlal.s16   q12,d9,d1[3]
+    vmlsl.s16   q13,d9,d2[3]
+    vmlsl.s16   q14,d9,d0[3]
+    vmlal.s16   q15,d9,d3[3]
+
+
+
+
+
+    vmlal.s16   q6,d4,d1[0]
+    vmlal.s16   q6,d5,d1[2]
+    vmlal.s16   q7,d4,d3[0]
+    vmlsl.s16   q7,d5,d3[2]
+    vmlsl.s16   q8,d4,d3[0]
+    vmlsl.s16   q8,d5,d0[2]
+    vmlsl.s16   q9,d4,d1[0]
+    vmlsl.s16   q9,d5,d2[2]
+
+@d0[0]= 64      d2[0]=64
+@d0[1]= 90      d2[1]=57
+@d0[2]= 89      d2[2]=50
+@d0[3]= 87      d2[3]=43
+@d1[0]= 83      d3[0]=36
+@d1[1]= 80      d3[1]=25
+@d1[2]= 75      d3[2]=18
+@d1[3]= 70      d3[3]=9
+    cmp         r11,#0xff00
+    bge         skip_last12rows_kernel1
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d11,[r9],r6
+    vld1.16     d6,[r0],r10
+    vld1.16     d7,[r9],r10
+    vld1.16     d4,[r0],r6
+    vld1.16     d5,[r9],r6
+    vld1.16     d8,[r0],r5
+    vld1.16     d9,[r9],r5
+
+
+
+
+    vmlal.s16   q12,d6,d2[1]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d6,d1[1]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d6,d3[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d6,d0[1]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d7,d2[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d7,d0[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d7,d2[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d7,d3[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+    vmlal.s16   q12,d8,d3[1]
+    vmlsl.s16   q13,d8,d1[3]
+    vmlal.s16   q14,d8,d0[1]
+    vmlsl.s16   q15,d8,d1[1]
+
+
+    vmlal.s16   q12,d9,d3[3]
+    vmlsl.s16   q13,d9,d3[1]
+    vmlal.s16   q14,d9,d2[3]
+    vmlsl.s16   q15,d9,d2[1]
+
+
+
+
+
+    vmlal.s16   q6,d10,d0[0]
+    vmlal.s16   q6,d11,d2[2]
+    vmlal.s16   q6,d4,d3[0]
+    vmlal.s16   q6,d5,d3[2]
+
+
+
+
+    vmlsl.s16   q7,d10,d0[0]
+    vmlsl.s16   q7,d11,d0[2]
+    vmlsl.s16   q7,d4,d1[0]
+    vmlsl.s16   q7,d5,d2[2]
+
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d3[2]
+    vmlal.s16   q8,d4,d1[0]
+    vmlal.s16   q8,d5,d1[2]
+
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d1[2]
+    vmlsl.s16   q9,d4,d3[0]
+    vmlsl.s16   q9,d5,d0[2]
+
+skip_last12rows_kernel1:
+    vadd.s32    q10,q6,q12
+    vsub.s32    q11,q6,q12
+
+    vadd.s32    q6,q7,q13
+    vsub.s32    q12,q7,q13
+
+    vadd.s32    q7,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+
+
+
+
+
+    vqrshrn.s32 d30,q10,#shift_stage1_idct  @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d19,q11,#shift_stage1_idct  @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d31,q7,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d18,q13,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d12,q6,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d15,q12,#shift_stage1_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d13,q8,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d14,q14,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
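+
+@ vqrshrn.s32 narrows with rounding and saturation, so each lane above is
+@   out = clip_s16((a +/- b + 64) >> 7)
+@ where 64 = 1 << (shift_stage1_idct - 1); the second stage narrows the
+@ same way with #shift_stage2_idct = 12, i.e. a rounding term of 2048.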
+
+    vst1.16     {d30,d31},[r1]!
+    vst1.16     {d18,d19},[r1]!
+    sub         r1,r1,#32
+
+    bge         skip_stage1_kernel_load
+
+first_stage_middle_eight:
+
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d11,[r9],r6
+    vld1.16     d6,[r0],r10
+    vld1.16     d7,[r9],r10
+    vld1.16     d4,[r0],r6
+    vld1.16     d5,[r9],r6
+    vld1.16     d8,[r0],r8
+    vld1.16     d9,[r9],r8
+
+
+skip_stage1_kernel_load:
+    vmull.s16   q12,d6,d2[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d6,d2[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d6,d3[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d6,d3[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d7,d1[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d7,d0[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d7,d1[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d7,d3[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+    vmull.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d3[2]
+    vmull.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d2[2]
+    vmull.s16   q8,d10,d0[0]
+    vmlsl.s16   q8,d11,d1[2]
+    vmull.s16   q9,d10,d0[0]
+    vmlsl.s16   q9,d11,d0[2]
+
+
+    cmp         r11,r7
+    bge         skip_last12rows_kernel2
+
+    vmlsl.s16   q12,d8,d3[1]
+    vmlal.s16   q13,d8,d2[1]
+    vmlal.s16   q14,d8,d0[1]
+    vmlal.s16   q15,d8,d2[3]
+
+
+    vmlal.s16   q12,d9,d0[1]
+    vmlal.s16   q13,d9,d3[1]
+    vmlsl.s16   q14,d9,d1[1]
+    vmlsl.s16   q15,d9,d2[1]
+
+
+
+    vmlsl.s16   q11,d4,d1[0]
+    vmlal.s16   q11,d5,d2[2]
+    vmlsl.s16   q10,d4,d3[0]
+    vmlal.s16   q10,d5,d0[2]
+    vmlal.s16   q8,d4,d3[0]
+    vmlal.s16   q8,d5,d3[2]
+    vmlal.s16   q9,d4,d1[0]
+    vmlsl.s16   q9,d5,d1[2]
+
+@d0[0]= 64      d2[0]=64
+@d0[1]= 90      d2[1]=57
+@d0[2]= 89      d2[2]=50
+@d0[3]= 87      d2[3]=43
+@d1[0]= 83      d3[0]=36
+@d1[1]= 80      d3[1]=25
+@d1[2]= 75      d3[2]=18
+@d1[3]= 70      d3[3]=9
+    cmp         r11,#0xff00
+    bge         skip_last12rows_kernel2
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d11,[r9],r6
+    vld1.16     d6,[r0],r10
+    vld1.16     d7,[r9],r10
+    vld1.16     d4,[r0],r6
+    vld1.16     d5,[r9],r6
+    vld1.16     d8,[r0],r5
+    vld1.16     d9,[r9],r5
+
+
+    vmlsl.s16   q12,d6,d3[3]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d6,d2[3]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d7,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+    vmlal.s16   q12,d8,d2[3]
+    vmlal.s16   q13,d8,d3[3]
+    vmlsl.s16   q14,d8,d2[1]
+    vmlal.s16   q15,d8,d0[3]
+
+
+    vmlal.s16   q12,d9,d1[3]
+    vmlsl.s16   q13,d9,d1[1]
+    vmlal.s16   q14,d9,d0[3]
+    vmlsl.s16   q15,d9,d0[1]
+
+
+
+
+    vmlal.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d1[2]
+    vmlsl.s16   q11,d4,d3[0]
+    vmlal.s16   q11,d5,d0[2]
+
+
+
+    vmlsl.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d3[2]
+    vmlal.s16   q10,d4,d1[0]
+    vmlsl.s16   q10,d5,d1[2]
+
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d0[2]
+    vmlsl.s16   q8,d4,d1[0]
+    vmlal.s16   q8,d5,d2[2]
+
+
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlsl.s16   q9,d11,d2[2]
+    vmlal.s16   q9,d4,d3[0]
+    vmlsl.s16   q9,d5,d3[2]
+
+skip_last12rows_kernel2:
+
+    vadd.s32    q2,q11,q12
+    vsub.s32    q11,q11,q12
+
+    vadd.s32    q3,q10,q13
+    vsub.s32    q12,q10,q13
+
+    vadd.s32    q5,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+    vqrshrn.s32 d18,q2,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d31,q11,#shift_stage1_idct  @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d19,q5,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d30,q13,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d20,q3,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d23,q12,#shift_stage1_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d21,q8,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d22,q14,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+    @ registers used:   {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+
+
+
+
+
+    vld1.16     {d4,d5},[r1]!
+    vld1.16     {d8,d9},[r1]!
+    sub         r1,r1,#32
+
+@d4=r0
+@d12=r1
+@d5=r2
+@d13=r3
+
+@d18=r4
+@d20=r5
+@d19=r6
+@d21=r7
+
+@d22=r8
+@d30=r9
+@d23=r10
+@d31=r11
+
+@d14=r12
+@d8=r13
+@d15=r14
+@d9=r15
+
+
+    vtrn.16     q2,q6
+    vtrn.16     q9,q10
+    vtrn.16     q11,q15
+    vtrn.16     q7,q4
+
+
+
+    vtrn.32     d4,d5
+    vtrn.32     d12,d13
+
+    vtrn.32     d18,d19
+    vtrn.32     d20,d21
+
+    vtrn.32     d22,d23
+    vtrn.32     d30,d31
+
+    vtrn.32     d14,d15
+    vtrn.32     d8,d9
+
+
+@ d4 =r0 1- 4 values
+@ d5 =r2 1- 4 values
+@ d12=r1 1- 4 values
+@ d13=r3 1- 4 values
+
+@ d18 =r0 5- 8 values
+@ d19 =r2 5- 8 values
+@ d20=r1 5- 8 values
+@ d21=r3 5- 8 values
+
+@ d22 =r0 9- 12 values
+@ d23 =r2 9- 12 values
+@ d30=r1 9- 12 values
+@ d31=r3 9- 12 values
+
+@ d14 =r0 13-16 values
+@ d15 =r2 13- 16 values
+@ d8=r1 13- 16 values
+@ d9=r3 13- 16 values
+
+
+    vst1.16     {q2},[r1]!
+    vst1.16     {q6},[r1]!
+
+    vst1.16     {q9},[r1]!
+    vst1.16     {q10},[r1]!
+    vst1.16     {q11},[r1]!
+    vst1.16     {q15},[r1]!
+    vst1.16     {q7},[r1]!
+    vst1.16     {q4},[r1]!
+
+
+    subs        r14,r14,#1
+    bne         first_stage
+
+
+
+
+
+
+
+
+
+
+    mov         r6,r7
+
+    ldr         r8,[sp,#44]                 @ prediction stride
+    ldr         r7,[sp,#48]                 @ destination stride
+
+    mov         r10,#16
+
+    cmp         r12,r6
+    subge       r1,r1,#128
+    bge         label1
+
+    cmp         r12,#0xff00
+    subge       r1,r1,#256
+    bge         label_2
+
+    sub         r1,r1,#512
+    rsb         r10,r10,#0
+
+label_2:
+    add         r9,r1,#128
+    add         r11,r9,#128
+    add         r0,r11,#128
+
+
+
+label1:
+@   mov   r6,r1
+
+
+    mov         r14,#4
+    add         r4,r2,r8, lsl #1            @ r4 = r2 + pred_strd * 2    => r4 points to 3rd row of pred data
+    add         r5,r8,r8, lsl #1            @
+@   add r0,r3,r7, lsl #1    @ r0 points to 3rd row of dest data
+@   add r10,r7,r7, lsl #1   @
+
+
+
+
+second_stage:
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d6,d7},[r1],r10
+    cmp         r12,r6
+    bge         second_stage_process
+    vld1.16     {d4,d5},[r9]!
+    vld1.16     {d8,d9},[r9],r10
+
+second_stage_process:
+
+
+    vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d7,d2[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d7,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d7,d2[3]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+    vmull.s16   q6,d10,d0[0]
+    vmlal.s16   q6,d11,d0[2]
+    vmull.s16   q7,d10,d0[0]
+    vmlal.s16   q7,d11,d1[2]
+    vmull.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d2[2]
+    vmull.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d3[2]
+
+    bge         skip_last8rows_stage2_kernel1
+
+    vmlal.s16   q12,d8,d1[1]
+    vmlal.s16   q13,d8,d3[3]
+    vmlsl.s16   q14,d8,d1[3]
+    vmlsl.s16   q15,d8,d0[3]
+
+
+    vmlal.s16   q12,d9,d1[3]
+    vmlsl.s16   q13,d9,d2[3]
+    vmlsl.s16   q14,d9,d0[3]
+    vmlal.s16   q15,d9,d3[3]
+
+
+    vmlal.s16   q6,d4,d1[0]
+    vmlal.s16   q6,d5,d1[2]
+    vmlal.s16   q7,d4,d3[0]
+    vmlsl.s16   q7,d5,d3[2]
+    vmlsl.s16   q8,d4,d3[0]
+    vmlsl.s16   q8,d5,d0[2]
+    vmlsl.s16   q9,d4,d1[0]
+    vmlsl.s16   q9,d5,d2[2]
+
+    cmp         r12,#0xff00
+    bge         skip_last8rows_stage2_kernel1
+
+
+    vld1.16     {d10,d11},[r11]!
+    vld1.16     {d6,d7},[r11],r10
+    vld1.16     {d4,d5},[r0]!
+    vld1.16     {d8,d9},[r0],r10
+
+
+
+
+
+    vmlal.s16   q12,d6,d2[1]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d6,d1[1]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d6,d3[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d6,d0[1]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d7,d2[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d7,d0[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d7,d2[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d7,d3[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+    vmlal.s16   q12,d8,d3[1]
+    vmlsl.s16   q13,d8,d1[3]
+    vmlal.s16   q14,d8,d0[1]
+    vmlsl.s16   q15,d8,d1[1]
+
+
+    vmlal.s16   q12,d9,d3[3]
+    vmlsl.s16   q13,d9,d3[1]
+    vmlal.s16   q14,d9,d2[3]
+    vmlsl.s16   q15,d9,d2[1]
+
+
+
+
+
+    vmlal.s16   q6,d10,d0[0]
+    vmlal.s16   q6,d11,d2[2]
+    vmlal.s16   q6,d4,d3[0]
+    vmlal.s16   q6,d5,d3[2]
+
+
+
+
+    vmlsl.s16   q7,d10,d0[0]
+    vmlsl.s16   q7,d11,d0[2]
+    vmlsl.s16   q7,d4,d1[0]
+    vmlsl.s16   q7,d5,d2[2]
+
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d3[2]
+    vmlal.s16   q8,d4,d1[0]
+    vmlal.s16   q8,d5,d1[2]
+
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d1[2]
+    vmlsl.s16   q9,d4,d3[0]
+    vmlsl.s16   q9,d5,d0[2]
+
+
+
+
+
+
+skip_last8rows_stage2_kernel1:
+
+
+
+    vadd.s32    q10,q6,q12
+    vsub.s32    q11,q6,q12
+
+    vadd.s32    q6,q7,q13
+    vsub.s32    q12,q7,q13
+
+    vadd.s32    q7,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+
+
+
+
+
+    vqrshrn.s32 d30,q10,#shift_stage2_idct  @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d19,q11,#shift_stage2_idct  @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d31,q7,#shift_stage2_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d18,q13,#shift_stage2_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d12,q6,#shift_stage2_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d15,q12,#shift_stage2_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d13,q8,#shift_stage2_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d14,q14,#shift_stage2_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+    bge         skip_stage2_kernel_load
+
+    @q2,q4,q6,q7 is used
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d6,d7},[r1]!
+    vld1.16     {d4,d5},[r9]!
+    vld1.16     {d8,d9},[r9]!
+skip_stage2_kernel_load:
+    sub         r1,r1,#32
+    vst1.16     {d30,d31},[r1]!
+    vst1.16     {d18,d19},[r1]!
+    sub         r1,r1,#32
+
+    vmull.s16   q12,d6,d2[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d6,d2[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d6,d3[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d6,d3[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d7,d1[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d7,d0[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d7,d1[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d7,d3[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+    vmull.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d3[2]
+    vmull.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d2[2]
+    vmull.s16   q8,d10,d0[0]
+    vmlsl.s16   q8,d11,d1[2]
+    vmull.s16   q9,d10,d0[0]
+    vmlsl.s16   q9,d11,d0[2]
+
+
+
+    cmp         r12,r6
+    bge         skip_last8rows_stage2_kernel2
+
+
+    vmlsl.s16   q12,d8,d3[1]
+    vmlal.s16   q13,d8,d2[1]
+    vmlal.s16   q14,d8,d0[1]
+    vmlal.s16   q15,d8,d2[3]
+
+
+    vmlal.s16   q12,d9,d0[1]
+    vmlal.s16   q13,d9,d3[1]
+    vmlsl.s16   q14,d9,d1[1]
+    vmlsl.s16   q15,d9,d2[1]
+
+
+
+    vmlsl.s16   q11,d4,d1[0]
+    vmlal.s16   q11,d5,d2[2]
+    vmlsl.s16   q10,d4,d3[0]
+    vmlal.s16   q10,d5,d0[2]
+    vmlal.s16   q8,d4,d3[0]
+    vmlal.s16   q8,d5,d3[2]
+    vmlal.s16   q9,d4,d1[0]
+    vmlsl.s16   q9,d5,d1[2]
+    cmp         r12,#0xff00
+    bge         skip_last8rows_stage2_kernel2
+
+    vld1.16     {d10,d11},[r11]!
+    vld1.16     {d6,d7},[r11]!
+    vld1.16     {d4,d5},[r0]!
+    vld1.16     {d8,d9},[r0]!
+
+    vmlsl.s16   q12,d6,d3[3]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d6,d2[3]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d7,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+    vmlal.s16   q12,d8,d2[3]
+    vmlal.s16   q13,d8,d3[3]
+    vmlsl.s16   q14,d8,d2[1]
+    vmlal.s16   q15,d8,d0[3]
+
+
+    vmlal.s16   q12,d9,d1[3]
+    vmlsl.s16   q13,d9,d1[1]
+    vmlal.s16   q14,d9,d0[3]
+    vmlsl.s16   q15,d9,d0[1]
+
+
+
+
+    vmlal.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d1[2]
+    vmlsl.s16   q11,d4,d3[0]
+    vmlal.s16   q11,d5,d0[2]
+
+
+
+    vmlsl.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d3[2]
+    vmlal.s16   q10,d4,d1[0]
+    vmlsl.s16   q10,d5,d1[2]
+
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d0[2]
+    vmlsl.s16   q8,d4,d1[0]
+    vmlal.s16   q8,d5,d2[2]
+
+
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlsl.s16   q9,d11,d2[2]
+    vmlal.s16   q9,d4,d3[0]
+    vmlsl.s16   q9,d5,d3[2]
+
+
+skip_last8rows_stage2_kernel2:
+
+
+
+    vadd.s32    q2,q11,q12
+    vsub.s32    q11,q11,q12
+
+    vadd.s32    q3,q10,q13
+    vsub.s32    q12,q10,q13
+
+    vadd.s32    q5,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+    vqrshrn.s32 d18,q2,#shift_stage2_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d31,q11,#shift_stage2_idct  @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d19,q5,#shift_stage2_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d30,q13,#shift_stage2_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d20,q3,#shift_stage2_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d23,q12,#shift_stage2_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d21,q8,#shift_stage2_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d22,q14,#shift_stage2_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+    vld1.16     {d4,d5},[r1]!
+    vld1.16     {d8,d9},[r1]!
+
+
+
+    @ registers used:   {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+@d4=r0
+@d12=r1
+@d5=r2
+@d13=r3
+
+@d18=r4
+@d20=r5
+@d19=r6
+@d21=r7
+
+@d22=r8
+@d30=r9
+@d23=r10
+@d31=r11
+
+@d14=r12
+@d8=r13
+@d15=r14
+@d9=r15
+
+
+    vtrn.16     q2,q6
+    vtrn.16     q9,q10
+    vtrn.16     q11,q15
+    vtrn.16     q7,q4
+
+
+
+    vtrn.32     d4,d5
+    vtrn.32     d12,d13
+
+    vtrn.32     d18,d19
+    vtrn.32     d20,d21
+
+    vtrn.32     d22,d23
+    vtrn.32     d30,d31
+
+    vtrn.32     d14,d15
+    vtrn.32     d8,d9
+
+@ d4 =r0 1- 4 values
+@ d5 =r2 1- 4 values
+@ d12=r1 1- 4 values
+@ d13=r3 1- 4 values
+
+@ d18 =r0 5- 8 values
+@ d19 =r2 5- 8 values
+@ d20=r1 5- 8 values
+@ d21=r3 5- 8 values
+
+@ d22 =r0 9- 12 values
+@ d23 =r2 9- 12 values
+@ d30=r1 9- 12 values
+@ d31=r3 9- 12 values
+
+@ d14 =r0 13-16 values
+@ d15 =r2 13- 16 values
+@ d8=r1 13- 16 values
+@ d9=r3 13- 16 values
+
+
+    vswp        d5,d18
+    vswp        d23,d14
+    vswp        d13,d20
+    vswp        d31,d8
+
+@ q2: r0 1-8 values
+@ q11: r0 9-16 values
+@ q9 : r2 1-8 values
+@ q7 : r2 9-16 values
+@ q6 : r1 1- 8 values
+@ q10: r3 1-8 values
+@ q15: r1 9-16 values
+@ q4:  r3 9-16 values
+
+
+@   registers free: q8,q14,q12,q13
+
+
+    vld1.8      {d16,d17},[r2],r8
+    vld1.8      {d28,d29},[r2],r5
+    vld1.8      {d24,d25},[r4],r8
+    vld1.8      {d26,d27},[r4],r5
+
+
+
+
+    vaddw.u8    q2,q2,d16
+    vaddw.u8    q11,q11,d17
+    vaddw.u8    q6,q6,d28
+    vaddw.u8    q15,q15,d29
+    vaddw.u8    q9,q9,d24
+    vaddw.u8    q7,q7,d25
+    vaddw.u8    q10,q10,d26
+    vaddw.u8    q4,q4,d27
+
+
+    vqmovun.s16 d16,q2
+    vqmovun.s16 d17,q11
+    vqmovun.s16 d28,q6
+    vqmovun.s16 d29,q15
+    vqmovun.s16 d24,q9
+    vqmovun.s16 d25,q7
+    vqmovun.s16 d26,q10
+    vqmovun.s16 d27,q4
+
+
+
+    vst1.8      {d16,d17},[r3],r7
+    vst1.8      {d28,d29},[r3],r7
+    vst1.8      {d24,d25},[r3],r7
+    vst1.8      {d26,d27},[r3],r7
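+
+@ reconstruction: vaddw.u8 widens the 8-bit prediction and adds it to the
+@ stage-2 residue, and vqmovun.s16 saturates the sum back to unsigned
+@ 8 bit, i.e. per pixel dst = clip_u8(pred + residue)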
+
+    subs        r14,r14,#1
+
+
+
+    bne         second_stage
+
+
+@   sub         sp,sp,#40
+    ldmfd       sp!,{r4-r12,pc}
+
+
+
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_itrans_recon_32x32.s b/common/arm/ihevc_itrans_recon_32x32.s
new file mode 100644
index 0000000..eeb1d66
--- /dev/null
+++ b/common/arm/ihevc_itrans_recon_32x32.s
@@ -0,0 +1,2863 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ *  ihevc_itrans_recon_32x32.s
+@ *
+@ * @brief
+@ *  contains function definitions for single stage  inverse transform
+@ *
+@ * @author
+@ * anand s
+@ *
+@ * @par list of functions:
+@ *  - ihevc_itrans_recon_32x32()
+@ *
+@ * @remarks
+@ *  the input coefficient buffer is overwritten by this function
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ *  this function performs inverse transform and reconstruction for a
+@ * 32x32 input block
+@ *
+@ * @par description:
+@ *  performs inverse transform and adds the prediction  data and clips output
+@ * to 8 bit
+@ *
+@ * @param[in] pi2_src
+@ *  input 32x32 coefficients
+@ *
+@ * @param[in] pi2_tmp
+@ *  temporary 32x32 buffer for storing the inverse
+@ *  transform 1st stage output
+@ *
+@ * @param[in] pu1_pred
+@ *  prediction 32x32 block
+@ *
+@ * @param[out] pu1_dst
+@ *  output 32x32 block
+@ *
+@ * @param[in] src_strd
+@ *  input stride
+@ *
+@ * @param[in] pred_strd
+@ *  prediction stride
+@ *
+@ * @param[in] dst_strd
+@ *  output stride
+@ *
+@ * @param[in] zero_cols (r12)
+@ *  zero columns in pi2_src
+@ *
+@ * @param[in] zero_rows (r11)
+@ *  zero rows in pi2_src
+@ *
+@ * @returns  void
+@ *
+@ * @remarks
+@ *  none
+@ *
+@ *******************************************************************************
+@ */
+
+@void ihevc_itrans_recon_32x32(word16 *pi2_src,
+@                            word16 *pi2_tmp,
+@                            uword8 *pu1_pred,
+@                            uword8 *pu1_dst,
+@                            word32 src_strd,
+@                            word32 pred_strd,
+@                            word32 dst_strd,
+@                            word32 zero_cols,      /* r12 */
+@                            word32 zero_rows       /* r11 */)
+
+@**************variables vs registers*************************
+@   r0 => *pi2_src
+@   r1 => *pi2_tmp
+@   r2 => *pu1_pred
+@   r3 => *pu1_dst
+@   src_strd   (on stack)
+@   pred_strd  (on stack)
+@   dst_strd   (on stack)
+@   zero_cols  (on stack, loaded into r12)
+@   zero_rows  (on stack, loaded into r11)
+
+
+@d0[0]= 64      d2[0]=83
+@d0[1]= 90      d2[1]=82
+@d0[2]= 90      d2[2]=80
+@d0[3]= 90      d2[3]=78
+@d1[0]= 89      d3[0]=75
+@d1[1]= 88      d3[1]=73
+@d1[2]= 87      d3[2]=70
+@d1[3]= 85      d3[3]=67
+
+@d4[0]= 64      d6[0]=36
+@d4[1]= 61      d6[1]=31
+@d4[2]= 57      d6[2]=25
+@d4[3]= 54      d6[3]=22
+@d5[0]= 50      d7[0]=18
+@d5[1]= 46      d7[1]=13
+@d5[2]= 43      d7[2]=9
+@d5[3]= 38      d7[3]=4
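+
+@ d0-d7 above pack the distinct 32-point HEVC transform constants for use
+@ as lane operands; reading the vmlal chain through stage1 below, the
+@ first odd-part accumulator (q12) works out to
+@
+@   b0 = 90*y1 + 90*y3 + 88*y5 + 85*y7 + 82*y9  + 78*y11 + 73*y13 + 67*y15
+@      + 61*y17 + 54*y19 + 46*y21 + 38*y23 + 31*y25 + 22*y27 + 13*y29 + 4*y31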
+
+.text
+.align 4
+
+
+
+
+
+.set shift_stage1_idct ,   7
+.set shift_stage2_idct ,   12
+
+@#define zero_cols   r12
+@#define zero_rows   r11
+
+.globl ihevc_itrans_recon_32x32_a9q
+
+.extern g_ai2_ihevc_trans_32_transpose
+
+g_ai2_ihevc_trans_32_transpose_addr:
+.long g_ai2_ihevc_trans_32_transpose - ulbl1 - 8
+
+r5_addr: .word 0xfffff000
+r9_addr: .word 0xffff0000
+
+.type ihevc_itrans_recon_32x32_a9q, %function
+
+ihevc_itrans_recon_32x32_a9q:
+
+    stmfd       sp!,{r0-r12,lr}
+
+
+@ldr            r8,[sp,#56]     @ prediction stride
+@ldr            r7,[sp,#64]     @ destination stride
+    ldr         r6,[sp,#56]                 @ src stride
+    ldr         r12,[sp,#68]
+    ldr         r11,[sp,#72]
+    mov         r6,r6,lsl #1                @ x sizeof(word16)
+    add         r10,r6,r6, lsl #1           @ 3 rows
+
+
+    mov         r8,r0
+
+    ldr         r14,g_ai2_ihevc_trans_32_transpose_addr
+ulbl1:
+    add         r14,r14,pc
+    vld1.16     {d0,d1,d2,d3},[r14]!
+    vld1.16     {d4,d5,d6,d7},[r14]!
+
+@registers which are free
+@  r10,r9,r11,r12
+    mov         r9,#0xffffff00
+    mov         r10,#0xfffffff0
+    ldr         r5,r5_addr
+    ldr         r7,r9_addr
+    cmp         r12,r10
+    movhs       r14,#1
+    bhs         stage1
+
+
+    cmp         r12,r9
+    movhs       r14,#2
+    bhs         stage1
+
+    cmp         r12,r5
+    movhs       r14,#3
+    bhs         stage1
+
+    cmp         r12,r7
+    movhs       r14,#4
+
+    mov         r14,#8
+    b           stage1
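+
+@ a hedged reading of the ladder above (assuming a set bit in zero_cols
+@ marks a zero column): the masks select 1, 2, 3 or 4 four-column passes
+@ of dct_stage1 when only the first 4, 8, 12 or 16 columns carry data,
+@ and 8 passes for a full 32-column block; note that "movhs r14,#4" has
+@ no branch after it, so the unconditional "mov r14,#8" overwrites it and
+@ the 16-column case currently takes the full 8-pass path.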
+@.ltorg
+
+
+dct_stage1:
+    add         r8,r8,#8
+    mov         r0,r8
+
+stage1:
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+    vmull.s16   q12,d8,d0[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d8,d0[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d8,d1[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d8,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d9,d2[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d5[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmull.s16   q10,d10,d0[0]
+    vmlal.s16   q10,d11,d0[2]
+
+
+    vmull.s16   q11,d10,d0[0]
+    vmlal.s16   q11,d11,d1[2]
+
+    vmull.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d2[2]
+
+    vmull.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d3[2]
+    cmp         r11,r10
+    bhs         shift1
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+
+
+
+
+    vmlal.s16   q12,d14,d1[1]
+    vmlal.s16   q13,d14,d3[3]
+    vmlal.s16   q14,d14,d6[1]
+    vmlsl.s16   q15,d14,d7[1]
+
+
+    vmlal.s16   q12,d15,d1[3]
+    vmlal.s16   q13,d15,d5[1]
+    vmlsl.s16   q14,d15,d7[1]
+    vmlsl.s16   q15,d15,d3[3]
+
+
+    vmlal.s16   q10,d12,d1[0]
+    vmlal.s16   q10,d13,d1[2]
+    vmlal.s16   q11,d12,d3[0]
+    vmlal.s16   q11,d13,d4[2]
+    vmlal.s16   q8,d12,d5[0]
+    vmlal.s16   q8,d13,d7[2]
+    vmlal.s16   q9,d12,d7[0]
+    vmlsl.s16   q9,d13,d5[2]
+
+    cmp         r11,r9
+    bhs         shift1
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+    vmlal.s16   q12,d8,d2[1]                @// y1 * cos1(part of b0)
+    vmlal.s16   q13,d8,d6[3]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d4[3]                @// y1 * sin3(part of b2)
+    vmlsl.s16   q15,d8,d0[1]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d2[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d7[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d2[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d3[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d2[0]
+    vmlal.s16   q10,d11,d2[2]
+
+
+    vmlal.s16   q11,d10,d6[0]
+    vmlal.s16   q11,d11,d7[2]
+
+    vmlsl.s16   q8,d10,d6[0]
+    vmlsl.s16   q8,d11,d3[2]
+
+    vmlsl.s16   q9,d10,d2[0]
+    vmlsl.s16   q9,d11,d1[2]
+
+    cmp         r11,r5
+    bhs         shift1
+
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+
+
+
+
+
+
+    vmlal.s16   q12,d14,d3[1]
+    vmlsl.s16   q13,d14,d6[1]
+    vmlsl.s16   q14,d14,d0[1]
+    vmlsl.s16   q15,d14,d6[3]
+
+
+    vmlal.s16   q12,d15,d3[3]
+    vmlsl.s16   q13,d15,d4[3]
+    vmlsl.s16   q14,d15,d2[3]
+    vmlal.s16   q15,d15,d5[3]
+
+
+    vmlal.s16   q10,d12,d3[0]
+    vmlal.s16   q10,d13,d3[2]
+    vmlsl.s16   q11,d12,d7[0]
+    vmlsl.s16   q11,d13,d5[2]
+    vmlsl.s16   q8,d12,d1[0]
+    vmlsl.s16   q8,d13,d1[2]
+    vmlsl.s16   q9,d12,d5[0]
+    vmlal.s16   q9,d13,d7[2]
+
+    cmp         r11,r7
+    bhs         shift1
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+
+    vmlal.s16   q12,d8,d4[1]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d3[1]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d5[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d2[1]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d4[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d7[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d0[0]
+    vmlal.s16   q10,d11,d4[2]
+
+
+    vmlsl.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d2[2]
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlsl.s16   q8,d11,d6[2]
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d0[2]
+
+
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+
+    vmlal.s16   q12,d14,d5[1]
+    vmlsl.s16   q13,d14,d0[2]
+    vmlal.s16   q14,d14,d5[3]
+    vmlal.s16   q15,d14,d4[3]
+
+
+    vmlal.s16   q12,d15,d5[3]
+    vmlsl.s16   q13,d15,d1[1]
+    vmlal.s16   q14,d15,d3[1]
+    vmlsl.s16   q15,d15,d7[3]
+
+
+    vmlal.s16   q10,d12,d5[0]
+    vmlal.s16   q10,d13,d5[2]
+    vmlsl.s16   q11,d12,d1[0]
+    vmlsl.s16   q11,d13,d0[2]
+    vmlal.s16   q8,d12,d7[0]
+    vmlal.s16   q8,d13,d4[2]
+    vmlal.s16   q9,d12,d3[0]
+    vmlal.s16   q9,d13,d6[2]
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+
+
+
+
+
+    vmlal.s16   q12,d8,d6[1]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d2[3]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d0[1]                @// y1 * sin3(part of b2)
+    vmlsl.s16   q15,d8,d4[1]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d6[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d4[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d1[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d0[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d6[0]
+    vmlal.s16   q10,d11,d6[2]
+
+
+    vmlsl.s16   q11,d10,d2[0]
+    vmlsl.s16   q11,d11,d3[2]
+
+    vmlal.s16   q8,d10,d2[0]
+    vmlal.s16   q8,d11,d0[2]
+
+    vmlsl.s16   q9,d10,d6[0]
+    vmlsl.s16   q9,d11,d2[2]
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+    vmlal.s16   q12,d14,d7[1]
+    vmlsl.s16   q13,d14,d5[3]
+    vmlal.s16   q14,d14,d4[1]
+    vmlsl.s16   q15,d14,d2[3]
+
+
+    vmlal.s16   q12,d15,d7[3]
+    vmlsl.s16   q13,d15,d7[1]
+    vmlal.s16   q14,d15,d6[3]
+    vmlsl.s16   q15,d15,d6[1]
+
+
+    vmlal.s16   q10,d12,d7[0]
+    vmlal.s16   q10,d13,d7[2]
+    vmlsl.s16   q11,d12,d5[0]
+    vmlsl.s16   q11,d13,d6[2]
+    vmlal.s16   q8,d12,d3[0]
+    vmlal.s16   q8,d13,d5[2]
+    vmlsl.s16   q9,d12,d1[0]
+    vmlsl.s16   q9,d13,d4[2]
+
+
+
+shift1:
+    vadd.s32    q4,q10,q12
+    vsub.s32    q5,q10,q12
+
+    vadd.s32    q6,q11,q13
+    vsub.s32    q12,q11,q13
+
+    vadd.s32    q7,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+    vqrshrn.s32 d30,q4,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d19,q5,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d31,q7,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d18,q13,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d12,q6,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d15,q12,#shift_stage1_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d13,q8,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d14,q14,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+    @ registers used q15,q14,q6,q7
+
+
+    vtrn.16     q15,q6
+    vtrn.16     q7,q9
+
+    vtrn.32     d30,d31
+    vtrn.32     d12,d13
+    vtrn.32     d14,d15
+    vtrn.32     d18,d19
+
+
+@ d30 =r0 1- 4 values
+@ d31 =r2 1- 4 values
+@ d12=r1 1- 4 values
+@ d13=r3 1- 4 values
+@ d14 =r0 28-31 values
+@ d15 =r2 28- 31 values
+@ d18=r1 28- 31 values
+@ d19=r3 28- 31 values
+
+
+
+    vst1.16     {q15},[r1]!
+    vst1.16     {q6},[r1]!
+    add         r1,r1,#192
+    vst1.16     {q7},[r1]!
+    vst1.16     {q9},[r1]!
+    sub         r1,r1,#224
+
+    mov         r0,r8
+
+
+
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+
+
+    vmull.s16   q12,d8,d2[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d8,d2[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d8,d3[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d8,d3[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d6[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d7[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d6[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d4[3]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmull.s16   q10,d10,d0[0]
+    vmlal.s16   q10,d11,d4[2]
+
+
+    vmull.s16   q11,d10,d0[0]
+    vmlal.s16   q11,d11,d5[2]
+
+    vmull.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d6[2]
+
+    vmull.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d7[2]
+    cmp         r11,r10
+    bhs         shift2
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+    vmlsl.s16   q12,d14,d4[3]
+    vmlsl.s16   q13,d14,d2[1]
+    vmlsl.s16   q14,d14,d0[1]
+    vmlsl.s16   q15,d14,d2[3]
+
+
+    vmlsl.s16   q12,d15,d0[3]
+    vmlsl.s16   q13,d15,d3[1]
+    vmlsl.s16   q14,d15,d6[3]
+    vmlal.s16   q15,d15,d5[3]
+
+
+    vmlsl.s16   q10,d12,d7[0]
+    vmlsl.s16   q10,d13,d2[2]
+    vmlsl.s16   q11,d12,d5[0]
+    vmlsl.s16   q11,d13,d0[2]
+    vmlsl.s16   q8,d12,d3[0]
+    vmlsl.s16   q8,d13,d3[2]
+    vmlsl.s16   q9,d12,d1[0]
+    vmlsl.s16   q9,d13,d6[2]
+
+    cmp         r11,r9
+    bhs         shift2
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+
+
+
+
+
+    vmlsl.s16   q12,d8,d4[1]                @// y1 * cos1(part of b0)
+    vmlal.s16   q13,d8,d7[1]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d2[3]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d7[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d9,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d6[3]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlsl.s16   q10,d10,d2[0]
+    vmlsl.s16   q10,d11,d6[2]
+
+
+    vmlsl.s16   q11,d10,d6[0]
+    vmlal.s16   q11,d11,d4[2]
+
+    vmlal.s16   q8,d10,d6[0]
+    vmlal.s16   q8,d11,d0[2]
+
+    vmlal.s16   q9,d10,d2[0]
+    vmlal.s16   q9,d11,d5[2]
+
+    cmp         r11,r5
+    bhs         shift2
+
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+
+
+    vmlal.s16   q12,d14,d2[3]
+    vmlal.s16   q13,d14,d3[3]
+    vmlsl.s16   q14,d14,d5[3]
+    vmlsl.s16   q15,d14,d0[3]
+
+
+    vmlal.s16   q12,d15,d1[3]
+    vmlsl.s16   q13,d15,d6[3]
+    vmlsl.s16   q14,d15,d0[3]
+    vmlal.s16   q15,d15,d7[3]
+
+
+    vmlal.s16   q10,d12,d5[0]
+    vmlal.s16   q10,d13,d0[2]
+    vmlal.s16   q11,d12,d1[0]
+    vmlal.s16   q11,d13,d6[2]
+    vmlal.s16   q8,d12,d7[0]
+    vmlsl.s16   q8,d13,d2[2]
+    vmlsl.s16   q9,d12,d3[0]
+    vmlsl.s16   q9,d13,d4[2]
+
+
+    cmp         r11,r7
+    bhs         shift2
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+
+
+
+
+
+    vmlal.s16   q12,d8,d6[1]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d1[1]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d7[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d0[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d5[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d4[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d2[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d7[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d7[2]
+
+
+    vmlsl.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d1[2]
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d5[2]
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d3[2]
+
+
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+    vmlsl.s16   q12,d14,d0[1]
+    vmlal.s16   q13,d14,d6[1]
+    vmlal.s16   q14,d14,d4[1]
+    vmlsl.s16   q15,d14,d1[1]
+
+
+    vmlsl.s16   q12,d15,d3[3]
+    vmlal.s16   q13,d15,d0[1]
+    vmlsl.s16   q14,d15,d5[1]
+    vmlsl.s16   q15,d15,d6[1]
+
+
+    vmlsl.s16   q10,d12,d3[0]
+    vmlsl.s16   q10,d13,d1[2]
+    vmlsl.s16   q11,d12,d7[0]
+    vmlal.s16   q11,d13,d3[2]
+    vmlal.s16   q8,d12,d1[0]
+    vmlal.s16   q8,d13,d7[2]
+    vmlsl.s16   q9,d12,d5[0]
+    vmlsl.s16   q9,d13,d2[2]
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+
+
+    vmlal.s16   q12,d8,d7[3]                @// y1 * cos1(part of b0)
+    vmlal.s16   q13,d8,d4[3]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d1[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d2[1]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d3[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d5[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d7[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d5[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlsl.s16   q10,d10,d6[0]
+    vmlal.s16   q10,d11,d5[2]
+
+
+    vmlal.s16   q11,d10,d2[0]
+    vmlal.s16   q11,d11,d7[2]
+
+    vmlsl.s16   q8,d10,d2[0]
+    vmlsl.s16   q8,d11,d4[2]
+
+    vmlal.s16   q9,d10,d6[0]
+    vmlal.s16   q9,d11,d1[2]
+
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+
+
+    vmlal.s16   q12,d14,d1[1]
+    vmlsl.s16   q13,d14,d0[3]
+    vmlal.s16   q14,d14,d1[3]
+    vmlsl.s16   q15,d14,d3[1]
+
+
+    vmlal.s16   q12,d15,d5[3]
+    vmlsl.s16   q13,d15,d5[1]
+    vmlal.s16   q14,d15,d4[3]
+    vmlsl.s16   q15,d15,d4[1]
+
+
+    vmlal.s16   q10,d12,d1[0]
+    vmlal.s16   q10,d13,d3[2]
+    vmlsl.s16   q11,d12,d3[0]
+    vmlsl.s16   q11,d13,d2[2]
+    vmlal.s16   q8,d12,d5[0]
+    vmlal.s16   q8,d13,d1[2]
+    vmlsl.s16   q9,d12,d7[0]
+    vmlsl.s16   q9,d13,d0[2]
+
+shift2:
+    vadd.s32    q4,q10,q12
+    vsub.s32    q5,q10,q12
+
+    vadd.s32    q6,q11,q13
+    vsub.s32    q12,q11,q13
+
+    vadd.s32    q7,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+    vqrshrn.s32 d30,q4,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d19,q5,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d31,q7,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d18,q13,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d12,q6,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d15,q12,#shift_stage1_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d13,q8,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d14,q14,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+    vtrn.16     q15,q6
+    vtrn.16     q7,q9
+
+    vtrn.32     d30,d31
+    vtrn.32     d12,d13
+    vtrn.32     d14,d15
+    vtrn.32     d18,d19
+
+
+    vst1.16     {q15},[r1]!
+    vst1.16     {q6},[r1]!
+    add         r1,r1,#128
+    vst1.16     {q7},[r1]!
+    vst1.16     {q9},[r1]!
+    sub         r1,r1,#160
+    mov         r0,r8
+
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+    vmull.s16   q12,d8,d4[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d8,d4[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d8,d5[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d8,d5[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d3[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d0[2]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmull.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d7[2]
+
+
+    vmull.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d6[2]
+
+    vmull.s16   q8,d10,d0[0]
+    vmlsl.s16   q8,d11,d5[2]
+
+    vmull.s16   q9,d10,d0[0]
+    vmlsl.s16   q9,d11,d4[2]
+
+    cmp         r11,r10
+    bhs         shift3
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+
+    vmlsl.s16   q12,d14,d5[1]
+    vmlsl.s16   q13,d14,d7[3]
+    vmlal.s16   q14,d14,d5[3]
+    vmlal.s16   q15,d14,d3[1]
+
+
+    vmlal.s16   q12,d15,d2[1]
+    vmlal.s16   q13,d15,d1[1]
+    vmlal.s16   q14,d15,d4[3]
+    vmlsl.s16   q15,d15,d7[3]
+
+
+    vmlsl.s16   q10,d12,d1[0]
+    vmlal.s16   q10,d13,d6[2]
+    vmlsl.s16   q11,d12,d3[0]
+    vmlal.s16   q11,d13,d3[2]
+    vmlsl.s16   q8,d12,d5[0]
+    vmlal.s16   q8,d13,d0[2]
+    vmlsl.s16   q9,d12,d7[0]
+    vmlal.s16   q9,d13,d2[2]
+
+    cmp         r11,r9
+    bhs         shift3
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+    vmlal.s16   q12,d8,d6[1]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d5[1]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d0[3]                @// y1 * sin3(part of b2)
+    vmlsl.s16   q15,d8,d3[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d1[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d4[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d6[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d0[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d2[0]
+    vmlsl.s16   q10,d11,d5[2]
+
+
+    vmlal.s16   q11,d10,d6[0]
+    vmlsl.s16   q11,d11,d0[2]
+
+    vmlsl.s16   q8,d10,d6[0]
+    vmlsl.s16   q8,d11,d4[2]
+
+    vmlsl.s16   q9,d10,d2[0]
+    vmlal.s16   q9,d11,d6[2]
+
+    cmp         r11,r5
+    bhs         shift3
+
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+
+
+
+    vmlsl.s16   q12,d14,d7[1]
+    vmlal.s16   q13,d14,d2[1]
+    vmlal.s16   q14,d14,d4[1]
+    vmlsl.s16   q15,d14,d5[1]
+
+
+    vmlal.s16   q12,d15,d0[3]
+    vmlal.s16   q13,d15,d7[1]
+    vmlsl.s16   q14,d15,d1[1]
+    vmlsl.s16   q15,d15,d6[1]
+
+
+    vmlsl.s16   q10,d12,d3[0]
+    vmlal.s16   q10,d13,d4[2]
+    vmlal.s16   q11,d12,d7[0]
+    vmlal.s16   q11,d13,d2[2]
+    vmlal.s16   q8,d12,d1[0]
+    vmlsl.s16   q8,d13,d6[2]
+    vmlal.s16   q9,d12,d5[0]
+    vmlsl.s16   q9,d13,d0[2]
+
+
+    cmp         r11,r7
+    bhs         shift3
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+    vmlsl.s16   q12,d8,d7[3]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d0[1]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d6[3]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d0[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d9,d5[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d2[3]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d3[2]
+
+
+    vmlsl.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d5[2]
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d1[2]
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d7[2]
+
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+    vmlal.s16   q12,d14,d6[3]
+    vmlal.s16   q13,d14,d3[3]
+    vmlsl.s16   q14,d14,d1[3]
+    vmlal.s16   q15,d14,d7[1]
+
+
+    vmlal.s16   q12,d15,d1[3]
+    vmlsl.s16   q13,d15,d2[3]
+    vmlal.s16   q14,d15,d7[1]
+    vmlal.s16   q15,d15,d4[1]
+
+
+    vmlsl.s16   q10,d12,d5[0]
+    vmlal.s16   q10,d13,d2[2]
+    vmlal.s16   q11,d12,d1[0]
+    vmlsl.s16   q11,d13,d7[2]
+    vmlsl.s16   q8,d12,d7[0]
+    vmlsl.s16   q8,d13,d3[2]
+    vmlsl.s16   q9,d12,d3[0]
+    vmlal.s16   q9,d13,d1[2]
+
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+
+
+    vmlsl.s16   q12,d8,d5[3]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d6[3]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d3[1]                @// y1 * sin3(part of b2)
+    vmlsl.s16   q15,d8,d0[1]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d2[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d9,d0[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d2[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d4[3]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d6[0]
+    vmlsl.s16   q10,d11,d1[2]
+
+
+    vmlsl.s16   q11,d10,d2[0]
+    vmlal.s16   q11,d11,d4[2]
+
+    vmlal.s16   q8,d10,d2[0]
+    vmlsl.s16   q8,d11,d7[2]
+
+    vmlsl.s16   q9,d10,d6[0]
+    vmlsl.s16   q9,d11,d5[2]
+
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+    vmlal.s16   q12,d14,d4[3]
+    vmlsl.s16   q13,d14,d6[1]
+    vmlal.s16   q14,d14,d7[3]
+    vmlal.s16   q15,d14,d6[3]
+
+
+    vmlal.s16   q12,d15,d3[3]
+    vmlsl.s16   q13,d15,d3[1]
+    vmlal.s16   q14,d15,d2[3]
+    vmlsl.s16   q15,d15,d2[1]
+
+
+    vmlsl.s16   q10,d12,d7[0]
+    vmlal.s16   q10,d13,d0[2]
+    vmlal.s16   q11,d12,d5[0]
+    vmlsl.s16   q11,d13,d1[2]
+    vmlsl.s16   q8,d12,d3[0]
+    vmlal.s16   q8,d13,d2[2]
+    vmlal.s16   q9,d12,d1[0]
+    vmlsl.s16   q9,d13,d3[2]
+
+shift3:
+    vadd.s32    q4,q10,q12
+    vsub.s32    q5,q10,q12
+
+    vadd.s32    q6,q11,q13
+    vsub.s32    q12,q11,q13
+
+    vadd.s32    q7,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+    vqrshrn.s32 d30,q4,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d19,q5,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d31,q7,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d18,q13,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d12,q6,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d15,q12,#shift_stage1_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d13,q8,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d14,q14,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+    vtrn.16     q15,q6
+    vtrn.16     q7,q9
+
+    vtrn.32     d30,d31
+    vtrn.32     d12,d13
+    vtrn.32     d14,d15
+    vtrn.32     d18,d19
+
+
+    vst1.16     {q15},[r1]!
+    vst1.16     {q6},[r1]!
+    add         r1,r1,#64
+    vst1.16     {q7},[r1]!
+    vst1.16     {q9},[r1]!
+    sub         r1,r1,#96
+
+    mov         r0,r8
+
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+    vmull.s16   q12,d8,d6[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d8,d6[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d8,d7[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d8,d7[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d2[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d4[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d5[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d7[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmull.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d3[2]
+
+
+    vmull.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d2[2]
+
+    vmull.s16   q8,d10,d0[0]
+    vmlsl.s16   q8,d11,d1[2]
+
+    vmull.s16   q9,d10,d0[0]
+    vmlsl.s16   q9,d11,d0[2]
+
+    cmp         r11,r10
+    bhs         shift4
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+
+
+
+    vmlal.s16   q12,d14,d0[1]
+    vmlal.s16   q13,d14,d1[3]
+    vmlal.s16   q14,d14,d4[1]
+    vmlal.s16   q15,d14,d6[3]
+
+
+    vmlsl.s16   q12,d15,d4[1]
+    vmlsl.s16   q13,d15,d0[3]
+    vmlsl.s16   q14,d15,d2[3]
+    vmlsl.s16   q15,d15,d6[1]
+
+
+    vmlal.s16   q10,d12,d7[0]
+    vmlal.s16   q10,d13,d5[2]
+    vmlal.s16   q11,d12,d5[0]
+    vmlsl.s16   q11,d13,d7[2]
+    vmlal.s16   q8,d12,d3[0]
+    vmlsl.s16   q8,d13,d4[2]
+    vmlal.s16   q9,d12,d1[0]
+    vmlsl.s16   q9,d13,d1[2]
+
+    cmp         r11,r9
+    bhs         shift4
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+
+    vmlal.s16   q12,d8,d7[3]                @// y1 * cos1(part of b0)
+    vmlal.s16   q13,d8,d3[1]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d1[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d5[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d4[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d5[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d5[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlsl.s16   q10,d10,d2[0]
+    vmlal.s16   q10,d11,d1[2]
+
+
+    vmlsl.s16   q11,d10,d6[0]
+    vmlal.s16   q11,d11,d3[2]
+
+    vmlal.s16   q8,d10,d6[0]
+    vmlsl.s16   q8,d11,d7[2]
+
+    vmlal.s16   q9,d10,d2[0]
+    vmlsl.s16   q9,d11,d2[2]
+
+    cmp         r11,r5
+    bhs         shift4
+
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+
+
+
+    vmlsl.s16   q12,d14,d1[1]
+    vmlsl.s16   q13,d14,d7[3]
+    vmlal.s16   q14,d14,d1[3]
+    vmlal.s16   q15,d14,d4[3]
+
+
+    vmlal.s16   q12,d15,d2[1]
+    vmlal.s16   q13,d15,d5[1]
+    vmlsl.s16   q14,d15,d3[1]
+    vmlsl.s16   q15,d15,d4[1]
+
+
+    vmlsl.s16   q10,d12,d5[0]
+    vmlsl.s16   q10,d13,d7[2]
+    vmlsl.s16   q11,d12,d1[0]
+    vmlal.s16   q11,d13,d1[2]
+    vmlsl.s16   q8,d12,d7[0]
+    vmlal.s16   q8,d13,d5[2]
+    vmlal.s16   q9,d12,d3[0]
+    vmlsl.s16   q9,d13,d3[2]
+
+    cmp         r11,r7
+    bhs         shift4
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+    vmlsl.s16   q12,d8,d5[3]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d2[3]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d4[3]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d3[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d6[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d9,d0[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d6[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d3[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d0[2]
+
+
+    vmlsl.s16   q11,d10,d0[0]
+    vmlal.s16   q11,d11,d6[2]
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d2[2]
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlsl.s16   q9,d11,d4[2]
+
+
+
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+
+
+
+    vmlal.s16   q12,d14,d3[1]
+    vmlsl.s16   q13,d14,d2[1]
+    vmlal.s16   q14,d14,d7[3]
+    vmlal.s16   q15,d14,d2[3]
+
+
+    vmlsl.s16   q12,d15,d0[3]
+    vmlal.s16   q13,d15,d4[3]
+    vmlal.s16   q14,d15,d6[3]
+    vmlsl.s16   q15,d15,d2[1]
+
+
+    vmlal.s16   q10,d12,d3[0]
+    vmlsl.s16   q10,d13,d6[2]
+    vmlal.s16   q11,d12,d7[0]
+    vmlsl.s16   q11,d13,d4[2]
+    vmlsl.s16   q8,d12,d1[0]
+    vmlal.s16   q8,d13,d0[2]
+    vmlal.s16   q9,d12,d5[0]
+    vmlsl.s16   q9,d13,d5[2]
+
+
+    vld1.16     d10,[r0],r6
+    vld1.16     d8,[r0],r6
+    vld1.16     d11,[r0],r6
+    vld1.16     d9,[r0],r6
+
+
+
+
+
+    vmlal.s16   q12,d8,d3[3]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d7[1]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d5[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d7[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d6[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlsl.s16   q10,d10,d6[0]
+    vmlal.s16   q10,d11,d2[2]
+
+
+    vmlal.s16   q11,d10,d2[0]
+    vmlsl.s16   q11,d11,d0[2]
+
+    vmlsl.s16   q8,d10,d2[0]
+    vmlal.s16   q8,d11,d3[2]
+
+    vmlal.s16   q9,d10,d6[0]
+    vmlsl.s16   q9,d11,d6[2]
+
+
+    vld1.16     d12,[r0],r6
+    vld1.16     d14,[r0],r6
+    vld1.16     d13,[r0],r6
+    vld1.16     d15,[r0],r6
+
+
+
+
+    vmlsl.s16   q12,d14,d5[1]
+    vmlal.s16   q13,d14,d3[3]
+    vmlsl.s16   q14,d14,d2[1]
+    vmlal.s16   q15,d14,d0[3]
+
+
+    vmlal.s16   q12,d15,d1[3]
+    vmlsl.s16   q13,d15,d1[1]
+    vmlal.s16   q14,d15,d0[3]
+    vmlsl.s16   q15,d15,d0[1]
+
+
+    vmlsl.s16   q10,d12,d1[0]
+    vmlal.s16   q10,d13,d4[2]
+    vmlal.s16   q11,d12,d3[0]
+    vmlsl.s16   q11,d13,d5[2]
+    vmlsl.s16   q8,d12,d5[0]
+    vmlal.s16   q8,d13,d6[2]
+    vmlal.s16   q9,d12,d7[0]
+    vmlsl.s16   q9,d13,d7[2]
+
+shift4:
+    vadd.s32    q4,q10,q12
+    vsub.s32    q5,q10,q12
+
+    vadd.s32    q6,q11,q13
+    vsub.s32    q12,q11,q13
+
+    vadd.s32    q7,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+    vqrshrn.s32 d30,q4,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d19,q5,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d31,q7,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d18,q13,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d12,q6,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d15,q12,#shift_stage1_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d13,q8,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d14,q14,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+    vtrn.16     q15,q6
+    vtrn.16     q7,q9
+
+    vtrn.32     d30,d31
+    vtrn.32     d12,d13
+    vtrn.32     d14,d15
+    vtrn.32     d18,d19
+
+
+    vst1.16     {q15},[r1]!
+    vst1.16     {q6},[r1]!
+    vst1.16     {q7},[r1]!
+    vst1.16     {q9},[r1]!
+
+    add         r1,r1,#96
+
+    subs        r14,r14,#1
+    bne         dct_stage1
+second_stage_dct:
+@   mov     r0,r1
+    ldr         r0,[sp]
+    ldr         r1,[sp,#4]
+    ldr         r8,[sp,#60]                 @ prediction stride
+    ldr         r7,[sp,#64]                 @ destination stride
+
+@   add r4,r2,r8, lsl #1    @ r4 = r2 + pred_strd * 2    => r4 points to 3rd row of pred data
+@   add r5,r8,r8, lsl #1    @
+@   sub   r0,r0,#512
+    mov         r11,#0xfffffff0
+    mov         r5, #0xffffff00
+    ldr         r6,r5_addr
+    ldr         r9,r9_addr
+@   sub     r1,r1,#2048
+    mov         r4,r1
+    mov         r10,#240
+    mov         r14,#8
+    b           stage2
+
+@ arm registers used
+@ r8  : prediction stride
+@ r7  : destination stride
+@ r1  : temp buffer
+@ r2  : pred buffer
+@ r3  : destination buffer
+@ r14 : loop counter
+@ r0  : scratch buffer
+@ r10 : used as stride
+@ r4  : used to store the initial address
+@ r12 : zero cols
+@ r11 : 0xfffffff0
+@ r5  : 0xffffff00
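+@
+@ Zero-column early exit (a reading of the code below, not the original
+@ authors' note): r12 carries the zero_cols bitmask. After each group of four
+@ columns is accumulated, r12 is compared against a widening sequence of
+@ masks (0xfffffff0, 0xffffff00, then two constants loaded from the literal
+@ pool); bhs means every remaining column is flagged zero, so their loads and
+@ multiplies are skipped and control drops straight to the shift/transpose
+@ code at the stage2_shift labels.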
+dct_stage2:
+    add         r4,r4,#32
+    mov         r1,r4
+stage2:
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+    vmull.s16   q12,d8,d0[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d8,d0[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d8,d1[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d8,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d9,d2[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d5[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+    vmull.s16   q10,d10,d0[0]
+    vmlal.s16   q10,d11,d0[2]
+
+
+    vmull.s16   q11,d10,d0[0]
+    vmlal.s16   q11,d11,d1[2]
+
+    vmull.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d2[2]
+
+    vmull.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d3[2]
+    cmp         r12,r11
+    bhs         stage2_shift1
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+
+
+
+    vmlal.s16   q12,d14,d1[1]
+    vmlal.s16   q13,d14,d3[3]
+    vmlal.s16   q14,d14,d6[1]
+    vmlsl.s16   q15,d14,d7[1]
+
+
+    vmlal.s16   q12,d15,d1[3]
+    vmlal.s16   q13,d15,d5[1]
+    vmlsl.s16   q14,d15,d7[1]
+    vmlsl.s16   q15,d15,d3[3]
+
+
+    vmlal.s16   q10,d12,d1[0]
+    vmlal.s16   q10,d13,d1[2]
+    vmlal.s16   q11,d12,d3[0]
+    vmlal.s16   q11,d13,d4[2]
+    vmlal.s16   q8,d12,d5[0]
+    vmlal.s16   q8,d13,d7[2]
+    vmlal.s16   q9,d12,d7[0]
+    vmlsl.s16   q9,d13,d5[2]
+    cmp         r12,r5
+    bhs         stage2_shift1
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+    vmlal.s16   q12,d8,d2[1]                @// y1 * cos1(part of b0)
+    vmlal.s16   q13,d8,d6[3]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d4[3]                @// y1 * sin3(part of b2)
+    vmlsl.s16   q15,d8,d0[1]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d2[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d7[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d2[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d3[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d2[0]
+    vmlal.s16   q10,d11,d2[2]
+
+
+    vmlal.s16   q11,d10,d6[0]
+    vmlal.s16   q11,d11,d7[2]
+
+    vmlsl.s16   q8,d10,d6[0]
+    vmlsl.s16   q8,d11,d3[2]
+
+    vmlsl.s16   q9,d10,d2[0]
+    vmlsl.s16   q9,d11,d1[2]
+
+    cmp         r12,r6
+    bhs         stage2_shift1
+
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+
+
+    vmlal.s16   q12,d14,d3[1]
+    vmlsl.s16   q13,d14,d6[1]
+    vmlsl.s16   q14,d14,d0[1]
+    vmlsl.s16   q15,d14,d6[3]
+
+
+    vmlal.s16   q12,d15,d3[3]
+    vmlsl.s16   q13,d15,d4[3]
+    vmlsl.s16   q14,d15,d2[3]
+    vmlal.s16   q15,d15,d5[3]
+
+
+    vmlal.s16   q10,d12,d3[0]
+    vmlal.s16   q10,d13,d3[2]
+    vmlsl.s16   q11,d12,d7[0]
+    vmlsl.s16   q11,d13,d5[2]
+    vmlsl.s16   q8,d12,d1[0]
+    vmlsl.s16   q8,d13,d1[2]
+    vmlsl.s16   q9,d12,d5[0]
+    vmlal.s16   q9,d13,d7[2]
+
+    cmp         r12,r9
+    bhs         stage2_shift1
+
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+    vmlal.s16   q12,d8,d4[1]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d3[1]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d5[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d2[1]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d4[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d7[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d0[0]
+    vmlal.s16   q10,d11,d4[2]
+
+
+    vmlsl.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d2[2]
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlsl.s16   q8,d11,d6[2]
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d0[2]
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+
+
+    vmlal.s16   q12,d14,d5[1]
+    vmlsl.s16   q13,d14,d0[2]
+    vmlal.s16   q14,d14,d5[3]
+    vmlal.s16   q15,d14,d4[3]
+
+
+    vmlal.s16   q12,d15,d5[3]
+    vmlsl.s16   q13,d15,d1[1]
+    vmlal.s16   q14,d15,d3[1]
+    vmlsl.s16   q15,d15,d7[3]
+
+
+    vmlal.s16   q10,d12,d5[0]
+    vmlal.s16   q10,d13,d5[2]
+    vmlsl.s16   q11,d12,d1[0]
+    vmlsl.s16   q11,d13,d0[2]
+    vmlal.s16   q8,d12,d7[0]
+    vmlal.s16   q8,d13,d4[2]
+    vmlal.s16   q9,d12,d3[0]
+    vmlal.s16   q9,d13,d6[2]
+
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+
+
+    vmlal.s16   q12,d8,d6[1]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d2[3]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d0[1]                @// y1 * sin3(part of b2)
+    vmlsl.s16   q15,d8,d4[1]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d6[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d4[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d1[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d0[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d6[0]
+    vmlal.s16   q10,d11,d6[2]
+
+
+    vmlsl.s16   q11,d10,d2[0]
+    vmlsl.s16   q11,d11,d3[2]
+
+    vmlal.s16   q8,d10,d2[0]
+    vmlal.s16   q8,d11,d0[2]
+
+    vmlsl.s16   q9,d10,d6[0]
+    vmlsl.s16   q9,d11,d2[2]
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+    vmlal.s16   q12,d14,d7[1]
+    vmlsl.s16   q13,d14,d5[3]
+    vmlal.s16   q14,d14,d4[1]
+    vmlsl.s16   q15,d14,d2[3]
+
+
+    vmlal.s16   q12,d15,d7[3]
+    vmlsl.s16   q13,d15,d7[1]
+    vmlal.s16   q14,d15,d6[3]
+    vmlsl.s16   q15,d15,d6[1]
+
+
+    vmlal.s16   q10,d12,d7[0]
+    vmlal.s16   q10,d13,d7[2]
+    vmlsl.s16   q11,d12,d5[0]
+    vmlsl.s16   q11,d13,d6[2]
+    vmlal.s16   q8,d12,d3[0]
+    vmlal.s16   q8,d13,d5[2]
+    vmlsl.s16   q9,d12,d1[0]
+    vmlsl.s16   q9,d13,d4[2]
+
+stage2_shift1:
+    vadd.s32    q4,q10,q12
+    vsub.s32    q5,q10,q12
+
+    vadd.s32    q6,q11,q13
+    vsub.s32    q12,q11,q13
+
+    vadd.s32    q7,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+    vqrshrn.s32 d30,q4,#shift_stage2_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d19,q5,#shift_stage2_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d31,q7,#shift_stage2_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d18,q13,#shift_stage2_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d12,q6,#shift_stage2_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d15,q12,#shift_stage2_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d13,q8,#shift_stage2_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d14,q14,#shift_stage2_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
+
+
+    vtrn.16     q15,q6
+    vtrn.16     q7,q9
+
+    vtrn.32     d30,d31
+    vtrn.32     d12,d13
+    vtrn.32     d14,d15
+    vtrn.32     d18,d19
+
+
+    vst1.16     {q15},[r0]!
+    vst1.16     {q6},[r0]!
+    vst1.16     {q7},[r0]!
+    vst1.16     {q9},[r0]!
+
+
+    mov         r1,r4
+
+
+
+
+
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+    vmull.s16   q12,d8,d2[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d8,d2[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d8,d3[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d8,d3[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d6[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d7[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d6[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d4[3]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmull.s16   q10,d10,d0[0]
+    vmlal.s16   q10,d11,d4[2]
+
+
+    vmull.s16   q11,d10,d0[0]
+    vmlal.s16   q11,d11,d5[2]
+
+    vmull.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d6[2]
+
+    vmull.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d7[2]
+
+    cmp         r12,r11
+    bhs         stage2_shift2
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+    vmlsl.s16   q12,d14,d4[3]
+    vmlsl.s16   q13,d14,d2[1]
+    vmlsl.s16   q14,d14,d0[1]
+    vmlsl.s16   q15,d14,d2[3]
+
+
+    vmlsl.s16   q12,d15,d0[3]
+    vmlsl.s16   q13,d15,d3[1]
+    vmlsl.s16   q14,d15,d6[3]
+    vmlal.s16   q15,d15,d5[3]
+
+
+    vmlsl.s16   q10,d12,d7[0]
+    vmlsl.s16   q10,d13,d2[2]
+    vmlsl.s16   q11,d12,d5[0]
+    vmlsl.s16   q11,d13,d0[2]
+    vmlsl.s16   q8,d12,d3[0]
+    vmlsl.s16   q8,d13,d3[2]
+    vmlsl.s16   q9,d12,d1[0]
+    vmlsl.s16   q9,d13,d6[2]
+
+    cmp         r12,r5
+    bhs         stage2_shift2
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+
+
+
+    vmlsl.s16   q12,d8,d4[1]                @// y1 * cos1(part of b0)
+    vmlal.s16   q13,d8,d7[1]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d2[3]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d7[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d9,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d6[3]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlsl.s16   q10,d10,d2[0]
+    vmlsl.s16   q10,d11,d6[2]
+
+
+    vmlsl.s16   q11,d10,d6[0]
+    vmlal.s16   q11,d11,d4[2]
+
+    vmlal.s16   q8,d10,d6[0]
+    vmlal.s16   q8,d11,d0[2]
+
+    vmlal.s16   q9,d10,d2[0]
+    vmlal.s16   q9,d11,d5[2]
+
+    cmp         r12,r6
+    bhs         stage2_shift2
+
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+
+
+
+    vmlal.s16   q12,d14,d2[3]
+    vmlal.s16   q13,d14,d3[3]
+    vmlsl.s16   q14,d14,d5[3]
+    vmlsl.s16   q15,d14,d0[3]
+
+
+    vmlal.s16   q12,d15,d1[3]
+    vmlsl.s16   q13,d15,d6[3]
+    vmlsl.s16   q14,d15,d0[3]
+    vmlal.s16   q15,d15,d7[3]
+
+
+    vmlal.s16   q10,d12,d5[0]
+    vmlal.s16   q10,d13,d0[2]
+    vmlal.s16   q11,d12,d1[0]
+    vmlal.s16   q11,d13,d6[2]
+    vmlal.s16   q8,d12,d7[0]
+    vmlsl.s16   q8,d13,d2[2]
+    vmlsl.s16   q9,d12,d3[0]
+    vmlsl.s16   q9,d13,d4[2]
+
+    cmp         r12,r9
+    bhs         stage2_shift2
+
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+
+    vmlal.s16   q12,d8,d6[1]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d1[1]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d7[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d0[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d5[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d4[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d2[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d7[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d7[2]
+
+
+    vmlsl.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d1[2]
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d5[2]
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d3[2]
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+
+    vmlsl.s16   q12,d14,d0[1]
+    vmlal.s16   q13,d14,d6[1]
+    vmlal.s16   q14,d14,d4[1]
+    vmlsl.s16   q15,d14,d1[1]
+
+
+    vmlsl.s16   q12,d15,d3[3]
+    vmlal.s16   q13,d15,d0[1]
+    vmlsl.s16   q14,d15,d5[1]
+    vmlsl.s16   q15,d15,d6[1]
+
+
+    vmlsl.s16   q10,d12,d3[0]
+    vmlsl.s16   q10,d13,d1[2]
+    vmlsl.s16   q11,d12,d7[0]
+    vmlal.s16   q11,d13,d3[2]
+    vmlal.s16   q8,d12,d1[0]
+    vmlal.s16   q8,d13,d7[2]
+    vmlsl.s16   q9,d12,d5[0]
+    vmlsl.s16   q9,d13,d2[2]
+
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+    vmlal.s16   q12,d8,d7[3]                @// y1 * cos1(part of b0)
+    vmlal.s16   q13,d8,d4[3]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d1[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d2[1]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d3[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d5[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d7[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d5[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlsl.s16   q10,d10,d6[0]
+    vmlal.s16   q10,d11,d5[2]
+
+
+    vmlal.s16   q11,d10,d2[0]
+    vmlal.s16   q11,d11,d7[2]
+
+    vmlsl.s16   q8,d10,d2[0]
+    vmlsl.s16   q8,d11,d4[2]
+
+    vmlal.s16   q9,d10,d6[0]
+    vmlal.s16   q9,d11,d1[2]
+
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+    vmlal.s16   q12,d14,d1[1]
+    vmlsl.s16   q13,d14,d0[3]
+    vmlal.s16   q14,d14,d1[3]
+    vmlsl.s16   q15,d14,d3[1]
+
+
+    vmlal.s16   q12,d15,d5[3]
+    vmlsl.s16   q13,d15,d5[1]
+    vmlal.s16   q14,d15,d4[3]
+    vmlsl.s16   q15,d15,d4[1]
+
+
+    vmlal.s16   q10,d12,d1[0]
+    vmlal.s16   q10,d13,d3[2]
+    vmlsl.s16   q11,d12,d3[0]
+    vmlsl.s16   q11,d13,d2[2]
+    vmlal.s16   q8,d12,d5[0]
+    vmlal.s16   q8,d13,d1[2]
+    vmlsl.s16   q9,d12,d7[0]
+    vmlsl.s16   q9,d13,d0[2]
+
+stage2_shift2:
+    vadd.s32    q4,q10,q12
+    vsub.s32    q5,q10,q12
+
+    vadd.s32    q6,q11,q13
+    vsub.s32    q12,q11,q13
+
+    vadd.s32    q7,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+    vqrshrn.s32 d30,q4,#shift_stage2_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d19,q5,#shift_stage2_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d31,q7,#shift_stage2_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d18,q13,#shift_stage2_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d12,q6,#shift_stage2_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d15,q12,#shift_stage2_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d13,q8,#shift_stage2_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d14,q14,#shift_stage2_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
+
+    vtrn.16     q15,q6
+    vtrn.16     q7,q9
+
+    vtrn.32     d30,d31
+    vtrn.32     d12,d13
+    vtrn.32     d14,d15
+    vtrn.32     d18,d19
+
+
+    vst1.16     {q15},[r0]!
+    vst1.16     {q6},[r0]!
+    vst1.16     {q7},[r0]!
+    vst1.16     {q9},[r0]!
+
+
+
+    mov         r1,r4
+
+
+
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+    vmull.s16   q12,d8,d4[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d8,d4[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d8,d5[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d8,d5[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d3[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d0[2]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmull.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d7[2]
+
+
+    vmull.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d6[2]
+
+    vmull.s16   q8,d10,d0[0]
+    vmlsl.s16   q8,d11,d5[2]
+
+    vmull.s16   q9,d10,d0[0]
+    vmlsl.s16   q9,d11,d4[2]
+
+    cmp         r12,r11
+    bhs         stage2_shift3
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+    vmlsl.s16   q12,d14,d5[1]
+    vmlsl.s16   q13,d14,d7[3]
+    vmlal.s16   q14,d14,d5[3]
+    vmlal.s16   q15,d14,d3[1]
+
+
+    vmlal.s16   q12,d15,d2[1]
+    vmlal.s16   q13,d15,d1[1]
+    vmlal.s16   q14,d15,d4[3]
+    vmlsl.s16   q15,d15,d7[3]
+
+
+    vmlsl.s16   q10,d12,d1[0]
+    vmlal.s16   q10,d13,d6[2]
+    vmlsl.s16   q11,d12,d3[0]
+    vmlal.s16   q11,d13,d3[2]
+    vmlsl.s16   q8,d12,d5[0]
+    vmlal.s16   q8,d13,d0[2]
+    vmlsl.s16   q9,d12,d7[0]
+    vmlal.s16   q9,d13,d2[2]
+
+    cmp         r12,r5
+    bhs         stage2_shift3
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+
+    vmlal.s16   q12,d8,d6[1]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d5[1]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d0[3]                @// y1 * sin3(part of b2)
+    vmlsl.s16   q15,d8,d3[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d1[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d4[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d6[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d0[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d2[0]
+    vmlsl.s16   q10,d11,d5[2]
+
+
+    vmlal.s16   q11,d10,d6[0]
+    vmlsl.s16   q11,d11,d0[2]
+
+    vmlsl.s16   q8,d10,d6[0]
+    vmlsl.s16   q8,d11,d4[2]
+
+    vmlsl.s16   q9,d10,d2[0]
+    vmlal.s16   q9,d11,d6[2]
+
+    cmp         r12,r6
+    bhs         stage2_shift3
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+
+
+    vmlsl.s16   q12,d14,d7[1]
+    vmlal.s16   q13,d14,d2[1]
+    vmlal.s16   q14,d14,d4[1]
+    vmlsl.s16   q15,d14,d5[1]
+
+
+    vmlal.s16   q12,d15,d0[3]
+    vmlal.s16   q13,d15,d7[1]
+    vmlsl.s16   q14,d15,d1[1]
+    vmlsl.s16   q15,d15,d6[1]
+
+
+    vmlsl.s16   q10,d12,d3[0]
+    vmlal.s16   q10,d13,d4[2]
+    vmlal.s16   q11,d12,d7[0]
+    vmlal.s16   q11,d13,d2[2]
+    vmlal.s16   q8,d12,d1[0]
+    vmlsl.s16   q8,d13,d6[2]
+    vmlal.s16   q9,d12,d5[0]
+    vmlsl.s16   q9,d13,d0[2]
+
+    cmp         r12,r9
+    bhs         stage2_shift3
+
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+    vmlsl.s16   q12,d8,d7[3]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d0[1]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d6[3]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d0[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d9,d5[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d2[3]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d3[2]
+
+
+    vmlsl.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d5[2]
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d1[2]
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlal.s16   q9,d11,d7[2]
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+
+    vmlal.s16   q12,d14,d6[3]
+    vmlal.s16   q13,d14,d3[3]
+    vmlsl.s16   q14,d14,d1[3]
+    vmlal.s16   q15,d14,d7[1]
+
+
+    vmlal.s16   q12,d15,d1[3]
+    vmlsl.s16   q13,d15,d2[3]
+    vmlal.s16   q14,d15,d7[1]
+    vmlal.s16   q15,d15,d4[1]
+
+
+    vmlsl.s16   q10,d12,d5[0]
+    vmlal.s16   q10,d13,d2[2]
+    vmlal.s16   q11,d12,d1[0]
+    vmlsl.s16   q11,d13,d7[2]
+    vmlsl.s16   q8,d12,d7[0]
+    vmlsl.s16   q8,d13,d3[2]
+    vmlsl.s16   q9,d12,d3[0]
+    vmlal.s16   q9,d13,d1[2]
+
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+    vmlsl.s16   q12,d8,d5[3]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d6[3]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d3[1]                @// y1 * sin3(part of b2)
+    vmlsl.s16   q15,d8,d0[1]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d2[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d9,d0[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d2[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlal.s16   q15,d9,d4[3]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d6[0]
+    vmlsl.s16   q10,d11,d1[2]
+
+
+    vmlsl.s16   q11,d10,d2[0]
+    vmlal.s16   q11,d11,d4[2]
+
+    vmlal.s16   q8,d10,d2[0]
+    vmlsl.s16   q8,d11,d7[2]
+
+    vmlsl.s16   q9,d10,d6[0]
+    vmlsl.s16   q9,d11,d5[2]
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+    vmlal.s16   q12,d14,d4[3]
+    vmlsl.s16   q13,d14,d6[1]
+    vmlal.s16   q14,d14,d7[3]
+    vmlal.s16   q15,d14,d6[3]
+
+
+    vmlal.s16   q12,d15,d3[3]
+    vmlsl.s16   q13,d15,d3[1]
+    vmlal.s16   q14,d15,d2[3]
+    vmlsl.s16   q15,d15,d2[1]
+
+
+    vmlsl.s16   q10,d12,d7[0]
+    vmlal.s16   q10,d13,d0[2]
+    vmlal.s16   q11,d12,d5[0]
+    vmlsl.s16   q11,d13,d1[2]
+    vmlsl.s16   q8,d12,d3[0]
+    vmlal.s16   q8,d13,d2[2]
+    vmlal.s16   q9,d12,d1[0]
+    vmlsl.s16   q9,d13,d3[2]
+
+stage2_shift3:
+    vadd.s32    q4,q10,q12
+    vsub.s32    q5,q10,q12
+
+    vadd.s32    q6,q11,q13
+    vsub.s32    q12,q11,q13
+
+    vadd.s32    q7,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+    vqrshrn.s32 d30,q4,#shift_stage2_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d19,q5,#shift_stage2_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d31,q7,#shift_stage2_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d18,q13,#shift_stage2_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d12,q6,#shift_stage2_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d15,q12,#shift_stage2_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d13,q8,#shift_stage2_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d14,q14,#shift_stage2_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
+
+    vtrn.16     q15,q6
+    vtrn.16     q7,q9
+
+    vtrn.32     d30,d31
+    vtrn.32     d12,d13
+    vtrn.32     d14,d15
+    vtrn.32     d18,d19
+
+
+    vst1.16     {q15},[r0]!
+    vst1.16     {q6},[r0]!
+    vst1.16     {q7},[r0]!
+    vst1.16     {q9},[r0]!
+
+
+
+
+    mov         r1,r4
+
+
+
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+    vmull.s16   q12,d8,d6[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d8,d6[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d8,d7[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d8,d7[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d2[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d4[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d5[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d7[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmull.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d3[2]
+
+
+    vmull.s16   q11,d10,d0[0]
+    vmlsl.s16   q11,d11,d2[2]
+
+    vmull.s16   q8,d10,d0[0]
+    vmlsl.s16   q8,d11,d1[2]
+
+    vmull.s16   q9,d10,d0[0]
+    vmlsl.s16   q9,d11,d0[2]
+
+    cmp         r12,r11
+    bhs         stage2_shift4
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+
+
+
+    vmlal.s16   q12,d14,d0[1]
+    vmlal.s16   q13,d14,d1[3]
+    vmlal.s16   q14,d14,d4[1]
+    vmlal.s16   q15,d14,d6[3]
+
+
+    vmlsl.s16   q12,d15,d4[1]
+    vmlsl.s16   q13,d15,d0[3]
+    vmlsl.s16   q14,d15,d2[3]
+    vmlsl.s16   q15,d15,d6[1]
+
+
+    vmlal.s16   q10,d12,d7[0]
+    vmlal.s16   q10,d13,d5[2]
+    vmlal.s16   q11,d12,d5[0]
+    vmlsl.s16   q11,d13,d7[2]
+    vmlal.s16   q8,d12,d3[0]
+    vmlsl.s16   q8,d13,d4[2]
+    vmlal.s16   q9,d12,d1[0]
+    vmlsl.s16   q9,d13,d1[2]
+
+    cmp         r12,r5
+    bhs         stage2_shift4
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+
+    vmlal.s16   q12,d8,d7[3]                @// y1 * cos1(part of b0)
+    vmlal.s16   q13,d8,d3[1]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d1[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d5[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d4[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d5[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d5[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlsl.s16   q10,d10,d2[0]
+    vmlal.s16   q10,d11,d1[2]
+
+
+    vmlsl.s16   q11,d10,d6[0]
+    vmlal.s16   q11,d11,d3[2]
+
+    vmlal.s16   q8,d10,d6[0]
+    vmlsl.s16   q8,d11,d7[2]
+
+    vmlal.s16   q9,d10,d2[0]
+    vmlsl.s16   q9,d11,d2[2]
+
+    cmp         r12,r6
+    bhs         stage2_shift4
+
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+
+
+
+    vmlsl.s16   q12,d14,d1[1]
+    vmlsl.s16   q13,d14,d7[3]
+    vmlal.s16   q14,d14,d1[3]
+    vmlal.s16   q15,d14,d4[3]
+
+
+    vmlal.s16   q12,d15,d2[1]
+    vmlal.s16   q13,d15,d5[1]
+    vmlsl.s16   q14,d15,d3[1]
+    vmlsl.s16   q15,d15,d4[1]
+
+
+    vmlsl.s16   q10,d12,d5[0]
+    vmlsl.s16   q10,d13,d7[2]
+    vmlsl.s16   q11,d12,d1[0]
+    vmlal.s16   q11,d13,d1[2]
+    vmlsl.s16   q8,d12,d7[0]
+    vmlal.s16   q8,d13,d5[2]
+    vmlal.s16   q9,d12,d3[0]
+    vmlsl.s16   q9,d13,d3[2]
+
+    cmp         r12,r9
+    bhs         stage2_shift4
+
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+    vmlsl.s16   q12,d8,d5[3]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d2[3]                @// y1 * cos3(part of b1)
+    vmlal.s16   q14,d8,d4[3]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d3[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d6[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlal.s16   q13,d9,d0[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d6[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d3[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlal.s16   q10,d10,d0[0]
+    vmlsl.s16   q10,d11,d0[2]
+
+
+    vmlsl.s16   q11,d10,d0[0]
+    vmlal.s16   q11,d11,d6[2]
+
+    vmlsl.s16   q8,d10,d0[0]
+    vmlal.s16   q8,d11,d2[2]
+
+    vmlal.s16   q9,d10,d0[0]
+    vmlsl.s16   q9,d11,d4[2]
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+
+    vmlal.s16   q12,d14,d3[1]
+    vmlsl.s16   q13,d14,d2[1]
+    vmlal.s16   q14,d14,d7[3]
+    vmlal.s16   q15,d14,d2[3]
+
+
+    vmlsl.s16   q12,d15,d0[3]
+    vmlal.s16   q13,d15,d4[3]
+    vmlal.s16   q14,d15,d6[3]
+    vmlsl.s16   q15,d15,d2[1]
+
+
+    vmlal.s16   q10,d12,d3[0]
+    vmlsl.s16   q10,d13,d6[2]
+    vmlal.s16   q11,d12,d7[0]
+    vmlsl.s16   q11,d13,d4[2]
+    vmlsl.s16   q8,d12,d1[0]
+    vmlal.s16   q8,d13,d0[2]
+    vmlal.s16   q9,d12,d5[0]
+    vmlsl.s16   q9,d13,d5[2]
+
+
+    vld1.16     {d10,d11},[r1]!
+    vld1.16     {d8,d9},[r1],r10
+
+
+
+
+    vmlal.s16   q12,d8,d3[3]                @// y1 * cos1(part of b0)
+    vmlsl.s16   q13,d8,d7[1]                @// y1 * cos3(part of b1)
+    vmlsl.s16   q14,d8,d5[1]                @// y1 * sin3(part of b2)
+    vmlal.s16   q15,d8,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlsl.s16   q12,d9,d7[1]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d6[1]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlal.s16   q14,d9,d3[3]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    vmlsl.s16   q10,d10,d6[0]
+    vmlal.s16   q10,d11,d2[2]
+
+
+    vmlal.s16   q11,d10,d2[0]
+    vmlsl.s16   q11,d11,d0[2]
+
+    vmlsl.s16   q8,d10,d2[0]
+    vmlal.s16   q8,d11,d3[2]
+
+    vmlal.s16   q9,d10,d6[0]
+    vmlsl.s16   q9,d11,d6[2]
+
+
+    vld1.16     {d12,d13},[r1]!
+    vld1.16     {d14,d15},[r1],r10
+
+
+
+    vmlsl.s16   q12,d14,d5[1]
+    vmlal.s16   q13,d14,d3[3]
+    vmlsl.s16   q14,d14,d2[1]
+    vmlal.s16   q15,d14,d0[3]
+
+
+    vmlal.s16   q12,d15,d1[3]
+    vmlsl.s16   q13,d15,d1[1]
+    vmlal.s16   q14,d15,d0[3]
+    vmlsl.s16   q15,d15,d0[1]
+
+
+    vmlsl.s16   q10,d12,d1[0]
+    vmlal.s16   q10,d13,d4[2]
+    vmlal.s16   q11,d12,d3[0]
+    vmlsl.s16   q11,d13,d5[2]
+    vmlsl.s16   q8,d12,d5[0]
+    vmlal.s16   q8,d13,d6[2]
+    vmlal.s16   q9,d12,d7[0]
+    vmlsl.s16   q9,d13,d7[2]
+
+stage2_shift4:
+    vadd.s32    q4,q10,q12
+    vsub.s32    q5,q10,q12
+
+    vadd.s32    q6,q11,q13
+    vsub.s32    q12,q11,q13
+
+    vadd.s32    q7,q8,q14
+    vsub.s32    q13,q8,q14
+
+
+    vadd.s32    q8,q9,q15
+    vsub.s32    q14,q9,q15
+
+
+    vqrshrn.s32 d30,q4,#shift_stage2_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d19,q5,#shift_stage2_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d31,q7,#shift_stage2_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d18,q13,#shift_stage2_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d12,q6,#shift_stage2_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d15,q12,#shift_stage2_idct  @// r6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d13,q8,#shift_stage2_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
+    vqrshrn.s32 d14,q14,#shift_stage2_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
+
+
+
+    vtrn.16     q15,q6
+    vtrn.16     q7,q9
+
+    vtrn.32     d30,d31
+    vtrn.32     d12,d13
+    vtrn.32     d14,d15
+    vtrn.32     d18,d19
+
+
+
+    vst1.16     {q15},[r0]!
+    vst1.16     {q6},[r0]!
+    vst1.16     {q7},[r0]!
+    vst1.16     {q9},[r0]!
+
+
+
+
+    sub         r0,r0,#256
+prediction_buffer:
+
+
+    vld1.16     {d12,d13},[r0]!
+    vld1.16     {d14,d15},[r0]!
+
+    add         r0,r0,#32
+
+    vld1.16     {d16,d17},[r0]!
+    vld1.16     {d18,d19},[r0]!
+    add         r0,r0,#32
+
+    vld1.16     {d20,d21},[r0]!
+    vld1.16     {d22,d23},[r0]!
+
+
+    add         r0,r0,#32
+
+    vld1.16     {d24,d25},[r0]!
+    vld1.16     {d26,d27},[r0]!
+
+
+
+
+
+@ d12 = r0 values 1-4
+@ d13 = r2 values 1-4
+@ d14 = r1 values 1-4
+@ d15 = r3 values 1-4
+
+@ d16 = r0 values 5-8
+@ d17 = r2 values 5-8
+@ d18 = r1 values 5-8
+@ d19 = r3 values 5-8
+
+@ d20 = r0 values 9-12
+@ d21 = r2 values 9-12
+@ d22 = r1 values 9-12
+@ d23 = r3 values 9-12
+
+@ d24 = r0 values 13-16
+@ d25 = r2 values 13-16
+@ d26 = r1 values 13-16
+@ d27 = r3 values 13-16
+
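+@ The vswp pairs below regroup the 4-value halves listed above so that each q
+@ register holds eight contiguous values of a single row (e.g. q6 = r0 1-8,
+@ q8 = r2 1-8) before the prediction rows are added and clipped.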
+    vswp        d13,d16
+    vswp        d21,d24
+    vswp        d15,d18
+    vswp        d23,d26
+
+
+    vld1.8      {d8,d9},[r2],r8
+    vld1.8      {d10,d11},[r2],r8
+    vld1.8      {d28,d29},[r2],r8
+    vld1.8      {d30,d31},[r2],r8
+
+
+    vaddw.u8    q6,q6,d8
+    vaddw.u8    q10,q10,d9
+    vaddw.u8    q7,q7,d10
+    vaddw.u8    q11,q11,d11
+    vaddw.u8    q8,q8,d28
+    vaddw.u8    q12,q12,d29
+    vaddw.u8    q9,q9,d30
+    vaddw.u8    q13,q13,d31
+    sub         r2,r2,r8,lsl #2
+    add         r2,r2,#16
+    vqmovun.s16 d12,q6
+    vqmovun.s16 d13,q10
+    vqmovun.s16 d20,q7
+    vqmovun.s16 d21,q11
+    vqmovun.s16 d14,q8
+    vqmovun.s16 d15,q12
+    vqmovun.s16 d22,q9
+    vqmovun.s16 d23,q13
+
+
+    vst1.8      {d12,d13},[r3],r7
+    vst1.8      {d20,d21},[r3],r7
+    vst1.8      {d14,d15},[r3],r7
+    vst1.8      {d22,d23},[r3],r7
+
+
+    sub         r3,r3,r7,lsl #2
+    add         r3,r3,#16
+
+    vld1.16     {d12,d13},[r0]!
+    vld1.16     {d14,d15},[r0]!
+
+    sub         r0,r0,#96
+
+    vld1.16     {d16,d17},[r0]!
+    vld1.16     {d18,d19},[r0]!
+    sub         r0,r0,#96
+
+    vld1.16     {d20,d21},[r0]!
+    vld1.16     {d22,d23},[r0]!
+
+
+    sub         r0,r0,#96
+
+    vld1.16     {d24,d25},[r0]!
+    vld1.16     {d26,d27},[r0]!
+
+
+    sub         r0,r0,#64
+
+
+
+
+    vswp        d13,d16
+    vswp        d21,d24
+    vswp        d15,d18
+    vswp        d23,d26
+
+
+    vld1.8      {d8,d9},[r2],r8
+    vld1.8      {d10,d11},[r2],r8
+    vld1.8      {d28,d29},[r2],r8
+    vld1.8      {d30,d31},[r2],r8
+
+
+    vaddw.u8    q6,q6,d8
+    vaddw.u8    q10,q10,d9
+    vaddw.u8    q7,q7,d10
+    vaddw.u8    q11,q11,d11
+    vaddw.u8    q8,q8,d28
+    vaddw.u8    q12,q12,d29
+    vaddw.u8    q9,q9,d30
+    vaddw.u8    q13,q13,d31
+    sub         r2,r2,#16
+
+    vqmovun.s16 d12,q6
+    vqmovun.s16 d13,q10
+    vqmovun.s16 d20,q7
+    vqmovun.s16 d21,q11
+    vqmovun.s16 d14,q8
+    vqmovun.s16 d15,q12
+    vqmovun.s16 d22,q9
+    vqmovun.s16 d23,q13
+
+
+    vst1.8      {d12,d13},[r3],r7
+    vst1.8      {d20,d21},[r3],r7
+    vst1.8      {d14,d15},[r3],r7
+    vst1.8      {d22,d23},[r3],r7
+
+    sub         r3,r3,#16
+
+    subs        r14,r14,#1
+    bne         dct_stage2
+    ldmfd       sp!,{r0-r12,pc}
+
+
+
+
+
diff --git a/common/arm/ihevc_itrans_recon_4x4.s b/common/arm/ihevc_itrans_recon_4x4.s
new file mode 100644
index 0000000..c955502
--- /dev/null
+++ b/common/arm/ihevc_itrans_recon_4x4.s
@@ -0,0 +1,232 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ *  ihevc_itrans_recon_4x4.s
+@ *
+@ * @brief
+@ *  contains function definitions for single stage  inverse transform
+@ *
+@ * @author
+@ *  naveen sr
+@ *
+@ * @par list of functions:
+@ *  - ihevc_itrans_recon_4x4()
+@ *
+@ * @remarks
+@ *  none
+@ *
+@ *******************************************************************************
+@*/
+@ /**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ *  this function performs inverse transform  and reconstruction for 4x4
+@ * input block
+@ *
+@ * @par description:
+@ *  performs inverse transform and adds the prediction  data and clips output
+@ * to 8 bit
+@ *
+@ * @param[in] pi2_src
+@ *  input 4x4 coefficients
+@ *
+@ * @param[in] pi2_tmp
+@ *  temporary 4x4 buffer for storing inverse
+@ *  transform 1st stage output
+@ *
+@ * @param[in] pu1_pred
+@ *  prediction 4x4 block
+@ *
+@ * @param[out] pu1_dst
+@ *  output 4x4 block
+@ *
+@ * @param[in] src_strd
+@ *  input stride
+@ *
+@ * @param[in] pred_strd
+@ *  prediction stride
+@ *
+@ * @param[in] dst_strd
+@ *  output stride
+@ *
+@ * @param[in] shift
+@ *  output shift
+@ *
+@ * @param[in] zero_cols
+@ *  zero columns in pi2_src
+@ *
+@ * @returns  void
+@ *
+@ * @remarks
+@ *  none
+@ *
+@ *******************************************************************************
+@ */
+@void ihevc_itrans_recon_4x4(word16 *pi2_src,
+@       word16 *pi2_tmp,
+@       uword8 *pu1_pred,
+@       uword8 *pu1_dst,
+@       word32 src_strd,
+@       word32 pred_strd,
+@       word32 dst_strd,
+@       word32 zero_cols)
+@**************variables vs registers*************************
+@   r0 => *pi2_src
+@   r1 => *pi2_tmp
+@   r2 => *pu1_pred
+@   r3 => *pu1_dst
+@   r4 => src_strd
+@   r5 => pred_strd
+@   r6 => dst_strd
+@   r7 => zero_cols
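+
+@ A minimal C sketch of the flow implemented below (an assumption drawn from
+@ the inline comments, not a copy of the reference ihevc_itrans.c; the
+@ CLIP_* macros are illustrative names):
+@
+@   /* first (column) stage, shift_stage1_idct = 7, rounding 64 */
+@   o0 = 83 * src[1] + 36 * src[3];
+@   o1 = 36 * src[1] - 83 * src[3];
+@   e0 = 64 * (src[0] + src[2]);
+@   e1 = 64 * (src[0] - src[2]);
+@   tmp[0] = CLIP_S16((e0 + o0 + 64) >> 7);
+@   tmp[1] = CLIP_S16((e1 + o1 + 64) >> 7);
+@   tmp[2] = CLIP_S16((e1 - o1 + 64) >> 7);
+@   tmp[3] = CLIP_S16((e0 - o0 + 64) >> 7);
+@   /* transpose, repeat per row with shift_stage2_idct = 12 (rounding 2048),
+@      then pu1_dst = CLIP_U8(pu1_pred + residue) */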
+
+
+.text
+.align 4
+
+
+.set shift_stage1_idct ,   7
+.set shift_stage2_idct ,   12
+
+
+
+.globl ihevc_itrans_recon_4x4_a9q
+
+.extern g_ai2_ihevc_trans_4_transpose
+
+g_ai2_ihevc_trans_4_transpose_addr:
+.long g_ai2_ihevc_trans_4_transpose - ulbl1 - 8
+
+.type ihevc_itrans_recon_4x4_a9q, %function
+
+ihevc_itrans_recon_4x4_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r8,g_ai2_ihevc_trans_4_transpose_addr
+ulbl1:
+    add         r8,r8,pc
+
+    ldr         r4,[sp,#40]                 @loading src_strd
+    ldr         r5,[sp,#44]                 @loading pred_strd
+    add         r4,r4,r4                    @ src_strd in terms of word16
+
+    ldr         r6,[sp,#48]                 @loading dst_strd
+    ldr         r7,[sp,#52]                 @loading zero_cols
+    add         r9,r0,r4                    @ pi2_src[0] + src_strd
+
+
+
+    vld1.16     d4,[r8]                     @loading first row of g_ai2_ihevc_trans_4_transpose
+    @ d4 = {36,64,83,64}
+    @index = 3  2  1  0
+    add         r10,r9,r4, lsl #1           @ 3*src_strd
+    add         r4,r4,r4
+    vld1.16     d1,[r9]                     @loading pi2_src 2nd row
+    vld1.16     d3,[r10]                    @loading pi2_src 4th row
+    vld1.16     d0,[r0],r4                  @loading pi2_src 1st row
+    vld1.16     d2,[r0],r4                  @loading pi2_src 3rd row
+
+
+    @ first stage computation starts
+    vmull.s16   q3,d1,d4[1]                 @83 * pi2_src[1]
+    vmlal.s16   q3,d3,d4[3]                 @o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
+    vmull.s16   q4,d1,d4[3]                 @36 * pi2_src[1]
+    vld1.32     d22[0], [r2],r5
+    vmlsl.s16   q4,d3,d4[1]                 @o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+
+    vaddl.s16   q5,d0,d2                    @pi2_src[0] + pi2_src[2]
+    vsubl.s16   q6,d0,d2                    @pi2_src[0] - pi2_src[2]
+    vshl.s32    q5,q5,#6                    @e[0] = 64*(pi2_src[0] + pi2_src[2])
+    vshl.s32    q6,q6,#6                    @e[1] = 64*(pi2_src[0] - pi2_src[2])
+
+    vadd.s32    q7,q5,q3                    @(e[0] + o[0])
+    vadd.s32    q8,q6,q4                    @(e[1] + o[1])
+    vsub.s32    q9,q6,q4                    @(e[1] - o[1])
+    vsub.s32    q10,q5,q3                   @(e[0] - o[0])
+
+    vqrshrn.s32 d0,q7,#shift_stage1_idct    @pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift)
+    vqrshrn.s32 d1,q8,#shift_stage1_idct    @pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift)
+    vqrshrn.s32 d2,q9,#shift_stage1_idct    @pi2_out[2] = clip_s16((e[1] - o[1] + add)>>shift)
+    vqrshrn.s32 d3,q10,#shift_stage1_idct   @pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift)
+
+    vtrn.16     d0,d1
+    vtrn.16     d2,d3
+    vtrn.32     d0,d2
+    vtrn.32     d1,d3
+
+    @ first stage ends
+    @ output in d0,d1,d2,d3
+    @ second stage starts
+    vmull.s16   q3,d1,d4[1]                 @83 * pi2_src[1]
+    vld1.32     d22[1], [r2],r5
+    vmlal.s16   q3,d3,d4[3]                 @o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
+    vmull.s16   q4,d1,d4[3]                 @36 * pi2_src[1]
+    vmlsl.s16   q4,d3,d4[1]                 @o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+    vld1.32     d23[0], [r2],r5
+
+    vaddl.s16   q5,d0,d2                    @pi2_src[0] + pi2_src[2]
+    vsubl.s16   q6,d0,d2                    @pi2_src[0] - pi2_src[2]
+    vshl.s32    q5,q5,#6                    @e[0] = 64*(pi2_src[0] + pi2_src[2])
+    vshl.s32    q6,q6,#6                    @e[1] = 64*(pi2_src[0] - pi2_src[2])
+
+
+    vadd.s32    q7,q5,q3                    @(e[0] + o[0])
+    vadd.s32    q8,q6,q4                    @(e[1] + o[1])
+    vsub.s32    q9,q6,q4                    @(e[1] - o[1])
+    vsub.s32    q10,q5,q3                   @(e[0] - o[0])
+
+    vqrshrn.s32 d0,q7,#shift_stage2_idct    @pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift)
+    vqrshrn.s32 d1,q8,#shift_stage2_idct    @pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift)
+    vqrshrn.s32 d2,q9,#shift_stage2_idct    @pi2_out[2] = clip_s16((e[1] - o[1] + add)>>shift)
+    vqrshrn.s32 d3,q10,#shift_stage2_idct   @pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift)
+    vld1.32     d23[1], [r2],r5
+
+    vtrn.16     d0,d1
+    vtrn.16     d2,d3
+    vtrn.32     d0,d2
+    vtrn.32     d1,d3
+    @ second stage computation ends
+    @ output in d0,d1,d2,d3
+
+    @ loading pred
+
+    vaddw.u8    q0,q0,d22                   @ pi2_out(16bit) + pu1_pred(8bit)
+    vaddw.u8    q1,q1,d23                   @ pi2_out(16bit) + pu1_pred(8bit)
+    vqmovun.s16 d0,q0                       @ clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+    vqmovun.s16 d1,q1                       @ clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+
+    @ storing destination
+    vst1.32     {d0[0]},[r3],r6
+    vst1.32     {d0[1]},[r3],r6
+    vst1.32     {d1[0]},[r3],r6
+    vst1.32     {d1[1]},[r3],r6
+
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
diff --git a/common/arm/ihevc_itrans_recon_4x4_ttype1.s b/common/arm/ihevc_itrans_recon_4x4_ttype1.s
new file mode 100644
index 0000000..ab65dae
--- /dev/null
+++ b/common/arm/ihevc_itrans_recon_4x4_ttype1.s
@@ -0,0 +1,236 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ *  ihevc_itrans_recon_4x4_ttype1.s
+@ *
+@ * @brief
+@ *  contains function definitions for inverse transform  and reconstruction
+@ *
+@ *
+@ * @author
+@ *  naveen sr
+@ *
+@ * @par list of functions:
+@ *  - ihevc_itrans_recon_4x4_ttype1()
+@ *
+@ * @remarks
+@ *  none
+@ *
+@ *******************************************************************************
+@ */
+
+@/* all the functions here are replicated from ihevc_itrans.c and modified to */
+@/* include reconstruction */
+@
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ *  this function performs inverse transform type 1 (dst)  and reconstruction
+@ * for 4x4 input block
+@ *
+@ * @par description:
+@ *  performs inverse transform and adds the prediction  data and clips output
+@ * to 8 bit
+@ *
+@ * @param[in] pi2_src
+@ *  input 4x4 coefficients
+@ *
+@ * @param[in] pi2_tmp
+@ *  temporary 4x4 buffer for storing inverse
+@ *  transform 1st stage output
+@ *
+@ * @param[in] pu1_pred
+@ *  prediction 4x4 block
+@ *
+@ * @param[out] pu1_dst
+@ *  output 4x4 block
+@ *
+@ * @param[in] src_strd
+@ *  input stride
+@ *
+@ * @param[in] pred_strd
+@ *  prediction stride
+@ *
+@ * @param[in] dst_strd
+@ *  output stride
+@ *
+@ * @param[in] zero_cols
+@ *  zero columns in pi2_src
+@ *
+@ * @returns  void
+@ *
+@ * @remarks
+@ *  none
+@ *
+@ *******************************************************************************
+@ */
+@void ihevc_itrans_recon_4x4_ttype1(word16 *pi2_src,
+@       word16 *pi2_tmp,
+@       uword8 *pu1_pred,
+@       uword8 *pu1_dst,
+@       word32 src_strd,
+@       word32 pred_strd,
+@       word32 dst_strd,
+@       word32 zero_cols)
+
+@**************variables vs registers*************************
+@   r0 => *pi2_src
+@   r1 => *pi2_tmp
+@   r2 => *pu1_pred
+@   r3 => *pu1_dst
+@   r4 => src_strd
+@   r5 => pred_strd
+@   r6 => dst_strd
+@   r7 => zero_cols
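+
+@ DST basis applied twice, once per direction (assembled from the inline
+@ comments below; a sketch, not the reference C):
+@
+@   out[0] = 29*x0 + 74*x1 + 84*x2 + 55*x3
+@   out[1] = 55*x0 + 74*x1 - 29*x2 - 84*x3
+@   out[2] = 74*x0          - 74*x2 + 74*x3
+@   out[3] = 84*x0 - 74*x1 + 55*x2 - 29*x3
+@
+@ Each stage rounds with vqrshrn (>>7, then >>12) and the final residue is
+@ added to pu1_pred and clipped to 8 bit.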
+
+.text
+.align 4
+
+
+
+
+.set shift_stage1_idct ,   7
+.set shift_stage2_idct ,   12
+
+.globl ihevc_itrans_recon_4x4_ttype1_a9q
+
+.type ihevc_itrans_recon_4x4_ttype1_a9q, %function
+
+ihevc_itrans_recon_4x4_ttype1_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    ldr         r4,[sp,#40]                 @loading src_strd
+    ldr         r5,[sp,#44]                 @loading pred_strd
+    ldr         r6,[sp,#48]                 @loading dst_strd
+    ldr         r7,[sp,#52]                 @loading zero_cols
+
+    add         r4,r4,r4                    @ src_strd in terms of word16
+
+    mov         r8,#29
+    mov         r9,#55
+    mov         r10,#74
+    mov         r11,#84
+    vmov.i16    d4[0],r8
+    vld1.16     d0,[r0],r4                  @loading pi2_src 1st row
+    vmov.i16    d4[1],r9
+    vld1.16     d1,[r0],r4                  @loading pi2_src 2nd row
+    vmov.i16    d4[2],r10
+    vld1.16     d2,[r0],r4                  @loading pi2_src 3rd row
+    vmov.i16    d4[3],r11
+    vld1.16     d3,[r0],r4                  @loading pi2_src 4th row
+
+    @ first stage computation starts
+    vmull.s16   q3,d1,d4[2]                 @74 * pi2_src[1]
+    vmlal.s16   q3,d0,d4[0]                 @74 * pi2_src[1] + 29 * pi2_src[0]
+    vmlal.s16   q3,d3,d4[1]                 @74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
+    vmlal.s16   q3,d2,d4[3]                 @pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
+
+    vmull.s16   q4,d1,d4[2]                 @74 * pi2_src[1]
+    vmlal.s16   q4,d0,d4[1]                 @74 * pi2_src[1] + 55 * pi2_src[0]
+    vmlsl.s16   q4,d2,d4[0]                 @74 * pi2_src[1] + 55 * pi2_src[0] -  29 * pi2_src[2]
+    vmlsl.s16   q4,d3,d4[3]                 @pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]
+
+    vmull.s16   q5,d0,d4[2]                 @ 74 * pi2_src[0]
+    vmlsl.s16   q5,d2,d4[2]                 @ 74 * pi2_src[0] - 74 * pi2_src[2]
+    vmlal.s16   q5,d3,d4[2]                 @pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+
+    vmull.s16   q6,d2,d4[1]                 @ 55 * pi2_src[2]
+    vmlsl.s16   q6,d1,d4[2]                 @ 55 * pi2_src[2] - 74 * pi2_src[1]
+    vmlsl.s16   q6,d3,d4[0]                 @ - 74 * pi2_src[1] +   55 * pi2_src[2] - 29 * pi2_src[3]
+    vmlal.s16   q6,d0,d4[3]                 @pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+
+    vqrshrn.s32 d14,q3,#shift_stage1_idct   @ (pi2_out[0] + rounding ) >> shift_stage1_idct
+    vqrshrn.s32 d15,q4,#shift_stage1_idct   @ (pi2_out[1] + rounding ) >> shift_stage1_idct
+    vqrshrn.s32 d16,q5,#shift_stage1_idct   @ (pi2_out[2] + rounding ) >> shift_stage1_idct
+    vqrshrn.s32 d17,q6,#shift_stage1_idct   @ (pi2_out[3] + rounding ) >> shift_stage1_idct
+    vld1.32     d18[0], [r2],r5
+
+    vtrn.16     d14,d15
+    vtrn.16     d16,d17
+    vtrn.32     d14,d16
+    vtrn.32     d15,d17
+    @ output in d14,d15,d16,d17
+    @ first stage computation ends
+
+    @ second stage computation starts  :  copy pasting 1st stage
+    @ register changes
+    @ d14 - d0
+    @ d15 - d1
+    @ d16 - d2
+    @ d17 - d3
+    vld1.32     d18[1], [r2],r5
+    vmull.s16   q3,d15,d4[2]                @74 * pi2_src[1]
+    vmlal.s16   q3,d14,d4[0]                @74 * pi2_src[1] + 29 * pi2_src[0]
+    vmlal.s16   q3,d17,d4[1]                @74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
+    vmlal.s16   q3,d16,d4[3]                @pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
+
+    vmull.s16   q4,d15,d4[2]                @74 * pi2_src[1]
+    vmlal.s16   q4,d14,d4[1]                @74 * pi2_src[1] + 55 * pi2_src[0]
+    vmlsl.s16   q4,d16,d4[0]                @74 * pi2_src[1] + 55 * pi2_src[0] -  29 * pi2_src[2]
+    vmlsl.s16   q4,d17,d4[3]                @pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]
+
+    vmull.s16   q5,d14,d4[2]                @ 74 * pi2_src[0]
+    vmlsl.s16   q5,d16,d4[2]                @ 74 * pi2_src[0] - 74 * pi2_src[2]
+    vmlal.s16   q5,d17,d4[2]                @pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+    vld1.32     d19[0], [r2],r5
+
+    vmull.s16   q6,d16,d4[1]                @ 55 * pi2_src[2]
+    vmlsl.s16   q6,d15,d4[2]                @  - 74 * pi2_src[1] +   55 * pi2_src[2]
+    vmlsl.s16   q6,d17,d4[0]                @ - 74 * pi2_src[1] +   55 * pi2_src[2] - 29 * pi2_src[3]
+    vmlal.s16   q6,d14,d4[3]                @pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+
+    vqrshrn.s32 d0,q3,#shift_stage2_idct    @ (pi2_out[0] + rounding ) >> shift_stage2_idct
+    vqrshrn.s32 d1,q4,#shift_stage2_idct    @ (pi2_out[1] + rounding ) >> shift_stage2_idct
+    vqrshrn.s32 d2,q5,#shift_stage2_idct    @ (pi2_out[2] + rounding ) >> shift_stage2_idct
+    vqrshrn.s32 d3,q6,#shift_stage2_idct    @ (pi2_out[3] + rounding ) >> shift_stage2_idct
+    vld1.32     d19[1], [r2],r5
+    vtrn.16     d0,d1
+    vtrn.16     d2,d3
+    vtrn.32     d0,d2
+    vtrn.32     d1,d3
+    @ output in d0,d1,d2,d3
+    @ second stage computation ends
+
+    @ loading pred
+
+    vaddw.u8    q0,q0,d18                   @ pi2_out(16bit) + pu1_pred(8bit)
+    vqmovun.s16 d0,q0                       @ clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+    vaddw.u8    q1,q1,d19                   @ pi2_out(16bit) + pu1_pred(8bit)
+    vqmovun.s16 d1,q1                       @ clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+
+    @ storing destination
+    vst1.32     {d0[0]},[r3],r6
+    vst1.32     {d0[1]},[r3],r6
+    vst1.32     {d1[0]},[r3],r6
+    vst1.32     {d1[1]},[r3],r6
+
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_itrans_recon_8x8.s b/common/arm/ihevc_itrans_recon_8x8.s
new file mode 100644
index 0000000..440512a
--- /dev/null
+++ b/common/arm/ihevc_itrans_recon_8x8.s
@@ -0,0 +1,934 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ *  ihevc_itrans_recon_8x8.s
+@ *
+@ * @brief
+@ *  contains function definitions for single stage  inverse transform
+@ *
+@ * @author
+@ *  anand s
+@ *
+@ * @par list of functions:
+@ *  - ihevc_itrans_recon_8x8()
+@ *
+@ * @remarks
+@ *  none
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ *  this function performs inverse transform  and reconstruction for 8x8
+@ * input block
+@ *
+@ * @par description:
+@ *  performs inverse transform and adds the prediction  data and clips output
+@ * to 8 bit
+@ *
+@ * @param[in] pi2_src
+@ *  input 8x8 coefficients
+@ *
+@ * @param[in] pi2_tmp
+@ *  temporary 8x8 buffer for storing inverse transform
+@ *  1st stage output
+@ *
+@ * @param[in] pu1_pred
+@ *  prediction 8x8 block
+@ *
+@ * @param[out] pu1_dst
+@ *  output 8x8 block
+@ *
+@ * @param[in] src_strd
+@ *  input stride
+@ *
+@ * @param[in] pred_strd
+@ *  prediction stride
+@ *
+@ * @param[in] dst_strd
+@ *  output stride
+@ *
+@ * @param[in] shift
+@ *  output shift
+@ *
+@ * @param[in] zero_cols
+@ *  zero columns in pi2_src
+@ *
+@ * @returns  void
+@ *
+@ * @remarks
+@ *  none
+@ *
+@ *******************************************************************************
+@ */
+
+@void ihevc_itrans_recon_8x8(word16 *pi2_src,
+@                            word16 *pi2_tmp,
+@                            uword8 *pu1_pred,
+@                            uword8 *pu1_dst,
+@                            word32 src_strd,
+@                            word32 pred_strd,
+@                            word32 dst_strd,
+@                            word32 zero_cols,
+@                            word32 zero_rows)
+
+@**************variables vs registers*************************
+@   r0 => *pi2_src
+@   r1 => *pi2_tmp
+@   r2 => *pu1_pred
+@   r3 => *pu1_dst
+@   src_strd
+@   pred_strd
+@   dst_strd
+@   zero_cols
+@   zero_rows
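+
+@ For reference, a minimal C sketch of the flow implemented below (names
+@ are illustrative; the 8-point butterfly itself is spelled out in the
+@ comments on the NEON instructions):
+@
+@   /* stage 1: 8-point IDCT on columns, (x + rnd) >> shift_stage1_idct (7)  */
+@   /* transpose, stage 2: IDCT on rows, (x + rnd) >> shift_stage2_idct (12) */
+@   for(row = 0; row < 8; row++)
+@       for(col = 0; col < 8; col++)
+@           pu1_dst[row * dst_strd + col] =
+@               CLIP_U8(idct_out[row][col] + pu1_pred[row * pred_strd + col]);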
+
+
+
+.text
+.align 4
+
+
+
+
+.set width_x_size_x5 ,   40
+.set width_x_size_x2 ,   32
+.set shift_stage1_idct ,   7
+.set shift_stage2_idct ,   12
+
+.globl ihevc_itrans_recon_8x8_a9q
+
+.extern g_ai2_ihevc_trans_8_transpose
+
+g_ai2_ihevc_trans_8_transpose_addr:
+.long g_ai2_ihevc_trans_8_transpose - ulbl1 - 8
+
+.type ihevc_itrans_recon_8x8_a9q, %function
+
+ihevc_itrans_recon_8x8_a9q:
+@// register usage - loading and until idct of columns
+@// cosine constants    -   d0
+@// sine constants      -   d1
+@// row 0 first half    -   d2      -   y0
+@// row 1 first half    -   d6      -   y1
+@// row 2 first half    -   d3      -   y2
+@// row 3 first half    -   d7      -   y3
+@// row 4 first half    -   d10     -   y4
+@// row 5 first half    -   d14     -   y5
+@// row 6 first half    -   d11     -   y6
+@// row 7 first half    -   d15     -   y7
+
+@// row 0 second half   -   d4      -   y0
+@// row 1 second half   -   d8      -   y1
+@// row 2 second half   -   d5      -   y2
+@// row 3 second half   -   d9      -   y3
+@// row 4 second half   -   d12     -   y4
+@// row 5 second half   -   d16     -   y5
+@// row 6 second half   -   d13     -   y6
+@// row 7 second half   -   d17     -   y7
+
+    @// copy the input pointer to another register
+    @// step 1 : load all constants
+    stmfd       sp!,{r4-r12,lr}
+    add         sp,sp,#40
+    ldr         r8,[sp,#4]                  @ prediction stride
+    ldr         r7,[sp,#8]                  @ destination stride
+    ldr         r6,[sp]                     @ src stride
+    ldr         r12,[sp,#12]
+    ldr         r11,[sp,#16]
+    mov         r6,r6,lsl #1                @ x sizeof(word16)
+    add         r9,r0,r6, lsl #1            @ 2 rows
+
+    add         r10,r6,r6, lsl #1           @ 3 rows
+
+    sub         r10,r10, #8                 @ - 4 cols * sizeof(word16)
+    sub         r5,r6, #8                   @ src_strd - 4 cols * sizeof(word16)
+
+@   ldr         r14,=g_imp4d_cxa8_idct_q15
+    ldr         r14,g_ai2_ihevc_trans_8_transpose_addr
+ulbl1:
+    add         r14,r14,pc
+    vld1.16     {d0,d1},[r14]               @//d0,d1 are used for storing the constant data
+
+    @//step 2 load all the input data
+    @//step 3 operate on the first 4 columns at a time
+
+    and         r11,r11,#0xff
+    and         r12,r12,#0xff
+
+    cmp         r11,#0xf0
+    bge         skip_last4_rows
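+
+    @ zero_rows check: if the upper nibble of zero_rows is all 1s, the last
+    @ four rows of pi2_src are all zero, so their loads and MACs are skipped.
+    @ In C terms (sketch): if((zero_rows & 0xf0) == 0xf0) goto skip_last4_rows;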
+
+
+    vld1.16     d2,[r0]!
+    vld1.16     d3,[r9]!
+    vld1.16     d4,[r0],r5
+    vmull.s16   q10,d2,d0[0]                @// y0 * cos4(part of c0 and c1)
+    vld1.16     d5,[r9],r5
+    vmull.s16   q9,d3,d1[2]                 @// y2 * sin2 (q3 is freed by this time)(part of d1)
+    vld1.16     d6,[r0]!
+    vld1.16     d7,[r9]!
+    vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
+    vld1.16     d8,[r0],r10
+    vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
+    vld1.16     d9,[r9],r10
+    vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
+    vld1.16     d10,[r0]!
+    vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)
+    vld1.16     d11,[r9]!
+    vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vld1.16     d12,[r0],r5
+    vmlsl.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vld1.16     d13,[r9],r5
+    vmlsl.s16   q14,d7,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vld1.16     d14,[r0]!
+    vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+    vld1.16     d15,[r9]!
+    vmull.s16   q11,d10,d0[0]               @// y4 * cos4(part of c0 and c1)
+    vld1.16     d16,[r0],r10
+    vmull.s16   q3,d3,d0[2]                 @// y2 * cos2(part of d0)
+    vld1.16     d17,[r9],r10
+
+    @/* the following load sequence is used when the source is not aligned */
+@// vld1.16     d2,[r0]!
+@// vld1.16     d3,[r2]!
+@// vld1.16     d4,[r0]!
+@// vld1.16     d5,[r2]!
+@// vld1.16     d6,[r0]!
+@// vld1.16     d7,[r2]!
+@// vld1.16     d8,[r0],r3
+@// vld1.16     d9,[r2],r3
+@// vld1.16     d10,[r0]!
+@// vld1.16     d11,[r2]!
+@// vld1.16     d12,[r0]!
+@// vld1.16     d13,[r2]!
+@// vld1.16     d14,[r0]!
+@// vld1.16     d15,[r2]!
+@// vld1.16     d16,[r0],r3
+@// vld1.16     d17,[r2],r3
+
+
+
+
+    vmlal.s16   q12,d14,d1[1]               @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+    vmlsl.s16   q13,d14,d0[1]               @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+    vmlal.s16   q14,d14,d1[3]               @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+    vmlal.s16   q15,d14,d0[3]               @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+    vmlsl.s16   q9,d11,d0[2]                @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+    vmlal.s16   q3,d11,d1[2]                @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+    vadd.s32    q5,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+    vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+    vmlal.s16   q12,d15,d1[3]               @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
+    vmlsl.s16   q13,d15,d1[1]               @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
+    vmlal.s16   q14,d15,d0[3]               @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
+    vmlsl.s16   q15,d15,d0[1]               @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
+
+    vadd.s32    q7,q5,q3                    @// a0 = c0 + d0(part of r0,r7)
+    vsub.s32    q5,q5,q3                    @// a3 = c0 - d0(part of r3,r4)
+    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
+    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)
+
+    vadd.s32    q10,q7,q12                  @// a0 + b0(part of r0)
+    vsub.s32    q3,q7,q12                   @// a0 - b0(part of r7)
+
+    vadd.s32    q12,q11,q14                 @// a2 + b2(part of r2)
+    vsub.s32    q11,q11,q14                 @// a2 - b2(part of r5)
+
+    vadd.s32    q14,q9,q13                  @// a1 + b1(part of r1)
+    vsub.s32    q9,q9,q13                   @// a1 - b1(part of r6)
+
+    vadd.s32    q13,q5,q15                  @// a3 + b3(part of r3)
+    vsub.s32    q15,q5,q15                  @// a3 - b3(part of r4)
+
+    vqrshrn.s32 d2,q10,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d15,q3,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d3,q12,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d14,q11,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d6,q14,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d11,q9,#shift_stage1_idct   @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d7,q13,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d10,q15,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+    b           last4_cols
+
+
+
+skip_last4_rows:
+
+
+
+    vld1.16     d2,[r0]!
+    vld1.16     d3,[r9]!
+    vld1.16     d4,[r0],r5
+    vld1.16     d5,[r9],r5
+    vld1.16     d6,[r0]!
+    vld1.16     d7,[r9]!
+    vld1.16     d8,[r0],r10
+    vld1.16     d9,[r9],r10
+
+
+
+    vmov.s16    q6,#0
+    vmov.s16    q8,#0
+
+
+
+
+    vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d7,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+    vmull.s16   q9,d3,d1[2]                 @// y2 * sin2 (q3 is freed by this time)(part of d1)
+    vmull.s16   q3,d3,d0[2]                 @// y2 * cos2(part of d0)
+
+    vmull.s16   q10,d2,d0[0]                @// y0 * cos4(part of c0 and c1)
+
+
+    vadd.s32    q7,q10,q3                   @// a0 = c0 + d0(part of r0,r7)
+    vsub.s32    q5,q10,q3                   @// a3 = c0 - d0(part of r3,r4)
+    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
+    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)
+
+    vadd.s32    q10,q7,q12                  @// a0 + b0(part of r0)
+    vsub.s32    q3,q7,q12                   @// a0 - b0(part of r7)
+
+    vadd.s32    q12,q11,q14                 @// a2 + b2(part of r2)
+    vsub.s32    q11,q11,q14                 @// a2 - b2(part of r5)
+
+    vadd.s32    q14,q9,q13                  @// a1 + b1(part of r1)
+    vsub.s32    q9,q9,q13                   @// a1 - b1(part of r6)
+
+    vadd.s32    q13,q5,q15                  @// a3 + b3(part of r3)
+    vsub.s32    q15,q5,q15                  @// a3 - b3(part of r4)
+
+    vqrshrn.s32 d2,q10,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d15,q3,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d3,q12,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d14,q11,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d6,q14,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d11,q9,#shift_stage1_idct   @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d7,q13,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d10,q15,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+last4_cols:
+
+
+    cmp         r12,#0xf0
+    bge         skip_last4cols
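+
+    @ zero_cols check, analogous to zero_rows above: if the upper nibble of
+    @ zero_cols is all 1s, the last four columns are all zero and their
+    @ first-stage computation is skipped
+    @ (sketch: if((zero_cols & 0xf0) == 0xf0) goto skip_last4cols;)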
+
+    vmull.s16   q12,d8,d0[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d8,d0[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d8,d1[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d8,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d9,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d9,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d9,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d9,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+    vmull.s16   q9,d5,d1[2]                 @// y2 * sin2 (q4 is freed by this time)(part of d1)
+    vmull.s16   q4,d5,d0[2]                 @// y2 * cos2(part of d0)
+
+    vmull.s16   q10,d4,d0[0]                @// y0 * cos4(part of c0 and c1)
+    vmull.s16   q11,d12,d0[0]               @// y4 * cos4(part of c0 and c1)
+
+    vmlal.s16   q12,d16,d1[1]               @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+    vmlsl.s16   q13,d16,d0[1]               @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+    vmlal.s16   q14,d16,d1[3]               @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+    vmlal.s16   q15,d16,d0[3]               @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+    vmlsl.s16   q9,d13,d0[2]                @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+    vmlal.s16   q4,d13,d1[2]                @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+    vadd.s32    q6,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+    vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+    vmlal.s16   q12,d17,d1[3]               @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
+    vmlsl.s16   q13,d17,d1[1]               @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
+    vmlal.s16   q14,d17,d0[3]               @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
+    vmlsl.s16   q15,d17,d0[1]               @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
+
+    vadd.s32    q8,q6,q4                    @// a0 = c0 + d0(part of e0,e7)
+    vsub.s32    q6,q6,q4                    @// a3 = c0 - d0(part of e3,e4)
+    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of e2,e5)
+    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of e1,e6)
+
+    vadd.s32    q10,q8,q12                  @// a0 + b0(part of e0)
+    vsub.s32    q4,q8,q12                   @// a0 - b0(part of e7)
+
+    vadd.s32    q12,q11,q14                 @// a2 + b2(part of e2)
+    vsub.s32    q11,q11,q14                 @// a2 - b2(part of e5)
+
+    vadd.s32    q14,q9,q13                  @// a1 + b1(part of e1)
+    vsub.s32    q9,q9,q13                   @// a1 - b1(part of e6)
+
+    vadd.s32    q13,q6,q15                  @// a3 + b3(part of e3)
+    vsub.s32    q15,q6,q15                  @// a3 - b3(part of e4)
+
+    vqrshrn.s32 d4,q10,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d17,q4,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d5,q12,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d16,q11,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d8,q14,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d13,q9,#shift_stage1_idct   @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d9,q13,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    vqrshrn.s32 d12,q15,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+    b           end_skip_last4cols
+
+
+
+skip_last4cols:
+
+
+
+
+
+
+    vtrn.16     q1,q3                       @//[r3,r1],[r2,r0] first quadrant transposing
+
+    vtrn.16     q5,q7                       @//[r7,r5],[r6,r4] third quadrant transposing
+
+
+    vtrn.32     d6,d7                       @//r0,r1,r2,r3 first quadrant transposing continued.....
+    vtrn.32     d2,d3                       @//r0,r1,r2,r3 first quadrant transposing continued.....
+
+    vtrn.32     d10,d11                     @//r4,r5,r6,r7 third quadrant transposing continued.....
+    vtrn.32     d14,d15                     @//r4,r5,r6,r7 third quadrant transposing continued.....
+
+
+    vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d7,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+    vmull.s16   q10,d2,d0[0]                @// y0 * cos4(part of c0 and c1)
+@   vmull.s16   q11,d4,d0[0]                    @// y4 * cos4(part of c0 and c1)
+
+    vmull.s16   q9,d3,d1[2]                 @// y2 * sin2 (q3 is freed by this time)(part of d1)
+    vmull.s16   q3,d3,d0[2]                 @// y2 * cos2(part of d0)
+
+
+
+
+    vsub.s32    q11,q10,q3                  @// a3 = c0 - d0(part of r3,r4)
+    vadd.s32    q2,q10,q3                   @// a0 = c0 + d0(part of r0,r7)
+
+
+    vadd.s32    q1,q2,q12
+
+    vsub.s32    q3,q2,q12
+
+    vadd.s32    q4,q11,q15
+
+    vsub.s32    q12,q11,q15
+
+    vqrshrn.s32 d5,q4,#shift_stage2_idct
+    vqrshrn.s32 d2,q1,#shift_stage2_idct
+    vqrshrn.s32 d9,q3,#shift_stage2_idct
+    vqrshrn.s32 d6,q12,#shift_stage2_idct
+
+    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
+    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)
+
+
+    vadd.s32    q15,q11,q14
+
+    vsub.s32    q12,q11,q14
+
+    vadd.s32    q14,q9,q13
+
+    vsub.s32    q11,q9,q13
+    vqrshrn.s32 d4,q15,#shift_stage2_idct
+    vqrshrn.s32 d7,q12,#shift_stage2_idct
+    vqrshrn.s32 d3,q14,#shift_stage2_idct
+    vqrshrn.s32 d8,q11,#shift_stage2_idct
+
+
+
+
+
+
+
+
+
+
+    vmull.s16   q12,d14,d0[1]               @// y1 * cos1(part of b0)
+
+    vmull.s16   q13,d14,d0[3]               @// y1 * cos3(part of b1)
+    vmull.s16   q14,d14,d1[1]               @// y1 * sin3(part of b2)
+    vmull.s16   q15,d14,d1[3]               @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d15,d0[3]               @// y1 * cos1 + y3 * cos3(part of b0)
+    vtrn.16     d2,d3
+    vmlsl.s16   q13,d15,d1[3]               @// y1 * cos3 - y3 * sin1(part of b1)
+    vtrn.16     d4,d5
+    vmlsl.s16   q14,d15,d0[1]               @// y1 * sin3 - y3 * cos1(part of b2)
+    vtrn.16     d6,d7
+    vmlsl.s16   q15,d15,d1[1]               @// y1 * sin1 - y3 * sin3(part of b3)
+    vtrn.16     d8,d9
+    vmull.s16   q10,d10,d0[0]               @// y0 * cos4(part of c0 and c1)
+    vtrn.32     d2,d4
+
+    vtrn.32     d3,d5
+    vmull.s16   q9,d11,d1[2]                @// y2 * sin2 (q7 is freed by this time)(part of d1)
+    vtrn.32     d6,d8
+    vmull.s16   q7,d11,d0[2]                @// y2 * cos2(part of d0)
+    vtrn.32     d7,d9
+
+
+    add         r4,r2,r8, lsl #1            @ r4 = r2 + pred_strd * 2    => r4 points to 3rd row of pred data
+
+
+    add         r5,r8,r8, lsl #1            @
+
+
+    add         r0,r3,r7, lsl #1            @ r0 points to 3rd row of dest data
+
+
+    add         r10,r7,r7, lsl #1           @
+
+
+    vswp        d3,d6
+
+
+    vswp        d5,d8
+
+
+    vsub.s32    q11,q10,q7                  @// a3 = c0 - d0(part of r3,r4)
+    vadd.s32    q6,q10,q7                   @// a0 = c0 + d0(part of r0,r7)
+
+
+    vadd.s32    q0,q6,q12
+
+
+    vsub.s32    q12,q6,q12
+
+
+    vadd.s32    q6,q11,q15
+
+
+    vsub.s32    q7,q11,q15
+
+    vqrshrn.s32 d10,q0,#shift_stage2_idct
+    vqrshrn.s32 d17,q12,#shift_stage2_idct
+    vqrshrn.s32 d13,q6,#shift_stage2_idct
+    vqrshrn.s32 d14,q7,#shift_stage2_idct
+
+    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
+    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)
+
+
+    vadd.s32    q0,q11,q14
+
+
+    vsub.s32    q12,q11,q14
+
+
+    vadd.s32    q14,q9,q13
+
+
+    vsub.s32    q13,q9,q13
+    vld1.8      d18,[r2],r8
+
+    vqrshrn.s32 d12,q0,#shift_stage2_idct
+    vld1.8      d20,[r2],r5
+
+
+    vqrshrn.s32 d15,q12,#shift_stage2_idct
+    vld1.8      d19,[r2],r8
+
+
+
+
+    vqrshrn.s32 d11,q14,#shift_stage2_idct
+    vld1.8      d22,[r4],r8
+
+
+
+
+    vqrshrn.s32 d16,q13,#shift_stage2_idct
+    vld1.8      d21,[r2],r5
+
+
+    b           pred_buff_addition
+end_skip_last4cols:
+
+
+
+@/* now the idct of columns is done, transpose so that row idct done efficiently(step5) */
+    vtrn.16     q1,q3                       @//[r3,r1],[r2,r0] first quadrant transposing
+    vtrn.16     q2,q4                       @//[r3,r1],[r2,r0] second quadrant transposing
+    vtrn.16     q5,q7                       @//[r7,r5],[r6,r4] third quadrant transposing
+    vtrn.16     q6,q8                       @//[r7,r5],[r6,r4] fourth quadrant transposing
+
+    vtrn.32     d6,d7                       @//r0,r1,r2,r3 first quadrant transposing continued.....
+    vtrn.32     d2,d3                       @//r0,r1,r2,r3 first quadrant transposing continued.....
+    vtrn.32     d4,d5                       @//r0,r1,r2,r3 second quadrant transposing continued.....
+    vtrn.32     d8,d9                       @//r0,r1,r2,r3 second quadrant transposing continued.....
+    vtrn.32     d10,d11                     @//r4,r5,r6,r7 third quadrant transposing continued.....
+    vtrn.32     d14,d15                     @//r4,r5,r6,r7 third quadrant transposing continued.....
+    vtrn.32     d12,d13                     @//r4,r5,r6,r7 fourth quadrant transposing continued.....
+    vtrn.32     d16,d17                     @//r4,r5,r6,r7 fourth quadrant transposing continued.....
+
+    @//step6 operate on first four rows and find their idct
+    @// register usage - storing and idct of rows
+@// cosine constants    -   d0
+@// sine constants      -   d1
+@// element 0 first four    -   d2      -   y0
+@// element 1 first four    -   d6      -   y1
+@// element 2 first four    -   d3      -   y2
+@// element 3 first four    -   d7      -   y3
+@// element 4 first four    -   d4      -   y4
+@// element 5 first four    -   d8      -   y5
+@// element 6 first four    -   d5      -   y6
+@// element 7 first four    -   d9      -   y7
+@// element 0 second four   -   d10     -   y0
+@// element 1 second four   -   d14     -   y1
+@// element 2 second four   -   d11     -   y2
+@// element 3 second four   -   d15     -   y3
+@// element 4 second four   -   d12     -   y4
+@// element 5 second four   -   d16     -   y5
+@// element 6 second four   -   d13     -   y6
+@// element 7 second four   -   d17     -   y7
+
+    @// map between first kernel code seq and current
+@//     d2  ->  d2
+@//     d6  ->  d6
+@//     d3  ->  d3
+@//     d7  ->  d7
+@//     d10 ->  d4
+@//     d14 ->  d8
+@//     d11 ->  d5
+@//     d15 ->  d9
+@//     q3  ->  q3
+@//     q5  ->  q2
+@//     q7  ->  q4
+
+    vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
+    vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
+    vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
+    vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
+    vmlsl.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
+    vmlsl.s16   q14,d7,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
+    vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
+
+    vmull.s16   q10,d2,d0[0]                @// y0 * cos4(part of c0 and c1)
+    vmull.s16   q11,d4,d0[0]                @// y4 * cos4(part of c0 and c1)
+
+    vmull.s16   q9,d3,d1[2]                 @// y2 * sin2 (q3 is freed by this time)(part of d1)
+    vmull.s16   q3,d3,d0[2]                 @// y2 * cos2(part of d0)
+
+
+    vmlal.s16   q12,d8,d1[1]                @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+    vmlsl.s16   q13,d8,d0[1]                @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+    vmlal.s16   q14,d8,d1[3]                @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+    vmlal.s16   q15,d8,d0[3]                @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+    vmlsl.s16   q9,d5,d0[2]                 @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+    vmlal.s16   q3,d5,d1[2]                 @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+    vadd.s32    q1,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+    vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+    vmlal.s16   q12,d9,d1[3]                @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
+    vmlsl.s16   q13,d9,d1[1]                @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
+    vmlal.s16   q14,d9,d0[3]                @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
+    vmlsl.s16   q15,d9,d0[1]                @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
+
+    vsub.s32    q11,q1,q3                   @// a3 = c0 - d0(part of r3,r4)
+    vadd.s32    q2,q1,q3                    @// a0 = c0 + d0(part of r0,r7)
+
+
+    vadd.s32    q1,q2,q12
+
+    vsub.s32    q3,q2,q12
+
+    vadd.s32    q4,q11,q15
+
+    vsub.s32    q12,q11,q15
+
+    vqrshrn.s32 d5,q4,#shift_stage2_idct
+    vqrshrn.s32 d2,q1,#shift_stage2_idct
+    vqrshrn.s32 d9,q3,#shift_stage2_idct
+    vqrshrn.s32 d6,q12,#shift_stage2_idct
+
+    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
+    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)
+
+
+    vadd.s32    q15,q11,q14
+
+    vsub.s32    q12,q11,q14
+
+    vadd.s32    q14,q9,q13
+
+    vsub.s32    q11,q9,q13
+    vqrshrn.s32 d4,q15,#shift_stage2_idct
+    vqrshrn.s32 d7,q12,#shift_stage2_idct
+    vqrshrn.s32 d3,q14,#shift_stage2_idct
+    vqrshrn.s32 d8,q11,#shift_stage2_idct
+
+
+
+
+
+
+
+
+
+
+    vmull.s16   q12,d14,d0[1]               @// y1 * cos1(part of b0)
+
+    vmull.s16   q13,d14,d0[3]               @// y1 * cos3(part of b1)
+    vmull.s16   q14,d14,d1[1]               @// y1 * sin3(part of b2)
+    vmull.s16   q15,d14,d1[3]               @// y1 * sin1(part of b3)
+
+    vmlal.s16   q12,d15,d0[3]               @// y1 * cos1 + y3 * cos3(part of b0)
+    vtrn.16     d2,d3
+    vmlsl.s16   q13,d15,d1[3]               @// y1 * cos3 - y3 * sin1(part of b1)
+    vtrn.16     d4,d5
+    vmlsl.s16   q14,d15,d0[1]               @// y1 * sin3 - y3 * cos1(part of b2)
+    vtrn.16     d6,d7
+    vmlsl.s16   q15,d15,d1[1]               @// y1 * sin1 - y3 * sin3(part of b3)
+    vtrn.16     d8,d9
+    vmull.s16   q10,d10,d0[0]               @// y0 * cos4(part of c0 and c1)
+    vtrn.32     d2,d4
+    vmull.s16   q11,d12,d0[0]               @// y4 * cos4(part of c0 and c1)
+    vtrn.32     d3,d5
+    vmull.s16   q9,d11,d1[2]                @// y2 * sin2 (q7 is freed by this time)(part of d1)
+    vtrn.32     d6,d8
+    vmull.s16   q7,d11,d0[2]                @// y2 * cos2(part of d0)
+    vtrn.32     d7,d9
+    vmlal.s16   q12,d16,d1[1]               @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+
+    add         r4,r2,r8, lsl #1            @ r4 = r2 + pred_strd * 2    => r4 points to 3rd row of pred data
+    vmlsl.s16   q13,d16,d0[1]               @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+
+    add         r5,r8,r8, lsl #1            @
+    vmlal.s16   q14,d16,d1[3]               @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+
+    add         r0,r3,r7, lsl #1            @ r0 points to 3rd row of dest data
+    vmlal.s16   q15,d16,d0[3]               @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+    add         r10,r7,r7, lsl #1           @
+    vmlsl.s16   q9,d13,d0[2]                @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+
+
+    vmlal.s16   q7,d13,d1[2]                @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+    vadd.s32    q6,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+    vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+    vmlal.s16   q12,d17,d1[3]               @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
+    vswp        d3,d6
+    vmlsl.s16   q13,d17,d1[1]               @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
+
+    vswp        d5,d8
+    vmlal.s16   q14,d17,d0[3]               @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
+    vmlsl.s16   q15,d17,d0[1]               @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
+
+    vsub.s32    q11,q6,q7                   @// a3 = c0 - d0(part of r3,r4)
+    vadd.s32    q6,q6,q7                    @// a0 = c0 + d0(part of r0,r7)
+
+
+    vadd.s32    q0,q6,q12
+
+
+    vsub.s32    q12,q6,q12
+
+
+    vadd.s32    q6,q11,q15
+
+
+    vsub.s32    q7,q11,q15
+
+    vqrshrn.s32 d10,q0,#shift_stage2_idct
+    vqrshrn.s32 d17,q12,#shift_stage2_idct
+    vqrshrn.s32 d13,q6,#shift_stage2_idct
+    vqrshrn.s32 d14,q7,#shift_stage2_idct
+
+    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
+    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)
+
+
+    vadd.s32    q0,q11,q14
+
+
+    vsub.s32    q12,q11,q14
+
+
+    vadd.s32    q14,q9,q13
+
+
+    vsub.s32    q13,q9,q13
+    vld1.8      d18,[r2],r8
+
+    vqrshrn.s32 d12,q0,#shift_stage2_idct
+    vld1.8      d20,[r2],r5
+
+
+    vqrshrn.s32 d15,q12,#shift_stage2_idct
+    vld1.8      d19,[r2],r8
+
+
+
+
+    vqrshrn.s32 d11,q14,#shift_stage2_idct
+    vld1.8      d22,[r4],r8
+
+
+
+
+    vqrshrn.s32 d16,q13,#shift_stage2_idct
+    vld1.8      d21,[r2],r5
+
+
+
+
+pred_buff_addition:
+
+
+    vtrn.16     d10,d11
+    vld1.8      d24,[r4],r5
+
+    vtrn.16     d12,d13
+    vld1.8      d23,[r4],r8
+
+    vaddw.u8    q1,q1,d18
+    vld1.8      d25,[r4],r5
+
+    vtrn.16     d14,d15
+    vaddw.u8    q2,q2,d22
+
+    vtrn.16     d16,d17
+    vaddw.u8    q3,q3,d20
+
+    vtrn.32     d10,d12
+    vaddw.u8    q4,q4,d24
+
+    vtrn.32     d11,d13
+    vtrn.32     d14,d16
+    vtrn.32     d15,d17
+
+    vswp        d11,d14
+    vswp        d13,d16
+
+@ row values stored in the q registers:
+
+@q1 :r0
+@q3: r1
+@q2: r2
+@q4: r3
+@q5: r4
+@q7: r5
+@q6: r6
+@q8: r7
+
+
+
+@/// add the prediction data (already loaded above) to the recon output and clip
+
+    vaddw.u8    q5,q5,d19
+    vqmovun.s16 d2,q1
+    vaddw.u8    q7,q7,d21
+    vqmovun.s16 d4,q2
+    vaddw.u8    q6,q6,d23
+    vqmovun.s16 d6,q3
+    vaddw.u8    q8,q8,d25
+    vqmovun.s16 d8,q4
+
+
+
+
+
+
+
+    vst1.8      {d2},[r3],r7
+    vqmovun.s16 d10,q5
+    vst1.8      {d6},[r3],r10
+    vqmovun.s16 d14,q7
+    vst1.8      {d4},[r0],r7
+    vqmovun.s16 d12,q6
+    vst1.8      {d8},[r0],r10
+    vqmovun.s16 d16,q8
+
+
+
+
+
+
+
+    vst1.8      {d10},[r3],r7
+    vst1.8      {d14},[r3],r10
+    vst1.8      {d12},[r0],r7
+    vst1.8      {d16},[r0],r10
+
+
+
+
+    sub         sp,sp,#40
+    ldmfd       sp!,{r4-r12,pc}
+
+
+
+
+
diff --git a/common/arm/ihevc_mem_fns.s b/common/arm/ihevc_mem_fns.s
new file mode 100644
index 0000000..21b5570
--- /dev/null
+++ b/common/arm/ihevc_mem_fns.s
@@ -0,0 +1,279 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ *  ihevc_mem_fns.s
+@ *
+@ * @brief
+@ *  Contains function definitions for memory manipulation
+@ *
+@ * @author
+@ *  Naveen SR
+@ *
+@ * @par List of Functions:
+@ *  - ihevc_memcpy()
+@ *  - ihevc_memset_mul_8()
+@ *  - ihevc_memset_16bit_mul_8()
+@ *
+@ * @remarks
+@ *  None
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*   memcpy of a 1d array
+@*
+@* @par Description:
+@*   Does memcpy of 8bit data from source to destination, for 8, 16 or 32 bytes
+@*
+@* @param[in] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[in] num_bytes
+@*  number of bytes to copy
+@*
+@* @returns
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_memcpy_mul_8(UWORD8 *pu1_dst,
+@                    UWORD8 *pu1_src,
+@                   UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@   r0 => *pu1_dst
+@   r1 => *pu1_src
+@   r2 => num_bytes
+
+.text
+.p2align 2
+
+
+
+
+    .global ihevc_memcpy_mul_8_a9q
+.type ihevc_memcpy_mul_8_a9q, %function
+
+ihevc_memcpy_mul_8_a9q:
+
+LOOP_NEON_MEMCPY_MUL_8:
+    @ Memcpy 8 bytes
+    VLD1.8      d0,[r1]!
+    VST1.8      d0,[r0]!
+
+    SUBS        r2,r2,#8
+    BNE         LOOP_NEON_MEMCPY_MUL_8
+    MOV         PC,LR
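+
+@ Equivalent C (a sketch, assuming num_bytes is a non-zero multiple of 8,
+@ as the 8/16/32 contract above implies):
+@
+@   void ihevc_memcpy_mul_8(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD8 num_bytes)
+@   {
+@       do
+@       {
+@           memcpy(pu1_dst, pu1_src, 8);    /* one 8-byte NEON copy */
+@           pu1_dst += 8;
+@           pu1_src += 8;
+@           num_bytes -= 8;
+@       } while(num_bytes != 0);
+@   }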
+
+
+
+@*******************************************************************************
+@*/
+@void ihevc_memcpy(UWORD8 *pu1_dst,
+@                  UWORD8 *pu1_src,
+@                  UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@   r0 => *pu1_dst
+@   r1 => *pu1_src
+@   r2 => num_bytes
+
+
+
+    .global ihevc_memcpy_a9q
+.type ihevc_memcpy_a9q, %function
+
+ihevc_memcpy_a9q:
+    SUBS        r2,#8
+    BLT         ARM_MEMCPY
+LOOP_NEON_MEMCPY:
+    @ Memcpy 8 bytes
+    VLD1.8      d0,[r1]!
+    VST1.8      d0,[r0]!
+
+    SUBS        r2,#8
+    BGE         LOOP_NEON_MEMCPY
+    CMP         r2,#-8
+    BXEQ        LR
+
+ARM_MEMCPY:
+    ADD         r2,#8
+
+LOOP_ARM_MEMCPY:
+    LDRB        r3,[r1],#1
+    STRB        r3,[r0],#1
+    SUBS        r2,#1
+    BNE         LOOP_ARM_MEMCPY
+    BX          LR
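+
+@ Equivalent C (sketch): 8-byte NEON chunks, then a scalar byte tail; the
+@ CMP/BXEQ above returns early when the length was an exact multiple of 8:
+@
+@   while(num_bytes >= 8)
+@   {
+@       memcpy(pu1_dst, pu1_src, 8);        /* NEON path */
+@       pu1_dst += 8; pu1_src += 8; num_bytes -= 8;
+@   }
+@   while(num_bytes--)
+@       *pu1_dst++ = *pu1_src++;            /* ARM path */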
+
+
+
+
+@void ihevc_memset_mul_8(UWORD8 *pu1_dst,
+@                       UWORD8 value,
+@                       UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@   r0 => *pu1_dst
+@   r1 => value
+@   r2 => num_bytes
+
+.text
+.p2align 2
+
+
+
+    .global ihevc_memset_mul_8_a9q
+.type ihevc_memset_mul_8_a9q, %function
+
+ihevc_memset_mul_8_a9q:
+
+@ Assumptions: num_bytes is either 8, 16 or 32
+    VDUP.8      d0,r1
+LOOP_MEMSET_MUL_8:
+    @ Memset 8 bytes
+    VST1.8      d0,[r0]!
+
+    SUBS        r2,r2,#8
+    BNE         LOOP_MEMSET_MUL_8
+
+    BX          LR
+
+
+
+
+@void ihevc_memset(UWORD8 *pu1_dst,
+@                       UWORD8 value,
+@                       UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@   r0 => *pu1_dst
+@   r1 => value
+@   r2 => num_bytes
+
+
+
+    .global ihevc_memset_a9q
+.type ihevc_memset_a9q, %function
+
+ihevc_memset_a9q:
+    SUBS        r2,#8
+    BLT         ARM_MEMSET
+    VDUP.8      d0,r1
+LOOP_NEON_MEMSET:
+    @ Memset 8 bytes
+    VST1.8      d0,[r0]!
+
+    SUBS        r2,#8
+    BGE         LOOP_NEON_MEMSET
+    CMP         r2,#-8
+    BXEQ        LR
+
+ARM_MEMSET:
+    ADD         r2,#8
+
+LOOP_ARM_MEMSET:
+    STRB        r1,[r0],#1
+    SUBS        r2,#1
+    BNE         LOOP_ARM_MEMSET
+    BX          LR
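+
+@ Equivalent C (sketch), same chunk/tail split as ihevc_memcpy_a9q:
+@
+@   while(num_bytes >= 8)
+@   {
+@       memset(pu1_dst, value, 8);          /* NEON path: duplicated byte */
+@       pu1_dst += 8; num_bytes -= 8;
+@   }
+@   while(num_bytes--)
+@       *pu1_dst++ = value;                 /* ARM path */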
+
+
+
+
+@void ihevc_memset_16bit_mul_8(UWORD16 *pu2_dst,
+@                                   UWORD16 value,
+@                                   UWORD8 num_words)
+@**************Variables Vs Registers*************************
+@   r0 => *pu2_dst
+@   r1 => value
+@   r2 => num_words
+
+.text
+.p2align 2
+
+
+
+    .global ihevc_memset_16bit_mul_8_a9q
+.type ihevc_memset_16bit_mul_8_a9q, %function
+
+ihevc_memset_16bit_mul_8_a9q:
+
+@ Assumptions: num_words is either 8, 16 or 32
+
+    @ Memset 8 words
+    VDUP.16     d0,r1
+LOOP_MEMSET_16BIT_MUL_8:
+    VST1.16     d0,[r0]!
+    VST1.16     d0,[r0]!
+
+    SUBS        r2,r2,#8
+    BNE         LOOP_MEMSET_16BIT_MUL_8
+
+    BX          LR
+
+
+
+
+@void ihevc_memset_16bit(UWORD16 *pu2_dst,
+@                       UWORD16 value,
+@                       UWORD8 num_words)
+@**************Variables Vs Registers*************************
+@   r0 => *pu2_dst
+@   r1 => value
+@   r2 => num_words
+
+
+
+    .global ihevc_memset_16bit_a9q
+.type ihevc_memset_16bit_a9q, %function
+
+ihevc_memset_16bit_a9q:
+    SUBS        r2,#8
+    BLT         ARM_MEMSET_16BIT
+    VDUP.16     d0,r1
+LOOP_NEON_MEMSET_16BIT:
+    @ Memset 8 words
+    VST1.16     d0,[r0]!
+    VST1.16     d0,[r0]!
+
+    SUBS        r2,#8
+    BGE         LOOP_NEON_MEMSET_16BIT
+    CMP         r2,#-8
+    BXEQ        LR
+
+ARM_MEMSET_16BIT:
+    ADD         r2,#8
+
+LOOP_ARM_MEMSET_16BIT:
+    STRH        r1,[r0],#2
+    SUBS        r2,#1
+    BNE         LOOP_ARM_MEMSET_16BIT
+    BX          LR
+
+
+
+
+    .section .note.GNU-stack,"",%progbits
+
diff --git a/common/arm/ihevc_padding.s b/common/arm/ihevc_padding.s
new file mode 100644
index 0000000..08d1f36
--- /dev/null
+++ b/common/arm/ihevc_padding.s
@@ -0,0 +1,531 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ *  ihevc_padding.s
+@ *
+@ * @brief
+@ *  contains function definitions for padding
+@ *
+@ * @author
+@ *  naveen sr
+@ *
+@ * @par list of functions:
+@ *  - ihevc_pad_left_luma()
+@ *  - ihevc_pad_left_chroma()
+@ *  - ihevc_pad_right_luma()
+@ *  - ihevc_pad_right_chroma()
+@ *
+@ * @remarks
+@ *  none
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*   padding (luma block) at the left of a 2d array
+@*
+@* @par description:
+@*   the left column of a 2d array is replicated for pad_size times at the left
+@*
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @param[in] pad_size
+@*  integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@.if pad_left_luma == c
+@void ihevc_pad_left_luma(uword8 *pu1_src,
+@                        word32 src_strd,
+@                        word32 ht,
+@                        word32 pad_size)
+@**************variables vs registers*************************
+@   r0 => *pu1_src
+@   r1 => src_strd
+@   r2 => ht
+@   r3 => pad_size
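+
+@ Equivalent C (a sketch; ht is assumed to be a multiple of 4 and pad_size
+@ 80, matching the unrolled 5 x 16-byte stores below):
+@
+@   for(row = 0; row < ht; row++)
+@   {
+@       memset(pu1_src - pad_size, pu1_src[0], pad_size);
+@       pu1_src += src_strd;
+@   }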
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_pad_left_luma_a9q
+
+.type ihevc_pad_left_luma_a9q, %function
+
+ihevc_pad_left_luma_a9q:
+
+    stmfd       sp!, {r4-r11,lr}            @stack stores the values of the arguments
+
+loop_start_luma_left:
+    @ pad size is assumed to be pad_left = 80
+    sub         r4,r0,r3
+
+    ldrb        r8,[r0]
+    add         r0,r1
+    ldrb        r9,[r0]
+    add         r0,r1
+    ldrb        r10,[r0]
+    add         r0,r1
+    ldrb        r11,[r0]
+    add         r0,r1
+
+    vdup.u8     q0,r8
+    vdup.u8     q1,r9
+    vdup.u8     q2,r10
+    vdup.u8     q3,r11
+
+    add         r5,r4,r1
+
+    vst1.8      {d0,d1},[r4]!               @128/8 = 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]                @ 16 bytes store
+
+    add         r6,r5,r1
+
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]                @128/8 = 16 bytes store
+
+    add         r7,r6,r1
+
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]                @128/8 = 16 bytes store
+
+    subs        r2,#4
+
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+
+    @ total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
+
+    bne         loop_start_luma_left
+
+    ldmfd       sp!,{r4-r11,pc}             @reload the registers from sp
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*   padding (chroma block) at the left of a 2d array
+@*
+@* @par description:
+@*   the left column of a 2d array is replicated for pad_size times at the left
+@*
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array (each colour component)
+@*
+@* @param[in] pad_size
+@*  integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@.if pad_left_chroma == c
+@void ihevc_pad_left_chroma(uword8 *pu1_src,
+@                            word32 src_strd,
+@                            word32 ht,
+@                            word32 pad_size)
+@{
+@   r0 => *pu1_src
+@   r1 => src_strd
+@   r2 => ht
+@   r3 => pad_size
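+
+@ Equivalent C (a sketch): the chroma plane is UV interleaved, so the first
+@ U/V pair of each row is replicated across the pad_size bytes to its left
+@ (pad_size/2 pairs):
+@
+@   for(row = 0; row < ht; row++)
+@   {
+@       UWORD16 u2_uv_pair = *(UWORD16 *)pu1_src;
+@       for(col = -pad_size; col < 0; col += 2)
+@           *(UWORD16 *)(pu1_src + col) = u2_uv_pair;
+@       pu1_src += src_strd;
+@   }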
+
+
+
+.globl ihevc_pad_left_chroma_a9q
+
+.type ihevc_pad_left_chroma_a9q, %function
+
+ihevc_pad_left_chroma_a9q:
+
+    stmfd       sp!, {r4-r11, lr}           @stack stores the values of the arguments
+
+loop_start_chroma_left:
+    @ pad size is assumed to be pad_left = 80
+    sub         r4,r0,r3
+
+    ldrh        r8,[r0]
+    add         r0,r1
+    ldrh        r9,[r0]
+    add         r0,r1
+    ldrh        r10,[r0]
+    add         r0,r1
+    ldrh        r11,[r0]
+    add         r0,r1
+
+    vdup.u16    q0,r8
+    vdup.u16    q1,r9
+    vdup.u16    q2,r10
+    vdup.u16    q3,r11
+
+    add         r5,r4,r1
+
+    vst1.8      {d0,d1},[r4]!               @128/8 = 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]                @ 16 bytes store
+
+    add         r6,r5,r1
+
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]                @128/8 = 16 bytes store
+
+    add         r7,r6,r1
+
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]                @128/8 = 16 bytes store
+
+    subs        r2,#4
+
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+
+    @ total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
+
+    bne         loop_start_chroma_left
+
+    ldmfd       sp!,{r4-r11,pc}             @reload the registers from sp
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* padding (luma block) at the right of a 2d array
+@*
+@* @par description:
+@* the right column of a 2d array is replicated for pad_size times at the right
+@*
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @param[in] pad_size
+@*  integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@.if pad_right_luma == c
+@void ihevc_pad_right_luma(uword8 *pu1_src,
+@                        word32 src_strd,
+@                        word32 ht,
+@                        word32 pad_size)
+@{
+@    word32 row;
+@
+@    for(row = 0; row < ht; row++)
+@    {
+@        memset(pu1_src, *(pu1_src - 1), pad_size);
+@
+@        pu1_src += src_strd;
+@    }
+@}
+@
+@   r0 => *pu1_src
+@   r1 => src_strd
+@   r2 => ht
+@   r3 => pad_size
+
+
+
+.globl ihevc_pad_right_luma_a9q
+
+.type ihevc_pad_right_luma_a9q, %function
+
+ihevc_pad_right_luma_a9q:
+
+    stmfd       sp!, {r4-r11, lr}           @stack stores the values of the arguments
+
+loop_start_luma_right:
+    @ pad size is assumed to be pad_left = 80
+    mov         r4,r0
+
+    ldrb        r8,[r0, #-1]
+    add         r0,r1
+    ldrb        r9,[r0, #-1]
+    add         r0,r1
+    ldrb        r10,[r0, #-1]
+    add         r0,r1
+    ldrb        r11,[r0, #-1]
+    add         r0,r1
+
+    add         r5,r4,r1
+    add         r6,r5,r1
+    add         r7,r6,r1
+
+    vdup.u8     q0,r8
+    vdup.u8     q1,r9
+    vdup.u8     q2,r10
+    vdup.u8     q3,r11
+
+    vst1.8      {d0,d1},[r4]!               @128/8 = 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]                @ 16 bytes store
+
+
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]                @128/8 = 16 bytes store
+
+    subs        r2,#4
+
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]                @128/8 = 16 bytes store
+
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]                @128/8 = 16 bytes store
+
+
+    @ total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
+
+
+    bne         loop_start_luma_right
+
+    ldmfd       sp!,{r4-r11,pc}             @reload the registers from sp
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* padding (chroma block) at the right of a 2d array
+@*
+@* @par description:
+@* the right column of a 2d array is replicated for pad_size times at the right
+@*
+@*
+@* @param[in] pu1_src
+@*  uword8 pointer to the source
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array (each colour component)
+@*
+@* @param[in] pad_size
+@*  integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@.if pad_right_chroma == c
+@void ihevc_pad_right_chroma(uword8 *pu1_src,
+@                        word32 src_strd,
+@                        word32 ht,
+@                        word32 pad_size)
+@   r0 => *pu1_src
+@   r1 => src_strd
+@   r2 => ht
+@   r3 => pad_size
+
+
+
+.globl ihevc_pad_right_chroma_a9q
+
+.type ihevc_pad_right_chroma_a9q, %function
+
+ihevc_pad_right_chroma_a9q:
+
+    stmfd       sp!, {r4-r11, lr}           @stack stores the values of the arguments
+
+loop_start_chroma_right:
+    @ pad size is assumed to be pad_left = 80
+    mov         r4,r0
+
+    ldrh        r8,[r0, #-2]
+    add         r0,r1
+    ldrh        r9,[r0, #-2]
+    add         r0,r1
+    ldrh        r10,[r0, #-2]
+    add         r0,r1
+    ldrh        r11,[r0, #-2]
+    add         r0,r1
+
+    vdup.u16    q0,r8
+    vdup.u16    q1,r9
+    vdup.u16    q2,r10
+    vdup.u16    q3,r11
+
+    add         r5,r4,r1
+
+    vst1.8      {d0,d1},[r4]!               @128/8 = 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]!               @ 16 bytes store
+    vst1.8      {d0,d1},[r4]                @ 16 bytes store
+
+    add         r6,r5,r1
+
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
+    vst1.8      {d2,d3},[r5]                @128/8 = 16 bytes store
+
+    add         r7,r6,r1
+
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
+    vst1.8      {d4,d5},[r6]                @128/8 = 16 bytes store
+
+    subs        r2,#4
+
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
+    vst1.8      {d6,d7},[r7]                @128/8 = 16 bytes store
+
+    @ total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
+
+    bne         loop_start_chroma_right
+
+    ldmfd       sp!,{r4-r11,pc}             @reload the registers from sp
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_platform_macros.h b/common/arm/ihevc_platform_macros.h
new file mode 100644
index 0000000..72ef0c3
--- /dev/null
+++ b/common/arm/ihevc_platform_macros.h
@@ -0,0 +1,149 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_platform_macros.h
+*
+* @brief
+*  Platform specific Macro definitions used in the codec
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_PLATFORM_MACROS_H_
+#define _IHEVC_PLATFORM_MACROS_H_
+
+#ifndef  ARMV8
+static __inline WORD32 CLIP_U8(WORD32 x)
+{
+    asm("usat %0, #8, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+static __inline WORD32 CLIP_S8(WORD32 x)
+{
+    asm("ssat %0, #8, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+static __inline WORD32 CLIP_U10(WORD32 x)
+{
+    asm("usat %0, #10, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+static __inline WORD32 CLIP_S10(WORD32 x)
+{
+    asm("ssat %0, #10, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+static __inline WORD32 CLIP_U12(WORD32 x)
+{
+    asm("usat %0, #12, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+static __inline WORD32 CLIP_S12(WORD32 x)
+{
+    asm("ssat %0, #12, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+static __inline WORD32 CLIP_U16(WORD32 x)
+{
+    asm("usat %0, #16, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+static __inline WORD32 CLIP_S16(WORD32 x)
+{
+    asm("ssat %0, #16, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+
+static __inline UWORD32 ITT_BIG_ENDIAN(UWORD32 x)
+{
+    asm("rev %0, %1" : "=r"(x) : "r"(x));
+    return x;
+}
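+/* e.g. ITT_BIG_ENDIAN(0x12345678) == 0x78563412 : rev reverses byte order */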
+#else
+
+#define CLIP_U8(x) CLIP3((x), 0,     255)
+#define CLIP_S8(x) CLIP3((x), -128,  127)
+
+#define CLIP_U10(x) CLIP3((x), 0,     1023)
+#define CLIP_S10(x) CLIP3((x), -512,  511)
+
+#define CLIP_U12(x) CLIP3((x), 0,     4095)
+#define CLIP_S12(x) CLIP3((x), -2048,  2047)
+
+#define CLIP_U16(x) CLIP3((x), 0,        65535)
+#define CLIP_S16(x) CLIP3((x), -32768,   32767)
+
+#define ITT_BIG_ENDIAN(x)   ((((x) & 0x000000ff) << 24)    |   \
+                             (((x) & 0x0000ff00) << 8)     |   \
+                             (((x) & 0x00ff0000) >> 8)     |   \
+                             ((UWORD32)(x) >> 24))
+#endif
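+/* The ARMV8 path above assumes a generic CLIP3(x, min, max) macro is in
+ * scope; in this tree it is expected to come from the common macro
+ * header (ihevc_macros.h). */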
+
+#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0)
+#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0)
+
+#define SHR_NEG(val,shift)  (((shift) > 0) ? ((val) >> (shift)) : ((val) << (-(shift))))
+#define SHL_NEG(val,shift)  (((shift) < 0) ? ((val) >> (-(shift))) : ((val) << (shift)))
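+/* Example (a reading aid, not used by the code): SHR_NEG(x, -2) expands
+ * to (x << 2) and SHL_NEG(x, -2) to (x >> 2); a negative shift count
+ * simply flips the shift direction. */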
+
+#define INLINE inline
+
+static INLINE UWORD32 CLZ(UWORD32 u4_word)
+{
+    if(u4_word)
+        return (__builtin_clz(u4_word));
+    else
+        return 32;
+}
+static INLINE UWORD32 CTZ(UWORD32 u4_word)
+{
+    if(0 == u4_word)
+        return 31;
+    else
+    {
+        unsigned int index;
+        index = __builtin_ctz(u4_word);
+        return (UWORD32)index;
+    }
+}
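+/* Note: for a zero input, CLZ() returns 32 and CTZ() returns 31 as
+ * defined sentinels, since __builtin_clz/__builtin_ctz are undefined
+ * when the argument is 0. */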
+
+
+
+
+#define NOP(nop_cnt)    {UWORD32 nop_i; for (nop_i = 0; nop_i < (nop_cnt); nop_i++);}
+
+
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* _IHEVC_PLATFORM_MACROS_H_ */
diff --git a/common/arm/ihevc_sao_band_offset_chroma.s b/common/arm/ihevc_sao_band_offset_chroma.s
new file mode 100644
index 0000000..32e149d
--- /dev/null
+++ b/common/arm/ihevc_sao_band_offset_chroma.s
@@ -0,0 +1,393 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_band_offset_chroma.s
+@*
+@* @brief
+@*  Contains function definitions for SAO band offset for chroma.
+@* Functions are coded using NEON instructions and can be compiled using
+@* ARM RVCT
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
+@                           WORD32 src_strd,
+@                           UWORD8 *pu1_src_left,
+@                           UWORD8 *pu1_src_top,
+@                           UWORD8 *pu1_src_top_left,
+@                           WORD32 sao_band_pos_u,
+@                           WORD32 sao_band_pos_v,
+@                           WORD8 *pi1_sao_offset_u,
+@                           WORD8 *pi1_sao_offset_v,
+@                           WORD32 wd,
+@                           WORD32 ht)
+@
+@**************Variables Vs Registers*****************************************
+@r0 =>  *pu1_src
+@r1 =>  src_strd
+@r2 =>  *pu1_src_left
+@r3 =>  *pu1_src_top
+@r4 =>  *pu1_src_top_left
+@r5 =>  sao_band_pos_u
+@r6 =>  sao_band_pos_v
+@r7 =>  *pi1_sao_offset_u
+@r8 =>  *pi1_sao_offset_v
+@r9 =>  wd
+@r10=>  ht
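+@
+@ Band offset SAO: each 8-bit sample falls into one of 32 bands of 8
+@ values (band index = sample >> 3). An offset is applied only in the
+@ four consecutive bands starting at sao_band_pos_u/_v; the 32-entry
+@ tables built below are applied with VTBX, which leaves samples outside
+@ those bands unchanged.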
+
+.text
+.p2align 2
+
+.extern gu1_table_band_idx
+.globl ihevc_sao_band_offset_chroma_a9q
+
+gu1_table_band_idx_addr_1:
+.long gu1_table_band_idx - ulbl1 - 8
+
+gu1_table_band_idx_addr_2:
+.long gu1_table_band_idx - ulbl2 - 8
+
+ihevc_sao_band_offset_chroma_a9q:
+
+    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    LDR         r4,[sp,#40]                 @Loads pu1_src_top_left
+    LDR         r10,[sp,#64]                @Loads ht
+
+    LDR         r9,[sp,#60]                 @Loads wd
+    MOV         r11,r10                     @Move ht to r11 for loop counter
+
+    ADD         r12,r0,r9                   @pu1_src[row * src_strd + (wd)]
+    LDR         r14, gu1_table_band_idx_addr_1
+ulbl1:
+    add         r14,r14,pc
+    SUB         r12,r12,#2                  @wd-2
+
+SRC_LEFT_LOOP:
+    LDRH        r5,[r12],r1                 @Load the value
+    SUBS        r11,r11,#1                  @Decrement the loop counter
+    STRH        r5,[r2],#2                  @Store the value in pu1_src_left pointer
+    BNE         SRC_LEFT_LOOP
+
+    LDR         r5,[sp,#44]                 @Loads sao_band_pos_u
+    VLD1.8      D1,[r14]!                   @band_table_u.val[0]
+    ADD         r12,r3,r9                   @pu1_src_top[wd]
+
+    LDRH        r11,[r12,#-2]
+    VLD1.8      D2,[r14]!                   @band_table_u.val[1]
+    LSL         r6,r5,#3                    @sao_band_pos_u
+
+    STRH        r11,[r4]                    @store to pu1_src_top_left[0]
+    VLD1.8      D3,[r14]!                   @band_table_u.val[2]
+    LDR         r7,[sp,#52]                 @Loads pi1_sao_offset_u
+
+    SUB         r4,r10,#1                   @ht-1
+    VDUP.8      D31,r6                      @band_pos_u
+    MUL         r4,r4,r1                    @ht-1 * src_strd
+
+    ADD         r4,r4,r0                    @pu1_src[(ht - 1) * src_strd]
+    VLD1.8      D4,[r14]!                   @band_table_u.val[3]
+    MOV         r11,r9                      @Move wd to r11 for loop counter
+
+SRC_TOP_LOOP:                               @wd is always multiple of 8
+    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
+    SUBS        r11,r11,#8                  @Decrement the loop counter by 8
+    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
+    BNE         SRC_TOP_LOOP
+
+    VLD1.8      D30,[r7]                    @pi1_sao_offset_u load
+    VADD.I8     D5,D1,D31                   @band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u)
+
+    VDUP.8      D29,D30[1]                  @vdup_n_u8(pi1_sao_offset_u[1])
+    VADD.I8     D6,D2,D31                   @band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u)
+
+    VDUP.8      D28,D30[2]                  @vdup_n_u8(pi1_sao_offset_u[2])
+    VADD.I8     D7,D3,D31                   @band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u)
+
+    VDUP.8      D27,D30[3]                  @vdup_n_u8(pi1_sao_offset_u[3])
+    VADD.I8     D8,D4,D31                   @band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u)
+
+    CMP         r5,#28
+    VDUP.8      D26,D30[4]                  @vdup_n_u8(pi1_sao_offset_u[4])
+    LDR         r14, gu1_table_band_idx_addr_2
+ulbl2:
+    add         r14,r14,pc
+
+    VMOV.I8     D30,#16                     @vdup_n_u8(16)
+    VADD.I8     D1,D5,D29                   @band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1]))
+
+    VLD1.8      D9,[r14]!                   @band_table_v.val[0]
+    VADD.I8     D2,D6,D28                   @band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2]))
+
+    VLD1.8      D10,[r14]!                  @band_table_v.val[1]
+    VADD.I8     D3,D7,D27                   @band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))
+
+    LDR         r6,[sp,#48]                 @Loads sao_band_pos_v
+    VADD.I8     D4,D8,D26                   @band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
+    LSL         r11,r6,#3                   @sao_band_pos_v
+
+    BLT         SAO_BAND_POS_U_0
+
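+    @ For sao_band_pos >= 28 the four-band window wraps past band 31, so
+    @ the cases below patch the wrapped table entries (VCLE against 16,
+    @ then VORR/VAND) to keep the VTBX lookup consistent; band position 0
+    @ gets the same fix-up on its first entry.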
+SAO_BAND_POS_U_28:                          @case 28
+    VCLE.U8     D13,D4,D30                  @vcle_u8(band_table.val[3], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_U_29
+
+    VORR.U8     D4,D4,D13                   @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK_U
+
+SAO_BAND_POS_U_29:                          @case 29
+    CMP         r5,#29
+
+    VCLE.U8     D14,D3,D30                  @vcle_u8(band_table.val[2], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_U_30
+    VORR.U8     D3,D3,D14                   @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+    VAND.U8     D4,D4,D13                   @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK_U
+
+SAO_BAND_POS_U_30:                          @case 30
+    CMP         r5,#30
+
+    VCLE.U8     D15,D2,D30                  @vcle_u8(band_table.val[1], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_U_31
+    VORR.U8     D2,D2,D15                   @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+    VAND.U8     D3,D3,D14                   @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+
+SAO_BAND_POS_U_31:                          @case 31
+    CMP         r5,#31
+    BNE         SWITCH_BREAK_U
+
+    VCLE.U8     D16,D1,D30                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
+    VORR.U8     D1,D1,D16                   @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+    VAND.U8     D2,D2,D15                   @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+    B           SWITCH_BREAK_U
+
+SAO_BAND_POS_U_0:
+    CMP         r5,#0                       @case 0
+    BNE         SWITCH_BREAK_U
+
+    VCLE.U8     D16,D1,D30                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
+    VAND.U8     D1,D1,D16                   @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK_U:
+    VDUP.8      D30,r11                     @band_pos_v
+    LDR         r8,[sp,#56]                 @Loads pi1_sao_offset_v
+
+    VLD1.8      D11,[r14]!                  @band_table_v.val[2]
+    VADD.I8     D13,D9,D30                  @band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)
+
+    VLD1.8      D12,[r14]!                  @band_table_v.val[3]
+    VADD.I8     D14,D10,D30                 @band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v)
+
+    VLD1.8      D25,[r8]                    @pi1_sao_offset_v load
+    VADD.I8     D15,D11,D30                 @band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)
+
+    VDUP.8      D29,D25[1]                  @vdup_n_u8(pi1_sao_offset_v[1])
+    VADD.I8     D16,D12,D30                 @band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)
+
+    VDUP.8      D28,D25[2]                  @vdup_n_u8(pi1_sao_offset_v[2])
+    VADD.I8     D9,D13,D29                  @band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))
+
+    VDUP.8      D27,D25[3]                  @vdup_n_u8(pi1_sao_offset_v[3])
+    VADD.I8     D10,D14,D28                 @band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))
+
+    VDUP.8      D26,D25[4]                  @vdup_n_u8(pi1_sao_offset_v[4])
+    VADD.I8     D11,D15,D27                 @band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))
+
+    VMOV.I8     D29,#16                     @vdup_n_u8(16)
+    VADD.I8     D12,D16,D26                 @band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4]))
+    AND         r12,r9,#0xf
+
+    CMP         r6,#28
+    BLT         SAO_BAND_POS_V_0
+
+SAO_BAND_POS_V_28:                          @case 28
+    VCLE.U8     D17,D12,D29                 @vcle_u8(band_table.val[3], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_V_29
+    VORR.U8     D12,D12,D17                 @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK_V
+
+SAO_BAND_POS_V_29:                          @case 29
+    CMP         r6,#29
+
+    VCLE.U8     D18,D11,D29                 @vcle_u8(band_table.val[2], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_V_30
+    VORR.U8     D11,D11,D18                 @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+    VAND.U8     D12,D12,D17                 @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK_V
+
+SAO_BAND_POS_V_30:                          @case 30
+    CMP         r6,#30
+
+    VCLE.U8     D19,D10,D29                 @vcle_u8(band_table.val[1], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_V_31
+    VORR.U8     D10,D10,D19                 @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+    VAND.U8     D11,D11,D18                 @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+    B           SWITCH_BREAK_V
+
+SAO_BAND_POS_V_31:                          @case 31
+    CMP         r6,#31
+    BNE         SWITCH_BREAK_V
+
+    VCLE.U8     D20,D9,D29                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
+    VORR.U8     D9,D9,D20                   @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+    VAND.U8     D10,D10,D19                 @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+    B           SWITCH_BREAK_V
+
+SAO_BAND_POS_V_0:
+    CMP         r6,#0                       @case 0
+    BNE         SWITCH_BREAK_V
+
+    VCLE.U8     D20,D9,D29                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
+    VAND.U8     D9,D9,D20                   @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK_V:
+    CMP         r9,#16
+    MOV         r4,r0                       @pu1_src_cpy
+    BLT         WIDTH_RESIDUE
+
+WIDTH_LOOP:                                 @Width is assumed to be a multiple of 16
+    MOV         r4,r0                       @pu1_src_cpy
+    MOV         r11,r10                     @move ht
+    ADD         r5,r4,r1
+
+HEIGHT_LOOP:                                @unrolled for 4 rows
+    ADD         r6,r5,r1
+    VLD2.8      {D5,D6},[r4]                @vld1q_u8(pu1_src_cpy)
+    ADD         r7,r6,r1
+
+    VLD2.8      {D13,D14},[r5]              @vld1q_u8(pu1_src_cpy)
+    VSUB.I8     D7,D5,D31                   @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    VLD2.8      {D17,D18},[r6]              @vld1q_u8(pu1_src_cpy)
+    VSUB.I8     D8,D6,D30                   @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    VLD2.8      {D21,D22},[r7]              @vld1q_u8(pu1_src_cpy)
+    VSUB.I8     D15,D13,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    VTBX.8      D5,{D1-D4},D7               @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    VSUB.I8     D16,D14,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    VTBX.8      D6,{D9-D12},D8              @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    VSUB.I8     D19,D17,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    VTBX.8      D13,{D1-D4},D15             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    VSUB.I8     D20,D18,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    VTBX.8      D14,{D9-D12},D16            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    VSUB.I8     D23,D21,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    VST2.8      {D5,D6},[r4]                @vst1q_u8(pu1_src_cpy, au1_cur_row)
+    VSUB.I8     D24,D22,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    SUBS        r11,r11,#4                  @Decrement the ht loop count by 4
+    VTBX.8      D17,{D1-D4},D19             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+
+    VST2.8      {D13,D14},[r5]              @vst1q_u8(pu1_src_cpy, au1_cur_row)
+
+    VTBX.8      D18,{D9-D12},D20            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    VTBX.8      D21,{D1-D4},D23             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    VTBX.8      D22,{D9-D12},D24            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+
+    VST2.8      {D17,D18},[r6],r1           @vst1q_u8(pu1_src_cpy, au1_cur_row)
+
+    ADD         r4,r6,r1
+    VST2.8      {D21,D22},[r7]              @vst1q_u8(pu1_src_cpy, au1_cur_row)
+    ADD         r5,r4,r1
+
+    BNE         HEIGHT_LOOP
+
+    SUB         r9,r9,#16                   @Decrement the width loop by 16
+    ADD         r0,r0,#16
+    CMP         r9,#8
+    BGT         WIDTH_LOOP
+    BLT         END_LOOP
+    MOV         r4,r0                       @pu1_src_cpy
+
+WIDTH_RESIDUE:                              @If width is not a multiple of 16
+    ADD         r5,r4,r1
+    VLD2.8      {D5,D6},[r4]                @vld1q_u8(pu1_src_cpy)
+    ADD         r6,r5,r1
+
+    ADD         r7,r6,r1
+    VLD2.8      {D13,D14},[r5]              @vld1q_u8(pu1_src_cpy)
+    VSUB.I8     D7,D5,D31                   @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    VLD2.8      {D17,D18},[r6]              @vld1q_u8(pu1_src_cpy)
+    VSUB.I8     D8,D6,D30                   @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    VTBX.8      D5,{D1-D4},D7               @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    VSUB.I8     D15,D13,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    VTBX.8      D6,{D9-D12},D8              @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    VSUB.I8     D16,D14,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    VLD2.8      {D21,D22},[r7]              @vld1q_u8(pu1_src_cpy)
+    VSUB.I8     D19,D17,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    VTBX.8      D13,{D1-D4},D15             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    VSUB.I8     D20,D18,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    VTBX.8      D14,{D9-D12},D16            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    VZIP.8      D5,D6
+
+    VTBX.8      D17,{D1-D4},D19             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    VSUB.I8     D23,D21,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    VST1.8      {D5},[r4]                   @vst1q_u8(pu1_src_cpy, au1_cur_row)
+    VZIP.8      D13,D14
+
+    VTBX.8      D18,{D9-D12},D20            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    VSUB.I8     D24,D22,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    VST1.8      {D13},[r5]                  @vst1q_u8(pu1_src_cpy, au1_cur_row)
+    SUBS        r10,r10,#4                  @Decrement the ht loop count by 4
+
+    VTBX.8      D21,{D1-D4},D23             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    VZIP.8      D17,D18
+
+    VTBX.8      D22,{D9-D12},D24            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    VST1.8      {D17},[r6],r1               @vst1q_u8(pu1_src_cpy, au1_cur_row)
+    VZIP.8      D21,D22
+
+    ADD         r4,r6,r1
+    VST1.8      {D21},[r7]                  @vst1q_u8(pu1_src_cpy, au1_cur_row)
+    ADD         r5,r4,r1
+
+    BNE         WIDTH_RESIDUE
+
+END_LOOP:
+    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
+
+
+
diff --git a/common/arm/ihevc_sao_band_offset_luma.s b/common/arm/ihevc_sao_band_offset_luma.s
new file mode 100644
index 0000000..3875377
--- /dev/null
+++ b/common/arm/ihevc_sao_band_offset_luma.s
@@ -0,0 +1,233 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_band_offset_luma.s
+@*
+@* @brief
+@*  Contains function definitions for SAO band offset for luma.
+@* Functions are coded using NEON instructions and can be compiled using
+@* ARM RVCT
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
+@                           WORD32 src_strd,
+@                           UWORD8 *pu1_src_left,
+@                           UWORD8 *pu1_src_top,
+@                           UWORD8 *pu1_src_top_left,
+@                           WORD32 sao_band_pos,
+@                           WORD8 *pi1_sao_offset,
+@                           WORD32 wd,
+@                           WORD32 ht)
+@
+@**************Variables Vs Registers*****************************************
+@r0 =>  *pu1_src
+@r1 =>  src_strd
+@r2 =>  *pu1_src_left
+@r3 =>  *pu1_src_top
+@r4 =>  *pu1_src_top_left
+@r5 =>  sao_band_pos
+@r6 =>  *pi1_sao_offset
+@r7 =>  wd
+@r8 =>  ht
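+@
+@ Band offset SAO: each 8-bit sample falls into one of 32 bands of 8
+@ values (band index = sample >> 3). An offset is applied only in the
+@ four consecutive bands starting at sao_band_pos; the 32-entry table
+@ built below is applied with VTBX, leaving all other samples unchanged.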
+
+.text
+.p2align 2
+
+.extern gu1_table_band_idx
+.globl ihevc_sao_band_offset_luma_a9q
+
+gu1_table_band_idx_addr:
+.long gu1_table_band_idx - ulbl1 - 8
+
+ihevc_sao_band_offset_luma_a9q:
+
+    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    LDR         r8,[sp,#56]                 @Loads ht
+    LDR         r7,[sp,#52]                 @Loads wd
+
+    MOV         r9,r8                       @Move the ht to r9 for loop counter
+    LDR         r5,[sp,#44]                 @Loads sao_band_pos
+    ADD         r10,r0,r7                   @pu1_src[row * src_strd + (wd)]
+
+    LDR         r4,[sp,#40]                 @Loads pu1_src_top_left
+    SUB         r10,r10,#1                  @wd-1
+    LDR         r14, gu1_table_band_idx_addr
+ulbl1:
+    add         r14,r14,pc
+
+SRC_LEFT_LOOP:
+    LDRB        r11,[r10],r1                @Load the value
+    SUBS        r9,r9,#1                    @Decrement the loop counter
+    STRB        r11,[r2],#1                 @Store the value in pu1_src_left pointer
+    BNE         SRC_LEFT_LOOP
+
+    ADD         r9,r3,r7                    @pu1_src_top[wd]
+    VLD1.8      D1,[r14]!                   @band_table.val[0]
+    LDR         r6,[sp,#48]                 @Loads pi1_sao_offset
+
+    LSL         r11,r5,#3
+    VLD1.8      D2,[r14]!                   @band_table.val[1]
+
+    LDRB        r10,[r9,#-1]
+    VDUP.8      D31,r11                     @band_pos
+    SUB         r12,r8,#1                   @ht-1
+
+    STRB        r10,[r4]                    @store to pu1_src_top_left[0]
+    VLD1.8      D3,[r14]!                   @band_table.val[2]
+    MUL         r12,r12,r1                  @ht-1 * src_strd
+
+    ADD         r4,r12,r0                   @pu1_src[(ht - 1) * src_strd]
+    VLD1.8      D4,[r14]!                   @band_table.val[3]
+    MOV         r9,r7                       @Move the wd to r9 for loop counter
+
+SRC_TOP_LOOP:                               @wd is always multiple of 8
+    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
+    SUBS        r9,r9,#8                    @Decrement the loop counter by 8
+    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
+    BNE         SRC_TOP_LOOP
+
+    VLD1.8      D30,[r6]                    @pi1_sao_offset load
+    VADD.I8     D5,D1,D31                   @band_table.val[0] = vadd_u8(band_table.val[0], band_pos)
+
+    VDUP.8      D29,D30[1]                  @vdup_n_u8(pi1_sao_offset[1])
+    VADD.I8     D6,D2,D31                   @band_table.val[1] = vadd_u8(band_table.val[1], band_pos)
+
+    VDUP.8      D28,D30[2]                  @vdup_n_u8(pi1_sao_offset[2])
+    VADD.I8     D7,D3,D31                   @band_table.val[2] = vadd_u8(band_table.val[2], band_pos)
+
+    VDUP.8      D27,D30[3]                  @vdup_n_u8(pi1_sao_offset[3])
+    VADD.I8     D8,D4,D31                   @band_table.val[3] = vadd_u8(band_table.val[3], band_pos)
+
+    VDUP.8      D26,D30[4]                  @vdup_n_u8(pi1_sao_offset[4])
+    VADD.I8     D1,D5,D29                   @band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1]))
+
+    VMOV.I8     D29,#16                     @vdup_n_u8(16)
+    VADD.I8     D2,D6,D28                   @band_table.val[1] = vadd_u8(band_table.val[1], vdup_n_u8(pi1_sao_offset[2]))
+
+    CMP         r5,#28
+    VADD.I8     D3,D7,D27                   @band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3]))
+
+    VADD.I8     D4,D8,D26                   @band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4]))
+    BLT         SAO_BAND_POS_0
+
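+    @ sao_band_pos values 28-31 wrap the four-band window past band 31;
+    @ each case below patches the wrapped table entries (VCLE against 16,
+    @ then VORR/VAND) so the VTBX lookup stays consistent. Band position
+    @ 0 needs the same fix-up on its first entry.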
+SAO_BAND_POS_28:                            @case 28
+
+    VCLE.U8     D12,D4,D29                  @vcle_u8(band_table.val[3], vdup_n_u8(16))
+
+    BNE         SAO_BAND_POS_29
+    VORR.U8     D4,D4,D12                   @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK
+
+SAO_BAND_POS_29:                            @case 29
+    CMP         r5,#29
+    VCLE.U8     D11,D3,D29                  @vcle_u8(band_table.val[2], vdup_n_u8(16))
+
+    BNE         SAO_BAND_POS_30
+    VORR.U8     D3,D3,D11                   @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+    VAND.U8     D4,D4,D12                   @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK
+
+SAO_BAND_POS_30:                            @case 30
+    CMP         r5,#30
+    VCLE.U8     D10,D2,D29                  @vcle_u8(band_table.val[1], vdup_n_u8(16))
+
+    BNE         SAO_BAND_POS_31
+    VORR.U8     D2,D2,D10                   @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+    VAND.U8     D3,D3,D11                   @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+    B           SWITCH_BREAK
+
+SAO_BAND_POS_31:                            @case 31
+    CMP         r5,#31
+    BNE         SWITCH_BREAK
+
+    VCLE.U8     D9,D1,D29                   @vcle_u8(band_table.val[0], vdup_n_u8(16))
+    VORR.U8     D1,D1,D9                    @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+    VAND.U8     D2,D2,D10                   @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+
+SAO_BAND_POS_0:
+    CMP         r5,#0                       @case 0
+    BNE         SWITCH_BREAK
+
+    VCLE.U8     D9,D1,D29                   @vcle_u8(band_table.val[0], vdup_n_u8(16))
+    VAND.U8     D1,D1,D9                    @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK:
+    MOV         r4,r0                       @pu1_src_cpy
+    MOV         r11,r8                      @move ht
+    ADD         r5,r4,r1
+
+HEIGHT_LOOP:
+    ADD         r6,r5,r1
+    VLD1.8      D13,[r4]                    @au1_cur_row = vld1_u8(pu1_src_cpy)
+
+    ADD         r10,r6,r1
+    VLD1.8      D15,[r5]                    @au1_cur_row = vld1_u8(pu1_src_cpy)
+
+    VLD1.8      D17,[r6]                    @au1_cur_row = vld1_u8(pu1_src_cpy)
+
+    VLD1.8      D19,[r10]                   @au1_cur_row = vld1_u8(pu1_src_cpy)
+    VSUB.I8     D14,D13,D31                 @vsub_u8(au1_cur_row, band_pos)
+
+    VTBX.8      D13,{D1-D4},D14             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+    VSUB.I8     D16,D15,D31                 @vsub_u8(au1_cur_row, band_pos)
+
+    VTBX.8      D15,{D1-D4},D16             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+    VSUB.I8     D18,D17,D31                 @vsub_u8(au1_cur_row, band_pos)
+
+    VTBX.8      D17,{D1-D4},D18             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+    VSUB.I8     D20,D19,D31                 @vsub_u8(au1_cur_row, band_pos)
+
+    VTBX.8      D19,{D1-D4},D20             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+    VST1.8      D13,[r4],r1                 @vst1_u8(pu1_src_cpy, au1_cur_row)
+
+    VST1.8      D15,[r5]                    @vst1_u8(pu1_src_cpy, au1_cur_row)
+    SUBS        r11,r11,#4                  @Decrement the ht loop count by 4
+
+    VST1.8      D17,[r6],r1                 @vst1_u8(pu1_src_cpy, au1_cur_row)
+
+    ADD         r4,r6,r1
+    VST1.8      D19,[r10]                   @vst1_u8(pu1_src_cpy, au1_cur_row)
+    ADD         r5,r4,r1
+
+    BNE         HEIGHT_LOOP
+
+    SUBS        r7,r7,#8                    @Decrement the width loop by 8
+    ADD         r0,r0,#8
+    BNE         SWITCH_BREAK
+
+    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
+
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class0.s b/common/arm/ihevc_sao_edge_offset_class0.s
new file mode 100644
index 0000000..a9fe046
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class0.s
@@ -0,0 +1,344 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_edge_offset_class0.s
+@*
+@* @brief
+@*  Contains function definitions for SAO edge offset of class 0.
+@* Functions are coded using NEON instructions and can be compiled using
+@* ARM RVCT
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
+@                              WORD32 src_strd,
+@                              UWORD8 *pu1_src_left,
+@                              UWORD8 *pu1_src_top,
+@                              UWORD8 *pu1_src_top_left,
+@                              UWORD8 *pu1_src_top_right,
+@                              UWORD8 *pu1_src_bot_left,
+@                              UWORD8 *pu1_avail,
+@                              WORD8 *pi1_sao_offset,
+@                              WORD32 wd,
+@                              WORD32 ht)
+@
+@**************Variables Vs Registers*****************************************
+@r0 =>  *pu1_src
+@r1 =>  src_strd
+@r2 =>  *pu1_src_left
+@r3 =>  *pu1_src_top
+@r4 =>  *pu1_src_top_left
+@r7 =>  *pu1_avail
+@r8 =>  *pi1_sao_offset
+@r9 =>  wd
+@r10=>  ht
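+@
+@ Edge offset class 0 works on horizontal edges: each sample is compared
+@ with its left and right neighbours, edge_idx = 2 + sign(cur - left) +
+@ sign(cur - right) is remapped through gi1_table_edge_idx, the matching
+@ pi1_sao_offset entry is added, and the result is clipped to [0, 255].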
+
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class0_a9q
+
+gi1_table_edge_idx_addr:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+ihevc_sao_edge_offset_class0_a9q:
+
+
+    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    LDR         r9,[sp,#60]                 @Loads wd
+
+    LDR         r4,[sp,#40]                 @Loads pu1_src_top_left
+    VMOV.I8     Q1,#2                       @const_2 = vdupq_n_s8(2)
+    ADD         r11,r3,r9                   @pu1_src_top[wd]
+
+    LDR         r10,[sp,#64]                @Loads ht
+    VMOV.I16    Q2,#0                       @const_min_clip = vdupq_n_s16(0)
+    LDRB        r12,[r11,#-1]               @pu1_src_top[wd - 1]
+
+    LDR         r7,[sp,#52]                 @Loads pu1_avail
+    VMOV.I16    Q3,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    LDR         r14, gi1_table_edge_idx_addr @table pointer
+ulbl1:
+    add         r14,r14,pc
+
+    LDR         r8,[sp,#56]                 @Loads pi1_sao_offset
+    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
+    STRB        r12,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 1]
+
+    MOV         r6,r0                       @pu1_src_org
+    VLD1.8      D10,[r14]                   @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    SUB         r4,r10,#1                   @(ht - 1)
+
+    MOV         r12,r9                      @Move wd to r12 for loop count
+    VLD1.8      D11,[r8]                    @offset_tbl = vld1_s8(pi1_sao_offset)
+    MUL         r4,r4,r1                    @(ht - 1) * src_strd
+
+    ADD         r4,r4,r0                    @pu1_src[(ht - 1) * src_strd]
+
+SRC_TOP_LOOP:                               @wd is always multiple of 8
+    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
+    SUBS        r12,r12,#8                  @Decrement the loop counter by 8
+    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
+    BNE         SRC_TOP_LOOP
+    ADD         r6,r6,#15                   @pu1_src_org[16 - 1]
+
+    CMP         r9,#16                      @Compare wd with 16
+    MOV         r3,r2                       @pu1_src_left backup to reload later
+    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+
+    MOV         r8,r9                       @move wd to r8 for loop count
+
+WIDTH_LOOP_16:
+    CMP         r8,r9                       @if(col == wd)
+    BNE         AU1_MASK_FF                 @jump to else part
+    LDRB        r12,[r7]                    @pu1_avail[0]
+    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+    B           SKIP_AU1_MASK_FF            @Skip the else part
+
+AU1_MASK_FF:
+    MOV         r12,#0xFF                   @move -1 to r12
+    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF:
+    CMP         r8,#16                      @If col == 16
+    BNE         SKIP_MASKING_IF_NOT16       @If not skip masking
+    LDRB        r12,[r7,#1]                 @pu1_avail[1]
+    VMOV.8      D9[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_MASKING_IF_NOT16:
+    MOV         r12,r0                      @pu1_src_cpy = pu1_src
+    MOV         r4,r10                      @move ht to r4 for loop count
+
+PU1_SRC_LOOP:
+    LDRB        r11,[r2]                    @load pu1_src_left; on the first row ht - row = 0, pu1_src_left is incremented later
+    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    VLD1.8      D13,[r12], r1               @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    SUB         r12,#8
+    SUB         r5,r9,r8                    @wd - col
+
+    SUB         r14,r10,r4                  @ht - row
+    VMOV.8      D15[7],r11                  @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+    MUL         r14,r14,r1                  @(ht - row) * src_strd
+
+    VLD1.8      D26,[r12]!                  @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    VLD1.8      D27,[r12]                   @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    SUB         r12,#8
+    VEXT.8      Q7,Q7,Q6,#15                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+    ADD         r5,r14,r5                   @(ht - row) * src_strd + (wd - col)
+
+    LDRB        r11,[r2, #1]                @II Iteration load pu1_src_left since ht - row + 1 =1
+    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    LDRB        r14,[r6,r5]                 @pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
+
+    SUB         r4,r4,#1
+    VMOV.8      D29[7],r11                  @II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    SUB         r12,r12,r1                  @Decrement the pu1_src pointer by src_strd
+    VSUB.I8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    STRB        r14,[r2],#1                 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
+    VEXT.8      Q14,Q14,Q13,#15             @II Iteration pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+    SUB         r5,r9,r8                    @II wd - col
+
+    ADD         r12,r12,r1                  @Increment the pu1_src pointer by src_strd
+    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    VCGT.U8     Q15,Q13,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    LDRB        r11,[r12,#16]               @II pu1_src_cpy[16]
+    VEXT.8      Q7,Q6,Q7,#1                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+    SUB         r14,r10,r4                  @II ht - row
+
+    VCLT.U8     Q0,Q13,Q14                  @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VMOV.8      D28[0],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    SUB         r12,r12,r1                  @Decrement the pu1_src pointer by src_strd
+
+    MUL         r14,r14,r1                  @II (ht - row) * src_strd
+    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    ADD         r5,r14,r5                   @II (ht - row) * src_strd + (wd - col)
+
+    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VEXT.8      Q14,Q13,Q14,#1              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+
+    LDRB        r14,[r6,r5]                 @II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
+    VSUB.I8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUBS        r4,r4,#1                    @Decrement row by 1
+
+    VADD.I8     Q7,Q1,Q10                   @edge_idx = vaddq_s8(const_2, sign_left)
+    STRB        r14,[r2],#1                 @II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+    VADD.I8     Q7,Q7,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_right)
+    VMOVL.U8    Q9,D12                      @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    VSUB.I8     Q10,Q0,Q15                  @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VTBL.8      D14,{D10},D14               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VCGT.U8     Q15,Q13,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    VCLT.U8     Q0,Q13,Q14                  @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VTBL.8      D15,{D10},D15               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VSUB.I8     Q11,Q0,Q15                  @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VAND        Q7,Q7,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
+    VTBL.8      D16,{D11},D14               @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMOVL.U8    Q0,D26                      @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    VADD.I8     Q14,Q1,Q10                  @II edge_idx = vaddq_s8(const_2, sign_left)
+    VADD.I8     Q14,Q14,Q11                 @II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+    VADDW.S8    Q9,Q9,D16                   @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VTBL.8      D28,{D10},D28               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VMAX.S16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VTBL.8      D29,{D10},D29               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VMIN.U16    Q9,Q9,Q3                    @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VAND        Q14,Q14,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
+    VTBL.8      D17,{D11},D15               @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+    VMOVL.U8    Q7,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VTBL.8      D30,{D11},D28               @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VADDW.S8    Q7,Q7,D17                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    VMAX.S16    Q7,Q7,Q2                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VTBL.8      D31,{D11},D29               @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMIN.U16    Q7,Q7,Q3                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VMOVN.I16   D18,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[0])
+    VADDW.S8    Q0,Q0,D30                   @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMOVN.I16   D19,Q7                      @vmovn_s16(pi2_tmp_cur_row.val[1])
+    VMAX.S16    Q0,Q0,Q2                    @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VMOVL.U8    Q14,D27                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VMIN.U16    Q0,Q0,Q3                    @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVN.I16   D0,Q0                       @II vmovn_s16(pi2_tmp_cur_row.val[0])
+    VADDW.S8    Q14,Q14,D31                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    VMAX.S16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VST1.8      {D18,D19},[r12],r1          @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    VMIN.U16    Q14,Q14,Q3                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VMOVN.I16   D1,Q14                      @II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    VST1.8      {D0,D1},[r12],r1            @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BNE         PU1_SRC_LOOP                @If not equal jump to the inner loop
+
+    ADD         r0,r0,#16                   @pu1_src += 16
+
+    SUBS        r8,r8,#16                   @Decrement column by 16
+    CMP         r8,#8                       @Check whether residue remains
+    MOV         r2,r3                       @Reload pu1_src_left
+    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
+    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
+    BLT         END_LOOPS                   @Jump to end function
+
+WIDTH_RESIDUE:
+    SUB         r6,r6,#15
+    AND         r8,r9,#0xF                  @wd_rem = wd & 0xF
+    CMP         r8,#0                       @Residue check
+    BEQ         END_LOOPS                   @No Residue jump to end function
+
+    CMP         r8,r9                       @if(wd_rem == wd)
+    BNE         AU1_MASK_FF_RESIDUE         @jump to else part
+    LDRB        r12,[r7]                    @pu1_avail[0]
+    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+    B           SKIP_AU1_MASK_FF_RESIDUE    @Skip the else part
+
+AU1_MASK_FF_RESIDUE:
+    MOV         r12,#0xFF                   @move -1 to r12
+    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF_RESIDUE:
+    LDRB        r11,[r7,#1]                 @pu1_avail[1]
+    SUB         r5,r9,#1                    @wd - 1
+
+    MOV         r4,r10                      @move ht to r4 for loop count
+    VMOV.8      D8[7],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+    MOV         r12,r0                      @pu1_src_cpy = pu1_src
+
+PU1_SRC_LOOP_RESIDUE:
+    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    VLD1.8      D13,[r12]                   @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    SUB         r12,#8
+    LDRB        r11,[r2]                    @load pu1_src_left
+    VMOV.8      D15[7],r11                  @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+    VEXT.8      Q7,Q7,Q6,#15                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+
+    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VSUB.I8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
+    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    VEXT.8      Q7,Q6,Q7,#1                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+
+    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VSUB.I8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q12,Q1,Q10                  @edge_idx = vaddq_s8(const_2, sign_left)
+    VADD.I8     Q12,Q12,Q11                 @edge_idx = vaddq_s8(edge_idx, sign_right)
+
+    VTBL.8      D24,{D10},D24               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D25,{D10},D25               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VAND        Q12,Q12,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VNEG.S8     Q10,Q11                     @sign_left = vnegq_s8(sign_right)
+    VEXT.8      Q10,Q10,Q11,#15             @sign_left = vextq_s8(sign_left, sign_left, 15)
+
+    VTBL.8      D26,{D11},D24               @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADDW.S8    Q14,Q14,D26                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q14,Q14,Q3                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SUB         r14,r10,r4                  @ht - row
+    MUL         r14,r14,r1                  @(ht - row) * src_strd
+    ADD         r11,r14,r5                  @(ht - row) * src_strd + (wd - 1)
+    LDRB        r14,[r6, r11]               @pu1_src_org[(ht - row) * src_strd + (wd - 1)]
+    STRB        r14,[r2],#1                 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+    VST1.8      {D28},[r12],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    SUBS        r4,r4,#1                    @Decrement row by 1
+    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to the pu1_src loop
+
+END_LOOPS:
+    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
+
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class0_chroma.s b/common/arm/ihevc_sao_edge_offset_class0_chroma.s
new file mode 100644
index 0000000..1dd56f6
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class0_chroma.s
@@ -0,0 +1,431 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_edge_offset_class0_chroma.s
+@*
+@* @brief
+@*  Contains function definitions for SAO edge offset of class 0 for chroma.
+@* Functions are coded using NEON instructions and can be compiled using
+@* ARM RVCT
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src,
+@                              WORD32 src_strd,
+@                              UWORD8 *pu1_src_left,
+@                              UWORD8 *pu1_src_top,
+@                              UWORD8 *pu1_src_top_left,
+@                              UWORD8 *pu1_src_top_right,
+@                              UWORD8 *pu1_src_bot_left,
+@                              UWORD8 *pu1_avail,
+@                              WORD8 *pi1_sao_offset_u,
+@                              WORD8 *pi1_sao_offset_v,
+@                              WORD32 wd,
+@                              WORD32 ht)
+@
+@**************Variables Vs Registers*****************************************
+@r0 =>  *pu1_src
+@r1 =>  src_strd
+@r2 =>  *pu1_src_left
+@r3 =>  *pu1_src_top
+@r4 =>  *pu1_src_top_left
+@r7 =>  *pu1_avail
+@r8 =>  *pi1_sao_offset_u
+@r5 =>  *pi1_sao_offset_v
+@r9 =>  wd
+@r10=>  ht
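+@
+@ Chroma variant of edge offset class 0: the source holds interleaved
+@ CbCr samples, so the left/right neighbour comparisons shift by two
+@ bytes, and the looked-up offsets come from separate U and V tables
+@ (pi1_sao_offset_u / pi1_sao_offset_v) before results are re-interleaved.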
+
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class0_chroma_a9q
+
+gi1_table_edge_idx_addr:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+ihevc_sao_edge_offset_class0_chroma_a9q:
+
+
+    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    LDR         r9,[sp,#64]                 @Loads wd
+
+    LDR         r4,[sp,#40]                 @Loads pu1_src_top_left
+    ADD         r11,r3,r9                   @pu1_src_top[wd]
+
+    LDR         r10,[sp,#68]                @Loads ht
+    VMOV.I8     Q1,#2                       @const_2 = vdupq_n_s8(2)
+    LDRH        r12,[r11,#-2]               @pu1_src_top[wd - 2] (last Cb/Cr pair)
+
+    LDR         r7,[sp,#52]                 @Loads pu1_avail
+    VMOV.I16    Q2,#0                       @const_min_clip = vdupq_n_s16(0)
+    STRH        r12,[r4]                    @*pu1_src_top_left = last Cb/Cr pair of pu1_src_top
+
+    LDR         r8,[sp,#56]                 @Loads pi1_sao_offset_u
+    VMOV.I16    Q3,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    SUB         r4,r10,#1                   @(ht - 1)
+
+    LDR         r14, gi1_table_edge_idx_addr @table pointer
+ulbl1:
+    add         r14,r14,pc
+    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
+    MUL         r4,r4,r1                    @(ht - 1) * src_strd
+
+    LDR         r5,[sp,#60]                 @Loads pi1_sao_offset_v
+    VLD1.8      D11,[r8]                    @offset_tbl = vld1_s8(pi1_sao_offset_u)
+    ADD         r4,r4,r0                    @pu1_src[(ht - 1) * src_strd]
+
+    MOV         r6,r0                       @pu1_src_org
+    VLD1.8      D10,[r14]                   @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    MOV         r12,r9                      @Move wd to r12 for loop count
+
+SRC_TOP_LOOP:                               @wd is always multiple of 8
+    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
+    SUBS        r12,r12,#8                  @Decrement the loop counter by 8
+    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
+    BNE         SRC_TOP_LOOP
+    ADD         r6,r6,#14                   @pu1_src_org[14]
+
+    MOV         r3,r2                       @pu1_src_left backup to reload later
+    VLD1.8      D0,[r5]                     @offset_tbl = vld1_s8(pi1_sao_offset_v)
+    CMP         r9,#16                      @Compare wd with 16
+
+    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+
+    MOV         r8,r9                       @move wd to r8 for loop count
+
+WIDTH_LOOP_16:
+    CMP         r8,r9                       @if(col == wd)
+    BNE         AU1_MASK_FF                 @jump to else part
+    LDRB        r12,[r7]                    @pu1_avail[0]
+    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+    VMOV.8      D8[1],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
+    B           SKIP_AU1_MASK_FF            @Skip the else part
+
+AU1_MASK_FF:
+    MOV         r12,#-1                     @move -1 to r12
+    VMOV.16     D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF:
+    CMP         r8,#16                      @If col == 16
+    BNE         SKIP_MASKING_IF_NOT16       @If not skip masking
+    LDRB        r12,[r7,#1]                 @pu1_avail[1]
+    VMOV.8      D9[6],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+    VMOV.8      D9[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_MASKING_IF_NOT16:
+    MOV         r12,r0                      @pu1_src_cpy = pu1_src
+    MOV         r4,r10                      @move ht to r4 for loop count
+
+PU1_SRC_LOOP:
+    LDRH        r11,[r2]                    @load pu1_src_left; on the first row ht - row = 0, pu1_src_left is incremented later
+    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    VLD1.8      D13,[r12],r1                @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    SUB         r12,#8
+    SUB         r5,r9,r8                    @wd - col
+
+    SUB         r14,r10,r4                  @ht - row
+    VMOV.16     D15[3],r11                  @vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+    MUL         r14,r14,r1                  @(ht - row) * src_strd
+
+    VLD1.8      D30,[r12]!                  @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    VLD1.8      D31,[r12]                   @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    SUB         r12,#8
+    VEXT.8      Q7,Q7,Q6,#14                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+    SUB         r12,r12,r1
+
+    LDRH        r11,[r2,#2]                 @II load pu1_src_left since ht - row =0
+    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    ADD         r5,r14,r5                   @(ht - row) * src_strd + (wd - col)
+
+    VMOV.16     D29[3],r11                  @II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    LDRH        r14,[r6,r5]                 @pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
+    VSUB.U8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUB         r4,r4,#1
+
+    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
+    VEXT.8      Q14,Q14,Q15,#14             @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+
+    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    LDRB        r11,[r12,#17]               @pu1_src_cpy[17]
+    VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    STRH        r14,[r2],#2                 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+    ADD         r12,r12,r1
+    VMOV.8      D14[1],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+    LDRB        r11,[r12,#16]               @II pu1_src_cpy[16]
+
+    VEXT.8      Q7,Q6,Q7,#2                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+    VMOV.8      D28[0],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+
+    LDRB        r11,[r12,#17]               @II pu1_src_cpy[17]
+    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    SUB         r12,r12,r1
+
+    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VMOV.8      D28[1],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+
+    VSUB.U8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VEXT.8      Q14,Q15,Q14,#2              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+
+    VADD.U8     Q7,Q1,Q10                   @edge_idx = vaddq_s8(const_2, sign_left)
+
+    VADD.U8     Q7,Q7,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_right)
+    VTBL.8      D14,{D10},D14               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VSUB.U8     Q10,Q12,Q13                 @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VTBL.8      D15,{D10},D15               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    VAND        Q7,Q7,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
+    VUZP.8      D14,D15                     @de-interleave edge_idx into U (D14) and V (D15) lanes
+
+    VSUB.U8     Q11,Q12,Q13                 @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VTBL.8      D16,{D11},D14               @offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+    VADD.U8     Q12,Q1,Q10                  @II edge_idx = vaddq_s8(const_2, sign_left)
+
+    VMOVL.U8    Q9,D12                      @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VTBL.8      D17,{D0},D15                @offset = vtbl1_s8(offset_tbl_v, edge_idx_v)
+    VADD.U8     Q12,Q12,Q11                 @II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+    VZIP.S8     D16,D17                     @re-interleave the U and V offsets
+    VTBL.8      D24,{D10},D24               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VMOVL.U8    Q6,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    VADDW.S8    Q9,Q9,D16                   @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VTBL.8      D25,{D10},D25               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VMAX.S16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VAND        Q12,Q12,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
+    VMIN.U16    Q9,Q9,Q3                    @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    VUZP.8      D24,D25                     @II de-interleave edge_idx into U and V lanes
+
+    VADDW.S8    Q6,Q6,D17                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VTBL.8      D26,{D11},D24               @II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+    VMAX.S16    Q6,Q6,Q2                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    VMIN.U16    Q6,Q6,Q3                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    VTBL.8      D27,{D0},D25                @II offset = vtbl1_s8(offset_tbl_v, edge_idx_v)
+    VMOVN.I16   D14,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VMOVN.I16   D15,Q6                      @vmovn_s16(pi2_tmp_cur_row.val[1])
+    VZIP.S8     D26,D27                     @II re-interleave the U and V offsets
+
+    SUB         r5,r9,r8                    @II wd - col
+    VMOVL.U8    Q14,D30                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SUB         r14,r10,r4                  @II ht - row
+
+    MUL         r14,r14,r1                  @II (ht - row) * src_strd
+    VADDW.S8    Q14,Q14,D26                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    ADD         r5,r14,r5                   @II (ht - row) * src_strd + (wd - col)
+
+    LDRH        r14,[r6,r5]                 @II pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
+    VMAX.S16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    STRH        r14,[r2],#2                 @II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+    VMIN.U16    Q14,Q14,Q3                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVL.U8    Q15,D31                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    VADDW.S8    Q15,Q15,D27                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VST1.8      {D14,D15},[r12],r1          @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    VMAX.S16    Q15,Q15,Q2                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    SUBS        r4,r4,#1                    @Decrement row by 1
+    VMIN.U16    Q15,Q15,Q3                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
+    VMOVN.I16   D29,Q15                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    VST1.8      {D28,D29},[r12],r1          @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BNE         PU1_SRC_LOOP                @If not equal jump to the inner loop
+
+    ADD         r0,r0,#16                   @pu1_src += 16
+
+    SUBS        r8,r8,#16                   @Decrement column by 16
+    CMP         r8,#8                       @Check whether residue remains
+    MOV         r2,r3                       @Reload pu1_src_left
+    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
+    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
+    BLT         END_LOOPS                   @Jump to end function
+
+WIDTH_RESIDUE:
+    SUB         r6,r6,#14                   @remove the 14-byte last-column bias used by the 16-wide loop
+    AND         r8,r9,#0xF                  @wd_rem = wd & 0xF
+    CMP         r8,#0                       @Residue check
+    BEQ         END_LOOPS                   @No Residue jump to end function
+
+    CMP         r8,r9                       @if(wd_rem == wd)
+    BNE         AU1_MASK_FF_RESIDUE         @jump to else part
+    LDRB        r12,[r7]                    @pu1_avail[0]
+    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+    VMOV.8      D8[1],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
+    B           SKIP_AU1_MASK_FF_RESIDUE    @Skip the else part
+
+AU1_MASK_FF_RESIDUE:
+    MOV         r12,#-1                     @move -1 to r12
+    VMOV.16     D8[0],r12                   @au1_mask = vsetq_lane_s16(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF_RESIDUE:
+    LDRB        r12,[r7,#1]                 @pu1_avail[1]
+    VMOV.8      D8[6],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+    VMOV.8      D8[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+    MOV         r12,r0                      @pu1_src_cpy = pu1_src
+    MOV         r4,r10                      @move ht to r4 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+    LDRH        r11,[r2]                    @load pu1_src_left
+    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    VLD1.8      D13,[r12],r1                @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    SUB         r12,#8
+    SUB         r5,r9,#2                    @wd - 2
+
+    SUB         r14,r10,r4                  @(ht - row)
+    VMOV.16     D15[3],r11                  @vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+    LSL         r14,r14,#1                  @(ht - row) * 2
+
+    VLD1.8      D30,[r12]!                  @II pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    VLD1.8      D31,[r12]                   @II pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    SUB         r12,#8
+    VEXT.8      Q7,Q7,Q6,#14                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+    SUB         r12,r12,r1
+
+    LDRH        r11,[r2,#2]                 @II load pu1_src_left
+    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    MUL         r14,r14,r1                  @(ht - row) * 2 * src_strd
+
+    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VMOV.16     D29[3],r11                  @II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+
+    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
+    VSUB.U8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    ADD         r5,r14,r5                   @(ht - row) * 2 * src_strd + (wd - 2)
+
+    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    VEXT.8      Q14,Q14,Q15,#14             @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+
+    LDRB        r11,[r12,#17]               @pu1_src_cpy[17]
+    VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    LDRH        r14,[r6, r5]                @pu1_src_org[(ht - row)  * 2* src_strd + (wd - 2)]
+
+    VMOV.8      D14[1],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+    VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    ADD         r12,r12,r1
+
+    STRH        r14,[r2],#2                 @pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
+    VEXT.8      Q7,Q6,Q7,#2                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+    LDRB        r11,[r12,#16]               @II pu1_src_cpy[16]
+
+    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VMOV.8      D28[0],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+
+    LDRB        r11,[r12,#17]               @II pu1_src_cpy[17]
+    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    SUB         r4,r4,#1                    @II Decrement row by 1
+
+    VSUB.U8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VMOV.8      D28[1],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+    SUB         r12,r12,r1
+
+    VADD.U8     Q7,Q1,Q10                   @edge_idx = vaddq_s8(const_2, sign_left)
+    VEXT.8      Q14,Q15,Q14,#2              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+
+    VADD.U8     Q7,Q7,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_right)
+
+    VSUB.U8     Q10,Q12,Q13                 @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VTBL.8      D14,{D10},D14               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VTBL.8      D15,{D10},D15               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VSUB.U8     Q11,Q12,Q13                 @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VAND        Q7,Q7,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
+    VUZP.8      D14,D15
+
+    VADD.U8     Q14,Q1,Q10                  @II edge_idx = vaddq_s8(const_2, sign_left)
+    VTBL.8      D16,{D11},D14               @offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+    VADD.U8     Q14,Q14,Q11                 @II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+    VMOVL.U8    Q9,D12                      @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VTBL.8      D17,{D0},D15
+    VMOVL.U8    Q12,D30                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    VZIP.S8     D16,D17
+    VTBL.8      D28,{D10},D28               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VADDW.S8    Q9,Q9,D16                   @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMAX.S16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VTBL.8      D29,{D10},D29               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VMIN.U16    Q9,Q9,Q3                    @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVN.I16   D18,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[0])
+    VAND        Q14,Q14,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    SUB         r5,r9,#2                    @II wd - 2
+    VUZP.8      D28,D29                     @II
+    SUB         r14,r10,r4                  @II (ht - row)
+
+    LSL         r14,r14,#1                  @II (ht - row) * 2
+    VTBL.8      D26,{D11},D28               @II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+    MUL         r14,r14,r1                  @II (ht - row) * 2 * src_strd
+
+    ADD         r5,r14,r5                   @II (ht - row) * 2 * src_strd + (wd - 2)
+    VTBL.8      D27,{D0},D29                @II
+    LDRH        r14,[r6, r5]                @II pu1_src_org[(ht - row)  * 2* src_strd + (wd - 2)]
+
+    VZIP.S8     D26,D27                     @II
+    VST1.8      {D18},[r12],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    STRH        r14,[r2],#2                 @II pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
+    VADDW.S8    Q12,Q12,D26                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SUBS        r4,r4,#1                    @Decrement row by 1
+
+    VMAX.S16    Q12,Q12,Q2                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q12,Q12,Q3                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVN.I16   D28,Q12                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VST1.8      {D28},[r12],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to the pu1_src loop
+
+END_LOOPS:
+    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
+
+
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class1.s b/common/arm/ihevc_sao_edge_offset_class1.s
new file mode 100644
index 0000000..aa1337f
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class1.s
@@ -0,0 +1,371 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_edge_offset_class1.s
+@*
+@* @brief
+@*  Contains function definitions for SAO edge offset, class 1 (vertical
+@* edges). Functions are coded using NEON intrinsics and can be compiled
+@* using ARM RVCT
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
+@                              WORD32 src_strd,
+@                              UWORD8 *pu1_src_left,
+@                              UWORD8 *pu1_src_top,
+@                              UWORD8 *pu1_src_top_left,
+@                              UWORD8 *pu1_src_top_right,
+@                              UWORD8 *pu1_src_bot_left,
+@                              UWORD8 *pu1_avail,
+@                              WORD8 *pi1_sao_offset,
+@                              WORD32 wd,
+@                              WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 =>  *pu1_src
+@r1 =>  src_strd
+@r2 =>  *pu1_src_left
+@r3 =>  *pu1_src_top
+@r4 =>  *pu1_src_top_left
+@r5 =>  *pu1_avail
+@r6 =>  *pi1_sao_offset
+@r7 =>  wd
+@r8 =>  ht
+
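+@ Editor's sketch (illustration only, not part of the original source): per
+@ pixel, the class-1 (vertical) edge offset computed below is equivalent to:
+@
+@     sign_up   = SIGN(src[x] - src[x - src_strd]);
+@     sign_down = SIGN(src[x] - src[x + src_strd]);
+@     edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
+@     src[x]    = CLIP3(0, 255, src[x] + pi1_sao_offset[edge_idx]);
+@
+@ with SIGN() in {-1, 0, 1} and CLIP3() clamping to the 8-bit sample range.
+@ The row loop is unrolled by two: sign_up of the next row is simply the
+@ negated sign_down of the current row (the VNEG.S8 below).
+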
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class1_a9q
+
+gi1_table_edge_idx_addr:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+ihevc_sao_edge_offset_class1_a9q:
+
+
+    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    LDR         r7,[sp,#60]                 @Loads wd
+    LDR         r4,[sp,#40]                 @Loads pu1_src_top_left
+    LDR         r5,[sp,#52]                 @Loads pu1_avail
+    LDR         r6,[sp,#56]                 @Loads pi1_sao_offset
+    LDR         r8,[sp,#64]                 @Loads ht
+
+    SUB         r9,r7,#1                    @wd - 1
+    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
+    STRB        r10,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 1]
+    ADD         r10,r0,r9                   @pu1_src[row * src_strd + wd - 1]
+    MOV         r11,r2                      @Move pu1_src_left pointer to r11
+    MOV         r12,r8                      @Move ht to r12 for loop count
+SRC_LEFT_LOOP:
+    LDRB        r14,[r10],r1                @Load pu1_src[row * src_strd + wd - 1]
+    STRB        r14,[r11],#1                @pu1_src_left[row]
+    SUBS        r12,#1                      @Decrement the loop count
+    BNE         SRC_LEFT_LOOP               @If not equal to 0 jump to the src_left_loop
+
+    SUB         r12,r8,#1                   @ht - 1
+    MUL         r12,r12,r1                  @(ht - 1) * src_strd
+    ADD         r12,r12,r0                  @pu1_src[(ht - 1) * src_strd]
+
+    LDRB        r4,[r5,#2]                  @pu1_avail[2]
+    CMP         r4,#0                       @0 == pu1_avail[2]
+    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
+    SUBEQ       r8,r8,#1                    @ht--
+
+    LDRB        r4,[r5,#3]                  @pu1_avail[3]
+    CMP         r4,#0                       @0 == pu1_avail[3]
+    SUBEQ       r8,r8,#1                    @ht--
+
+    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
+    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
+    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    LDR         r14, gi1_table_edge_idx_addr @table pointer
+ulbl1:
+    add         r14,r14,pc
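+@ (Editor's note: the literal gi1_table_edge_idx - ulbl1 - 8, added to PC
+@ here, yields the table's absolute address position-independently: in ARM
+@ state PC reads as the address of the current instruction + 8, so
+@ r14 = (gi1_table_edge_idx - ulbl1 - 8) + ulbl1 + 8 = gi1_table_edge_idx.)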
+    VLD1.8      D6,[r14]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
+
+    CMP         r7,#16                      @Compare wd with 16
+    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
+
+WIDTH_LOOP_16:
+    LDRB        r4,[r5,#2]                  @pu1_avail[2]
+    CMP         r4,#0                       @0 == pu1_avail[2]
+    SUBEQ       r9,r0,r1                    @pu1_src -= src_strd
+    MOVNE       r9,r3                       @*pu1_src_top
+
+    MOV         r10,r0                      @*pu1_src
+
+    VLD1.8      D8,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    VLD1.8      D9,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    VLD1.8      D10,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D11,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+
+    VLD1.8      D30,[r12]!                  @vld1q_u8(pu1_src[(ht - 1) * src_strd])
+    VLD1.8      D31,[r12]!                  @vld1q_u8(pu1_src[(ht - 1) * src_strd])
+    VCGT.U8     Q6,Q5,Q4                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+    VST1.8      {Q15},[r3]!                 @vst1q_u8(pu1_src_top[col])
+    VCLT.U8     Q7,Q5,Q4                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+
+    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         r11,r8                      @move ht to r11 for loop count
+
+PU1_SRC_LOOP:
+    ADD         r10,r10,r1                  @*pu1_src + src_strd
+    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r10,#8
+    ADD         r6,r10,r1                   @II Iteration *pu1_src + src_strd
+
+    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
+    VLD1.8      D30,[r6]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D31,[r6]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r6,#8
+
+    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         r10,r10,r1
+
+    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VMOVL.U8    Q13,D18                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
+    VMOVL.U8    Q14,D19                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+    VNEG.S8     Q8,Q10                      @sign_up = vnegq_s8(sign_down)
+    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+    VSUB.U8     Q4,Q12,Q11                  @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VTBL.8      D13,{D6},D13                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)
+
+
+    VNEG.S8     Q8,Q4                       @II sign_up = vnegq_s8(sign_down)
+    VTBL.8      D12,{D7},D12                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VADD.I8     Q11,Q11,Q4                  @II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+
+    VMOVL.U8    Q10,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VADDW.S8    Q10,Q10,D12                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+
+    VMOVL.U8    Q4,D11                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VTBL.8      D13,{D7},D13                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMOV        Q5,Q15                      @II pu1_cur_row = pu1_next_row
+
+    VADDW.S8    Q4,Q4,D13                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMAX.S16    Q4,Q4,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    VMIN.U16    Q4,Q4,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMOVN.I16   D21,Q4                      @vmovn_s16(pi2_tmp_cur_row.val[1])
+    VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+
+    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    VST1.8      {Q10},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    VMOVN.I16   D30,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUBS        r11,r11,#2                  @II Decrement the ht loop count by 2
+    VMOVN.I16   D31,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    VST1.8      {Q15},[r10],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BEQ         PU1_SRC_LOOP_END            @if 0 == pu1_avail[3] || 0 == pu1_avail[2], ht was decremented above
+    CMP         r11,#1                      @checking any residue remains
+    BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
+
+    ADD         r10,r10,r1                  @*pu1_src + src_strd
+    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r10,#8
+    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
+    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
+    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUB         r10,r10,r1
+
+    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D23,{D6},D23                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VTBL.8      D24,{D7},D22                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMOVL.U8    Q13,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADDW.S8    Q13,Q13,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VTBL.8      D25,{D7},D23                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMOVL.U8    Q14,D11                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VADDW.S8    Q14,Q14,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VMOVN.I16   D30,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+    VMOVN.I16   D31,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    VST1.8      {Q15},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+PU1_SRC_LOOP_END:
+    VMOV        Q5,Q9                       @pu1_cur_row = pu1_next_row
+    SUBS        r7,r7,#16                   @Decrement the wd loop count by 16
+    CMP         r7,#8                       @Check whether residue remains
+    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
+    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
+    BLT         END_LOOPS                   @Jump to end function
+
+
+WIDTH_RESIDUE:
+    LDRB        r4,[r5,#2]                  @pu1_avail[2]
+    CMP         r4,#0                       @0 == pu1_avail[2]
+    SUBEQ       r9,r0,r1                    @pu1_src -= src_strd
+    MOVNE       r9,r3                       @*pu1_src_top
+    MOV         r10,r0
+
+    VLD1.8      D8,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    VLD1.8      D9,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    VLD1.8      D10,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D11,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+
+    VLD1.8      D30,[r12]                   @vld1_u8(pu1_src[(ht - 1) * src_strd])
+    VST1.8      {D30},[r3]                  @vst1_u8(pu1_src_top[col])
+
+    VCGT.U8     Q6,Q5,Q4                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    VCLT.U8     Q7,Q5,Q4                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         r11,r8                      @move ht to r11 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+    ADD         r10,r10,r1                  @*pu1_src + src_strd
+    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r10,#8
+    ADD         r6,r10,r1                   @II Iteration *pu1_src + src_strd
+
+    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
+    VLD1.8      D30,[r6]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D31,[r6]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r6,#8
+
+    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         r10,r10,r1
+
+    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VMOVL.U8    Q13,D18                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
+    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+    VNEG.S8     Q8,Q10                      @sign_up = vnegq_s8(sign_down)
+    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VSUB.U8     Q10,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)
+    VTBL.8      D12,{D7},D12                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VNEG.S8     Q8,Q10                      @II sign_up = vnegq_s8(sign_down)
+
+    VADD.I8     Q11,Q11,Q10                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+    VMOVL.U8    Q10,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    VADDW.S8    Q10,Q10,D12                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOV        Q5,Q15                      @II pu1_cur_row = pu1_next_row
+    VST1.8      {D20},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    VMOVN.I16   D30,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SUBS        r11,r11,#2                  @Decrement the ht loop count by 2
+    VST1.8      {D30},[r10],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BEQ         END_LOOPS
+    CMP         r11,#1
+    BGT         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP
+
+
+    ADD         r10,r10,r1                  @*pu1_src + src_strd
+    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r10,#8
+    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
+    VCGT.U8     Q7,Q9,Q5                    @vcltq_u8(pu1_cur_row, pu1_next_row)
+    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUB         r10,r10,r1
+
+    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+    VTBL.8      D24,{D7},D22                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMOVL.U8    Q13,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADDW.S8    Q13,Q13,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVN.I16   D30,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VST1.8      {D30},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+END_LOOPS:
+    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
+
+
+
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class1_chroma.s b/common/arm/ihevc_sao_edge_offset_class1_chroma.s
new file mode 100644
index 0000000..09d925f
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class1_chroma.s
@@ -0,0 +1,407 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_edge_offset_class1_chroma.s
+@*
+@* @brief
+@*  Contains function definitions for SAO edge offset, class 1 (vertical
+@* edges), for interleaved chroma. Functions are coded using NEON intrinsics
+@* and can be compiled using ARM RVCT
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class1_chroma(UWORD8 *pu1_src,
+@                              WORD32 src_strd,
+@                              UWORD8 *pu1_src_left,
+@                              UWORD8 *pu1_src_top,
+@                              UWORD8 *pu1_src_top_left,
+@                              UWORD8 *pu1_src_top_right,
+@                              UWORD8 *pu1_src_bot_left,
+@                              UWORD8 *pu1_avail,
+@                              WORD8 *pi1_sao_offset_u,
+@                              WORD8 *pi1_sao_offset_v,
+@                              WORD32 wd,
+@                              WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 =>  *pu1_src
+@r1 =>  src_strd
+@r2 =>  *pu1_src_left
+@r3 =>  *pu1_src_top
+@r4 =>  *pu1_src_top_left
+@r5 =>  *pu1_avail
+@r6 =>  *pi1_sao_offset_u
+@r7 =>  *pi1_sao_offset_v
+@r8 =>  wd
+@r9 =>  ht
+
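+@ Editor's sketch (illustration only): the chroma variant below performs the
+@ same vertical edge classification as the luma version, but samples are
+@ interleaved U/V, so edge indices are de-interleaved (VUZP), looked up in
+@ per-component offset tables, and re-interleaved (VZIP). Per sample:
+@
+@     edge_idx = gi1_table_edge_idx[2 + sign_up + sign_down];
+@     offset   = (x & 1) ? pi1_sao_offset_v[edge_idx]
+@                        : pi1_sao_offset_u[edge_idx];
+@     src[x]   = CLIP3(0, 255, src[x] + offset);
+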
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class1_chroma_a9q
+
+gi1_table_edge_idx_addr:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+ihevc_sao_edge_offset_class1_chroma_a9q:
+
+
+    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    LDR         r4,[sp,#40]                 @Loads pu1_src_top_left
+    LDR         r5,[sp,#52]                 @Loads pu1_avail
+    LDR         r6,[sp,#56]                 @Loads pi1_sao_offset_u
+    LDR         r7,[sp,#60]                 @Loads pi1_sao_offset_v
+    LDR         r8,[sp,#64]                 @Loads wd
+    LDR         r9,[sp,#68]                 @Loads ht
+
+    SUB         r10,r8,#2                   @wd - 2
+    LDRH        r11,[r3,r10]                @pu1_src_top[wd - 2]
+    STRH        r11,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 2]
+    ADD         r11,r0,r10                  @pu1_src[row * src_strd + wd - 2]
+    MOV         r12,r2                      @Move pu1_src_left pointer to r12
+    MOV         r14,r9                      @Move ht to r14 for loop count
+SRC_LEFT_LOOP:
+    LDRH        r10,[r11],r1                @Load pu1_src[row * src_strd + wd - 2]
+    STRH        r10,[r12],#2                @pu1_src_left[row]
+    SUBS        r14,#1                      @Decrement the loop count
+    BNE         SRC_LEFT_LOOP               @If not equal to 0 jump to the src_left_loop
+
+    SUB         r12,r9,#1                   @ht - 1
+    MUL         r12,r12,r1                  @(ht - 1) * src_strd
+    ADD         r12,r12,r0                  @pu1_src[(ht - 1) * src_strd]
+
+    LDRB        r4,[r5,#2]                  @pu1_avail[2]
+    CMP         r4,#0                       @0 == pu1_avail[2]
+    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
+    SUBEQ       r9,r9,#1                    @ht--
+
+    LDRB        r4,[r5,#3]                  @pu1_avail[3]
+    CMP         r4,#0                       @0 == pu1_avail[3]
+    SUBEQ       r9,r9,#1                    @ht--
+
+    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
+    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
+    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    LDR         r14, gi1_table_edge_idx_addr @table pointer
+ulbl1:
+    add         r14,r14,pc
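+@ (Editor's note: same PC-relative addressing idiom as in
+@ ihevc_sao_edge_offset_class1.s above: PC reads as current instruction + 8.)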
+    VLD1.8      D6,[r14]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    VLD1.8      D7,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+    VLD1.8      D8,[r7]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+
+    CMP         r8,#16                      @Compare wd with 16
+    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
+
+WIDTH_LOOP_16:
+    LDRB        r4,[r5,#2]                  @pu1_avail[2]
+    CMP         r4,#0                       @0 == pu1_avail[2]
+    SUBEQ       r11,r0,r1                   @pu1_src -= src_strd
+    MOVNE       r11,r3                      @*pu1_src_top
+
+    MOV         r10,r0                      @*pu1_src
+
+    VLD1.8      D28,[r11]!                  @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    VLD1.8      D29,[r11]!                  @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    VLD1.8      D10,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D11,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+
+    VLD1.8      D30,[r12]!                  @vld1q_u8(pu1_src[(ht - 1) * src_strd])
+    VLD1.8      D31,[r12]!                  @vld1q_u8(pu1_src[(ht - 1) * src_strd])
+    VCGT.U8     Q6,Q5,Q14                   @vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+    VST1.8      {Q15},[r3]!                 @vst1q_u8(pu1_src_top[col])
+    VCLT.U8     Q7,Q5,Q14                   @vcltq_u8(pu1_cur_row, pu1_top_row)
+
+    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         r11,r9                      @move ht to r11 for loop count
+
+PU1_SRC_LOOP:
+    ADD         r10,r10,r1                  @*pu1_src + src_strd
+    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r10,#8
+    ADD         r6,r10,r1                   @II Iteration *pu1_src + src_strd
+
+    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
+    VLD1.8      D30,[r6]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D31,[r6]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r6,#8
+
+    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         r10,r10,r1
+
+    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VMOVL.U8    Q13,D18                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
+    VMOVL.U8    Q14,D19                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+    VNEG.S8     Q8,Q10                      @sign_up = vnegq_s8(sign_down)
+    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+    VSUB.U8     Q14,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VTBL.8      D13,{D6},D13                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)
+
+
+    VUZP.8      D12,D13                     @de-interleave edge_idx into U (D12) and V (D13) lanes
+    VNEG.S8     Q8,Q14                      @II sign_up = vnegq_s8(sign_down)
+    VTBL.8      D12,{D7},D12                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VADD.I8     Q11,Q11,Q14                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VMOVL.U8    Q10,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VTBL.8      D13,{D8},D13                @offset = vtbl1_s8(offset_tbl_v, edge_idx_v)
+    VZIP.8      D12,D13                     @re-interleave the U and V offsets
+
+    VADDW.S8    Q10,Q10,D12                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VUZP.8      D22,D23                     @II de-interleave edge_idx into U and V lanes
+
+    VMOVL.U8    Q14,D11                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    @VTBL.8     D13,D7,D13                  @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMOV        Q5,Q15                      @II pu1_cur_row = pu1_next_row
+
+    VADDW.S8    Q14,Q14,D13                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VTBL.8      D24,{D7},D22                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    VTBL.8      D25,{D8},D23                @II offset = vtbl1_s8(offset_tbl_v, edge_idx_v)
+    VZIP.8      D24,D25                     @II re-interleave the U and V offsets
+    @VTBL.8     D24,D7,D22                  @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    @VTBL.8     D25,D7,D23                  @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMOVN.I16   D21,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    VMOVL.U8    Q14,D19                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+
+    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    VST1.8      {Q10},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    VMOVN.I16   D30,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUBS        r11,r11,#2                  @II Decrement the ht loop count by 2
+    VMOVN.I16   D31,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    VST1.8      {Q15},[r10],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BEQ         PU1_SRC_LOOP_END            @if 0 == pu1_avail[3] || 0 == pu1_avail[2], ht was decremented above
+    CMP         r11,#1                      @checking any residue remains
+    BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
+
+    ADD         r10,r10,r1                  @*pu1_src + src_strd
+    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r10,#8
+    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
+    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
+    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUB         r10,r10,r1
+
+    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D23,{D6},D23                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VUZP.8      D22,D23                     @de-interleave edge_idx into U and V lanes
+    VTBL.8      D24,{D7},D22                @offset_u = vtbl1_s8(offset_tbl_u, edge_idx_u)
+    VTBL.8      D25,{D8},D23                @offset_v = vtbl1_s8(offset_tbl_v, edge_idx_v)
+    VZIP.8      D24,D25                     @re-interleave the U and V offsets
+
+    @VTBL.8     D24,D7,D22                  @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMOVL.U8    Q13,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADDW.S8    Q13,Q13,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    @VTBL.8     D25,D7,D23                  @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMOVL.U8    Q14,D11                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VADDW.S8    Q14,Q14,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VMOVN.I16   D30,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+    VMOVN.I16   D31,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    VST1.8      {Q15},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+PU1_SRC_LOOP_END:
+    VMOV        Q5,Q9                       @pu1_cur_row = pu1_next_row
+    SUBS        r8,r8,#16                   @Decrement the wd loop count by 16
+    CMP         r8,#8                       @Check whether residue remains
+    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
+    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
+    BLT         END_LOOPS                   @Jump to end function
+
+
+WIDTH_RESIDUE:
+    LDRB        r4,[r5,#2]                  @pu1_avail[2]
+    CMP         r4,#0                       @0 == pu1_avail[2]
+    SUBEQ       r11,r0,r1                   @pu1_src -= src_strd
+    MOVNE       r11,r3                      @*pu1_src_top
+    MOV         r10,r0
+
+    VLD1.8      D28,[r11]!                  @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    VLD1.8      D29,[r11]!                  @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    VLD1.8      D10,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D11,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+
+    VLD1.8      D30,[r12]                   @vld1_u8(pu1_src[(ht - 1) * src_strd])
+    VST1.8      {D30},[r3]                  @vst1_u8(pu1_src_top[col])
+
+    VCGT.U8     Q6,Q5,Q14                   @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    VCLT.U8     Q7,Q5,Q14                   @vcltq_u8(pu1_cur_row, pu1_top_row)
+    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         r11,r9                      @move ht to r11 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+    ADD         r10,r10,r1                  @*pu1_src + src_strd
+    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r10,#8
+    ADD         r6,r10,r1                   @II Iteration *pu1_src + src_strd
+
+    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
+    VLD1.8      D30,[r6]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D31,[r6]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r6,#8
+
+    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         r10,r10,r1
+
+    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VMOVL.U8    Q13,D18                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
+    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+    VNEG.S8     Q8,Q10                      @sign_up = vnegq_s8(sign_down)
+    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VSUB.U8     Q10,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VUZP.8      D12,D13                     @de-interleave edge_idx into U and V lanes
+
+    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)
+    VTBL.8      D12,{D7},D12                @offset_u = vtbl1_s8(offset_tbl_u, edge_idx_u)
+    VNEG.S8     Q8,Q10                      @II sign_up = vnegq_s8(sign_down)
+
+    VTBL.8      D13,{D8},D13                @offset_v = vtbl1_s8(offset_tbl_v, edge_idx_v)
+    VZIP.8      D12,D13                     @re-interleave the U and V offsets
+
+    @VTBL.8     D12,D7,D12                  @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+    VADD.I8     Q11,Q11,Q10                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+    VMOVL.U8    Q10,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    VADDW.S8    Q10,Q10,D12                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VUZP.8      D22,D23                     @II de-interleave edge_idx into U and V lanes
+
+    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    VTBL.8      D24,{D7},D22                @II offset_u = vtbl1_s8(offset_tbl_u, edge_idx_u)
+    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VTBL.8      D25,{D8},D23                @II offset_v = vtbl1_s8(offset_tbl_v, edge_idx_v)
+    VZIP.8      D24,D25                     @II re-interleave the U and V offsets
+    @VTBL.8     D24,D7,D22                  @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOV        Q5,Q15                      @II pu1_cur_row = pu1_next_row
+    VST1.8      {D20},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    VMOVN.I16   D30,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SUBS        r11,r11,#2                  @Decrement the ht loop count by 2 (two rows per iteration)
+    VST1.8      {D30},[r10],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BEQ         END_LOOPS
+    CMP         r11,#1
+    BGT         PU1_SRC_LOOP_RESIDUE        @If more than one row remains, jump to PU1_SRC_LOOP_RESIDUE
+
+
+    ADD         r10,r10,r1                  @*pu1_src + src_strd
+    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r10,#8
+    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
+    VCGT.U8     Q7,Q9,Q5                    @vcltq_u8(pu1_cur_row, pu1_next_row)
+    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUB         r10,r10,r1
+
+    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+    VUZP.8      D22,D23
+    VTBL.8      D24,{D7},D22
+    VTBL.8      D25,{D8},D23
+    VZIP.8      D24,D25
+
+    @VTBL.8     D24,D7,D22                  @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMOVL.U8    Q13,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADDW.S8    Q13,Q13,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVN.I16   D30,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VST1.8      {D30},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+END_LOOPS:
+    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
+
+
+
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class2.s b/common/arm/ihevc_sao_edge_offset_class2.s
new file mode 100644
index 0000000..33b4961
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class2.s
@@ -0,0 +1,811 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_edge_offset_class2.s
+@*
+@* @brief
+@*  Contains function definitions for the sample adaptive offset (SAO) edge
+@* offset of class 2 (135-degree diagonal). Functions are coded using NEON
+@* instructions and can be compiled using ARM RVCT
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*  - ihevc_sao_edge_offset_class2_a9q()
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
+@                              WORD32 src_strd,
+@                              UWORD8 *pu1_src_left,
+@                              UWORD8 *pu1_src_top,
+@                              UWORD8 *pu1_src_top_left,
+@                              UWORD8 *pu1_src_top_right,
+@                              UWORD8 *pu1_src_bot_left,
+@                              UWORD8 *pu1_avail,
+@                              WORD8 *pi1_sao_offset,
+@                              WORD32 wd,
+@                              WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 =>  *pu1_src
+@r1 =>  src_strd
+@r2 =>  *pu1_src_left
+@r3 =>  *pu1_src_top
+@r4 =>  *pu1_src_top_left
+@r5 =>  *pu1_avail
+@r6 =>  *pi1_sao_offset
+@r7 =>  wd
+@r8=>   ht
+
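+@ A minimal C sketch (not part of the original source; SIGN, CLIP3 and the
+@ src pointer are illustrative names) of the per-pixel class 2 (135-degree
+@ diagonal) edge-offset rule that the NEON code below applies 16 pixels at
+@ a time:
+@
+@     for(row = 0; row < ht; row++)
+@         for(col = 0; col < wd; col++)
+@         {
+@             WORD32 e = 2 + SIGN(src[col] - src[col - 1 - src_strd])   /* sign_up   */
+@                          + SIGN(src[col] - src[col + 1 + src_strd]);  /* sign_down */
+@             e = gi1_table_edge_idx[e];          /* remap the raw index 0..4 */
+@             if(0 != e)
+@                 src[col] = CLIP3(src[col] + pi1_sao_offset[e], 0, 255);
+@         }
+@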
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class2_a9q
+
+gi1_table_edge_idx_addr_1:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+gi1_table_edge_idx_addr_2:
+.long gi1_table_edge_idx - ulbl2 - 8
+
+gi1_table_edge_idx_addr_3:
+.long gi1_table_edge_idx - ulbl3 - 8
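+@ Each literal above stores (symbol - ulblN - 8); adding the pc value read at
+@ the matching "add rX,rX,pc" (pc reads as that instruction's address + 8 in
+@ ARM state) leaves rX pointing at gi1_table_edge_idx, giving position
+@ independent access to the table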
+
+ihevc_sao_edge_offset_class2_a9q:
+
+
+    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
+    LDR         r7,[sp,#0x3C]               @Loads wd
+
+    LDR         r8,[sp,#0x40]               @Loads ht
+    SUB         r9,r7,#1                    @wd - 1
+
+    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
+    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
+
+    STR         r0,[sp,#0x2C]               @Store pu1_src in sp
+    MOV         r9,r7                       @Move width to r9 for loop count
+
+    STR         r2,[sp,#0x30]               @Store pu1_src_left in sp
+    LDR         r5,[sp,#0x34]               @Loads pu1_avail
+    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset
+    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
+
+    SUB         sp,sp,#0x94                 @Decrement the stack pointer to store some temp arr values
+
+    STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
+    SUB         r10,r8,#1                   @ht-1
+    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
+    ADD         r12,sp,#0x02                @temp array
+
+AU1_SRC_TOP_LOOP:
+    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
+    SUBS        r9,r9,#8                    @Decrement the loop count by 8
+    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+    BNE         AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_4_LOOP:
+    LDRB        r10,[r5,#4]                 @pu1_avail[4]
+    CMP         r10,#0
+    LDRB        r9,[r0]                     @u1_pos_0_0_tmp = pu1_src[0]
+    BEQ         PU1_AVAIL_7_LOOP
+
+    LDRB        r11,[r4]                    @pu1_src_top_left[0]
+    ADD         r14,r0,r1                   @pu1_src + src_strd
+
+    SUBS        r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
+    LDRB        r4,[r14,#1]                 @pu1_src[1 + src_strd]
+
+    MVNLT       r12,#0
+    MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])
+
+    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
+ulbl1:
+    add         r14,r14,pc
+    SUBS        r11,r9,r4                   @pu1_src[0] - pu1_src[1 + src_strd]
+
+    MVNLT       r11,#0
+    MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[1 + src_strd])
+    ADD         r4,r12,r11                  @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[1 + src_strd])
+    ADD         r4,r4,#2                    @edge_idx
+
+    LDRSB       r12,[r14,r4]                @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r12,#0                      @0 != edge_idx
+    BEQ         PU1_AVAIL_7_LOOP
+    LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
+    ADD         r9,r9,r10                   @pu1_src[0] + pi1_sao_offset[edge_idx]
+    USAT        r9,#8,r9                    @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP:
+    LDRB        r14,[r5,#7]                 @pu1_avail[7]
+    CMP         r14,#0
+    SUB         r10,r7,#1                   @wd - 1
+    SUB         r11,r8,#1                   @ht - 1
+    MLA         r12,r11,r1,r10              @wd - 1 + (ht - 1) * src_strd
+    ADD         r12,r12,r0                  @pu1_src[wd - 1 + (ht - 1) * src_strd]
+    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
+    BEQ         PU1_AVAIL
+
+    SUB         r4,r12,r1                   @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
+    LDRB        r11,[r4,#-1]                @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
+    ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
+
+    SUBS        r11,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd]
+    LDRB        r4,[r14,#1]                 @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
+
+    MVNLT       r11,#0
+    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd])
+
+    SUBS        r4,r10,r4                   @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
+    MVNLT       r4,#0
+    MOVGT       r4,#1                       @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
+
+    ADD         r11,r11,r4                  @Add 2 sign value
+    ADD         r11,r11,#2                  @edge_idx
+    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
+ulbl2:
+    add         r14,r14,pc
+
+    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r12,#0
+    BEQ         PU1_AVAIL
+    LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
+    ADD         r10,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL:
+    MOV         r12,r8                      @Move ht
+    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
+    LDRB        r11,[r5,#3]                 @pu1_avail[3]
+
+    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
+    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
+    CMP         r11,#0
+
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    SUBEQ       r12,r12,#1                  @ht_tmp--
+
+    CMP         r5,#0
+    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
+    LDR         r11, gi1_table_edge_idx_addr_3 @table pointer
+ulbl3:
+    add         r11,r11,pc
+
+    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
+    VLD1.8      D6,[r11]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    SUBEQ       r12,r12,#1                  @ht_tmp--
+
+    MOV         r6,r7                       @move wd to r6 loop_count
+    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
+    ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1
+
+    STR         r0,[sp,#0x90]               @Store pu1_src in sp
+    CMP         r7,#16                      @Compare wd with 16
+
+    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+    CMP         r8,#4                       @Compare ht with 4
+    BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+    LDR         r7,[sp,#0xD0]               @Loads wd
+
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    CMP         r6,r7                       @col == wd
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+    MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+    CMP         r6,#16                      @if(col == 16)
+    BNE         SKIP_AU1_MASK_VAL
+    LDRB        r8,[r5,#1]                  @pu1_avail[1]
+    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
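+    @au1_mask keeps lanes 0/15 live only when the left/right neighbours are
+    @available; the VAND below forces masked lanes of edge_idx to 0 which,
+    @assuming pi1_sao_offset[0] == 0 (SAO applies no offset for a plateau),
+    @leaves those border pixels unchanged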
+
+SKIP_AU1_MASK_VAL:
+    LDRB        r11,[r5,#2]                 @pu1_avail[2]
+    CMP         r11,#0
+
+    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
+    MOVNE       r8,r3                       @pu1_src_top_cpy
+    SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
+
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+    SUB         r8,#8
+    ADD         r3,r3,#16
+
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+    LDR         r4,[sp,#0xD4]               @Loads ht
+
+    SUB         r7,r7,r6                    @(wd - col)
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
+
+    ADD         r7,r7,#15                   @15 + (wd - col)
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
+
+    SUB         r5,r5,#1
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+AU1_SRC_LEFT_LOOP:
+    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
+    STRB        r8,[r5,#1]!                 @store it in the stack pointer
+    SUBS        r4,r4,#1                    @decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP
+
+    ADD         r8,r0,r1                    @I Iteration *pu1_src + src_strd
+    VMOV.I8     Q9,#0
+    LDR         r4,[sp,#0xC8]               @I Loads pu1_avail
+
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+    LDRB        r4,[r4,#2]                  @I pu1_avail[2]
+
+    LDRB        r5,[r8,#16]                 @I pu1_src_cpy[src_strd + 16]
+    VMOV.8      D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+
+    VEXT.8      Q9,Q8,Q9,#1                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+    CMP         r4,#0                       @I
+    BNE         SIGN_UP_CHANGE_DONE         @I
+
+SIGN_UP_CHANGE:
+    SUB         r2,r12,r7                   @I ht_tmp - row
+    LDRB        r11,[r0]                    @I pu1_src_cpy[0]
+    ADD         r2,r14,r2                   @I pu1_src_left_cpy[ht_tmp - row]
+
+    LDRB        r5,[r2,#-1]                 @I load the value
+    SUBS        r4,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+    MVNLT       r4,#0                       @I
+    MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    VMOV.8      D14[0],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE:
+    VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VADD.I8     Q12,Q0,Q7                   @I edge_idx = vaddq_s8(const_2, sign_up)
+
+    VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q12,Q12,Q5                  @I edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D18,{D6},D24                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D19,{D6},D25                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
+    VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VEXT.8      Q7,Q7,Q7,#15                @I sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
+
+    VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
+
+    VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
+
+PU1_SRC_LOOP:
+
+    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    ADD         r8,r0,r1                    @II iteration *pu1_src + src_strd
+
+    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+    ADD         r11,r8,r1                   @III iteration *pu1_src + src_strd
+
+    LDRB        r5,[r8,#16]                 @II pu1_src_cpy[src_strd + 16]
+    VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r11,#8
+    LDRB        r4,[r0]                     @II pu1_src_cpy[0]
+
+    LDRB        r8,[r11,#16]                @III pu1_src_cpy[src_strd + 16]
+    VMOV.8      D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+
+    SUB         r5,r12,r7                   @II ht_tmp - row
+    VEXT.8      Q11,Q8,Q14,#1               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+    ADD         r5,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
+
+    LDRB        r5,[r5,#-1]                 @II load the value
+    VMOV.8      D18[0],r8                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
+
+    SUBS        r4,r4,r5                    @II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+    VEXT.8      Q9,Q15,Q9,#1                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+    LDRB        r2,[r0,r1]                  @III pu1_src_cpy[0]
+
+    VCGT.U8     Q12,Q6,Q11                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         r5,r12,r7                   @III ht_tmp - row
+
+    MVNLT       r4,#0                       @II
+    VCLT.U8     Q11,Q6,Q11                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    ADD         r5,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
+
+    MOVGT       r4,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    VSUB.U8     Q12,Q11,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    LDRB        r5,[r5,#-1]                 @III load the value
+
+    SUBS        r2,r2,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+    VMOV.8      D14[0],r4                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+    MVNLT       r2,#0                       @III
+    VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+
+    VADD.I8     Q11,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q11,Q11,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
+
+    VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VEXT.8      Q7,Q7,Q7,#15                @II sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    VAND        Q11,Q11,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
+    VMOV.8      D14[0],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
+    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VMOVL.U8    Q13,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)
+
+    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VEXT.8      Q7,Q7,Q7,#15                @III sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
+    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VMOVL.U8    Q14,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
+    VMOVN.I16   D26,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VMOVN.I16   D27,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
+    VADDW.S8    Q9,Q9,D11                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
+    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    CMP         r7,#1                       @III
+
+    VST1.8      {Q13},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    BGT         PU1_SRC_LOOP                @III If more than one row remains, jump to PU1_SRC_LOOP
+    BLT         INNER_LOOP_DONE
+
+    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    ADD         r8,r0,r1                    @*pu1_src + src_strd
+
+    LDRB        r2,[r0]                     @pu1_src_cpy[0]
+    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+    LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
+
+    SUB         r11,r12,r7                  @ht_tmp - row
+    VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    ADD         r11,r14,r11                 @pu1_src_left_cpy[ht_tmp - row]
+
+    LDRB        r5,[r11,#-1]                @load the value
+    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+    SUBS        r4,r2,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+
+    VCGT.U8     Q5,Q6,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    MVNLT       r4,#0
+
+    MOVGT       r4,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    VCLT.U8     Q9,Q6,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    VMOV.8      D14[0],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+    VSUB.U8     Q5,Q9,Q5                    @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q9,Q9,Q5                    @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VTBL.8      D18,{D6},D18                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VNEG.S8     Q7,Q5                       @sign_up = vnegq_s8(sign_down)
+
+    VTBL.8      D19,{D6},D19                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VTBL.8      D10,{D7},D18                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VTBL.8      D11,{D7},D19                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VADDW.S8    Q10,Q10,D10                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMOVL.U8    Q6,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    VADDW.S8    Q6,Q6,D11                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    VMAX.S16    Q6,Q6,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VMIN.U16    Q6,Q6,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    VMOVN.I16   D21,Q6                      @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+
+INNER_LOOP_DONE:
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
+
+    LDR         r8,[sp,#0xD4]               @Loads ht
+    SUB         r5,r5,#1
+
+    SUB         r2,r2,#1
+SRC_LEFT_LOOP:
+    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
+    SUBS        r8,r8,#1
+    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP
+
+    SUB         r6,r6,#16                   @Decrement the wd loop count by 16
+    CMP         r6,#8                       @Check whether residue remains
+    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    LDR         r0,[sp,#0x90]               @Loads *pu1_src
+    SUB         r7,r7,r6
+    ADD         r0,r0,r7
+    BGT         WIDTH_LOOP_16               @If more than 8 columns remain, jump to WIDTH_LOOP_16
+    BEQ         WIDTH_RESIDUE               @If exactly 8 columns remain, jump to WIDTH_RESIDUE
+
+
+WD_16_HT_4_LOOP:
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    CMP         r6,r7                       @col == wd
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+    MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+    CMP         r6,#16                      @if(col == 16)
+    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
+    LDRB        r8,[r5,#1]                  @pu1_avail[1]
+    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+    LDRB        r8,[r5,#2]                  @pu1_avail[2]
+    CMP         r8,#0
+
+    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
+    MOVNE       r8,r3
+    SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
+
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+    SUB         r8,#8
+    ADD         r3,r3,#16
+
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+    LDR         r4,[sp,#0xD4]               @Loads ht
+
+    SUB         r7,r7,r6                    @(wd - col)
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
+
+    ADD         r7,r7,#15                   @15 + (wd - col)
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
+
+    SUB         r5,r5,#1
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
+    SUBS        r4,r4,#1                    @decrement the loop count
+    STRB        r8,[r5,#1]!                 @store it in the stack pointer
+    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+    VMOV.I8     Q9,#0
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+    ADD         r8,r0,r1                    @*pu1_src + src_strd
+    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+
+    LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
+    VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+
+    CMP         r7,r12
+    BLT         SIGN_UP_CHANGE_WD_16_HT_4
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+    CMP         r5,#0
+    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+    LDRB        r8,[r0]                     @pu1_src_cpy[0]
+    SUB         r5,r12,r7                   @ht_tmp - row
+    ADD         r5,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
+    LDRB        r5,[r5,#-1]                 @load the value
+    SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+    MVNLT       r8,#0
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
+    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
+    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
+    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+    LDR         r8,[sp,#0xD4]               @Loads ht
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
+    SUB         r5,r5,#1
+    SUB         r2,r2,#1
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
+    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
+    SUBS        r8,r8,#1
+    BNE         SRC_LEFT_LOOP_WD_16_HT_4
+
+    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
+    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
+
+
+WIDTH_RESIDUE:
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    CMP         r6,r7                       @wd_residue == wd
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+
+    MOVNE       r8,#-1
+    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    LDRB        r8,[r5,#1]                  @pu1_avail[1]
+    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+PU1_AVAIL_2_RESIDUE:
+    LDRB        r11,[r5,#2]                 @pu1_avail[2]
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+    CMP         r11,#0
+
+    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
+    MOVNE       r8,r3
+
+    SUB         r8,r8,#1
+
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
+    VLD1.8      D11,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
+    LDR         r7,[sp,#0xD0]               @Loads wd
+
+    LDR         r4,[sp,#0xD4]               @Loads ht
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    SUB         r7,r7,#1                    @(wd - 1)
+
+    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+    SUB         r5,r5,#1
+
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
+    SUBS        r4,r4,#1                    @decrement the loop count
+    STRB        r8,[r5,#1]!                 @store it in the stack pointer
+    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
+
+
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_RESIDUE:
+    VMOV.I8     Q9,#0
+    ADD         r8,r0,r1                    @*pu1_src + src_strd
+    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+
+    LDRB        r8,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
+    VMOV.8      d18[0],r8                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+
+    CMP         r7,r12
+    BLT         SIGN_UP_CHANGE_RESIDUE
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+    CMP         r5,#0
+    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+    LDRB        r8,[r0]                     @pu1_src_cpy[0]
+    SUB         r5,r12,r7                   @ht_tmp - row
+
+    ADD         r5,r14,r5
+    LDRB        r5,[r5,#-1]                 @load the value
+    SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+    MVNLT       r8,#0
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
+    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
+    SUBS        r7,r7,#1
+    BNE         PU1_SRC_LOOP_RESIDUE
+
+    LDR         r8,[sp,#0xD4]               @Loads ht
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+
+    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
+    SUB         r5,r5,#1
+
+    SUB         r2,r2,#1
+
+SRC_LEFT_LOOP_RESIDUE:
+    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
+    SUBS        r8,r8,#1
+    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+    LDR         r8,[sp,#0xD4]               @Loads ht
+    LDR         r7,[sp,#0xD0]               @Loads wd
+
+    LDR         r0,[sp,#0xC0]               @Loads *pu1_src
+    SUB         r8,r8,#1                    @ht - 1
+
+    MLA         r6,r8,r1,r7                 @wd + (ht - 1) * src_strd (store below uses #-1 for wd - 1)
+    STRB        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp
+
+    LDR         r4,[sp,#0xBC]               @Loads pu1_src_top_left
+    ADD         r6,r0,r6                    @pu1_src[wd + (ht - 1) * src_strd]
+
+    ADD         r12,sp,#0x02
+    STRB        r10,[r6,#-1]                @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
+
+    LDRB        r11,[sp]                    @load u1_src_top_left_tmp from stack pointer
+    LDR         r3,[sp,#0xCC]               @Loads pu1_src_top
+
+    STRB        r11,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
+
+SRC_TOP_LOOP:
+    VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
+    SUBS        r7,r7,#8                    @Decrement the width
+    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
+    BNE         SRC_TOP_LOOP
+
+END_LOOPS:
+    ADD         sp,sp,#0x94
+    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class2_chroma.s b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
new file mode 100644
index 0000000..c6fb391
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
@@ -0,0 +1,1001 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_edge_offset_class2_chroma.s
+@*
+@* @brief
+@*  Contains function definitions for the sample adaptive offset (SAO) edge
+@* offset of class 2 (135-degree diagonal) for interleaved chroma. Functions
+@* are coded using NEON instructions and can be compiled using ARM RVCT
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*  - ihevc_sao_edge_offset_class2_chroma_a9q()
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
+@                              WORD32 src_strd,
+@                              UWORD8 *pu1_src_left,
+@                              UWORD8 *pu1_src_top,
+@                              UWORD8 *pu1_src_top_left,
+@                              UWORD8 *pu1_src_top_right,
+@                              UWORD8 *pu1_src_bot_left,
+@                              UWORD8 *pu1_avail,
+@                              WORD8 *pi1_sao_offset_u,
+@                              WORD8 *pi1_sao_offset_v,
+@                              WORD32 wd,
+@                              WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 =>  *pu1_src
+@r1 =>  src_strd
+@r2 =>  *pu1_src_left
+@r3 =>  *pu1_src_top
+@r4 =>  *pu1_src_top_left
+@r5 =>  *pu1_avail
+@r6 =>  *pi1_sao_offset_u
+@r9 =>  *pi1_sao_offset_v
+@r7 =>  wd
+@r8=>   ht
+
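+@ As a rough C sketch (illustrative only, not from the original source), the
+@ chroma variant applies the same class 2 rule to interleaved CbCr samples,
+@ stepping 2 bytes per pixel and picking the U or V offset table per plane:
+@
+@     for(row = 0; row < ht; row++)
+@         for(col = 0; col < wd; col += 2)
+@         {
+@             WORD32 e = 2 + SIGN(src[col] - src[col - 2 - src_strd])
+@                          + SIGN(src[col] - src[col + 2 + src_strd]);
+@             e = gi1_table_edge_idx[e];
+@             if(0 != e)
+@                 src[col] = CLIP3(src[col] + pi1_sao_offset_u[e], 0, 255);
+@             /* V sample: same computation at col + 1 using pi1_sao_offset_v */
+@         }
+@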
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class2_chroma_a9q
+
+gi1_table_edge_idx_addr_1:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+gi1_table_edge_idx_addr_2:
+.long gi1_table_edge_idx - ulbl2 - 8
+
+gi1_table_edge_idx_addr_3:
+.long gi1_table_edge_idx - ulbl3 - 8
+
+gi1_table_edge_idx_addr_4:
+.long gi1_table_edge_idx - ulbl4 - 8
+
+gi1_table_edge_idx_addr_5:
+.long gi1_table_edge_idx - ulbl5 - 8
+
+ihevc_sao_edge_offset_class2_chroma_a9q:
+
+
+    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
+
+    LDR         r7,[sp,#0x40]               @Loads wd
+    LDR         r8,[sp,#0x44]               @Loads ht
+    SUB         r9,r7,#2                    @wd - 2
+
+    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
+    LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]
+
+    STR         r0,[sp,#0x2C]               @Store pu1_src in sp
+    MOV         r9,r7                       @Move width to r9 for loop count
+
+    STR         r2,[sp,#0x30]               @Store pu1_src_left in sp
+    LDR         r5,[sp,#0x34]               @Loads pu1_avail
+    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset_u
+
+    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
+    SUB         sp,sp,#0xD4                 @Decrement the stack pointer to store some temp arr values
+
+    STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
+    SUB         r10,r8,#1                   @ht-1
+    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
+    ADD         r12,sp,#10                  @temp array
+
+AU1_SRC_TOP_LOOP:
+    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
+    SUBS        r9,r9,#8                    @Decrement the loop count by 8
+    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+    BNE         AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_4_LOOP_U:
+    LDRB        r9,[r5,#4]                  @pu1_avail[4]
+    CMP         r9,#0
+    LDRB        r9,[r0]                     @u1_pos_0_0_tmp_u = pu1_src[0]
+    LDRB        r10,[r0,#1]                 @u1_pos_0_0_tmp_v = pu1_src[1]
+    BEQ         PU1_AVAIL_7_LOOP_U
+
+    LDRB        r11,[r4]                    @pu1_src_top_left[0]
+    ADD         r14,r0,r1                   @pu1_src + src_strd
+
+    SUB         r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
+
+    LDRB        r14,[r14,#2]                @pu1_src[2 + src_strd]
+    CMP         r12,#0
+
+    MVNLT       r12,#0
+    SUB         r11,r9,r14                  @pu1_src[0] - pu1_src[2 + src_strd]
+
+    MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])
+
+    CMP         r11,#0
+    MVNLT       r11,#0
+    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
+ulbl1:
+    add         r14,r14,pc
+    MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[2 + src_strd])
+
+    ADD         r11,r12,r11                 @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[2 + src_strd])
+    ADD         r11,r11,#2                  @edge_idx
+
+    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r12,#0                      @0 != edge_idx
+    BEQ         PU1_AVAIL_4_LOOP_V
+    LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
+    ADD         r9,r9,r11                   @pu1_src[0] + pi1_sao_offset_u[edge_idx]
+    USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_4_LOOP_V:
+
+    LDRB        r11,[r4,#1]                 @pu1_src_top_left[1]
+    ADD         r14,r0,r1                   @pu1_src + src_strd
+
+    SUB         r12,r10,r11                 @pu1_src[1] - pu1_src_top_left[1]
+    LDRB        r14,[r14,#3]                @pu1_src[3 + src_strd]
+
+    CMP         r12,#0
+    MVNLT       r12,#0
+    SUB         r11,r10,r14                 @pu1_src[1] - pu1_src[3 + src_strd]
+    MOVGT       r12,#1                      @SIGN(pu1_src[1] - pu1_src_top_left[1])
+
+    CMP         r11,#0
+    MVNLT       r11,#0
+    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
+ulbl2:
+    add         r14,r14,pc
+    MOVGT       r11,#1                      @SIGN(pu1_src[1] - pu1_src[3 + src_strd])
+
+    ADD         r11,r12,r11                 @SIGN(pu1_src[1] - pu1_src_top_left[1]) +  SIGN(pu1_src[1] - pu1_src[3 + src_strd])
+    ADD         r11,r11,#2                  @edge_idx
+
+    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r12,#0                      @0 != edge_idx
+    BEQ         PU1_AVAIL_7_LOOP_U
+    LDR         r11,[sp,#0x110]             @Loads pi1_sao_offset_v
+    LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
+    ADD         r10,r10,r11                 @pu1_src[1] + pi1_sao_offset_v[edge_idx]
+    USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP_U:
+    STRB        r10,[sp,#7]
+    STRB        r9,[sp,#6]
+
+    LDRB        r10,[r5,#7]                 @pu1_avail[7]
+    CMP         r10,#0
+    SUB         r10,r7,#2                   @wd - 2
+    SUB         r11,r8,#1                   @ht - 1
+    MLA         r12,r11,r1,r10              @wd - 2 + (ht - 1) * src_strd
+    ADD         r12,r12,r0                  @pu1_src[wd - 2 + (ht - 1) * src_strd]
+    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
+    LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[wd - 2 + (ht - 1) * src_strd]
+    BEQ         PU1_AVAIL_3_LOOP
+
+    SUB         r11,r12,r1                  @pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
+    SUB         r11,r11,#2                  @pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
+    LDRB        r11,[r11]                   @Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
+    SUB         r11,r10,r11                 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
+    CMP         r11,#0
+    MVNLT       r11,#0
+    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])
+
+    ADD         r14,r12,r1                  @pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
+    ADD         r14,r14,#2                  @pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+    LDRB        r14,[r14]                   @Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+    SUB         r14,r10,r14                 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+    CMP         r14,#0
+    MVNLT       r14,#0
+    MOVGT       r14,#1                      @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])
+
+    ADD         r11,r11,r14                 @Add 2 sign value
+    ADD         r11,r11,#2                  @edge_idx
+    LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
+ulbl3:
+    add         r14,r14,pc
+
+    LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r14,#0
+    BEQ         PU1_AVAIL_7_LOOP_V
+    LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
+    ADD         r10,r10,r11                 @pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP_V:
+    ADD         r12,r12,#1
+    SUB         r11,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
+    SUB         r11,r11,#2                  @pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
+    LDRB        r11,[r11]                   @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
+    SUB         r11,r9,r11                  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
+    CMP         r11,#0
+    MVNLT       r11,#0
+    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])
+
+    ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
+    ADD         r14,r14,#2                  @pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+    LDRB        r14,[r14]                   @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+    SUB         r14,r9,r14                  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+    CMP         r14,#0
+    MVNLT       r14,#0
+    MOVGT       r14,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd])
+
+    ADD         r11,r11,r14                 @Add 2 sign value
+    ADD         r11,r11,#2                  @edge_idx
+    LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
+ulbl4:
+    add         r14,r14,pc
+
+    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r12,#0
+    BEQ         PU1_AVAIL_3_LOOP
+    LDR         r14,[sp,#0x110]             @Loads pi1_sao_offset_v
+    LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
+    ADD         r9,r9,r11                   @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx]
+    USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_3_LOOP:
+    STRB        r10,[sp,#8]
+    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
+    STRB        r9,[sp,#9]
+
+    MOV         r12,r8                      @Move ht
+    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
+    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
+
+    LDRB        r11,[r5,#3]                 @pu1_avail[3]
+    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    CMP         r11,#0
+
+    SUBEQ       r12,r12,#1                  @ht_tmp--
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+
+    CMP         r5,#0
+
+    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
+    VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+    SUBEQ       r12,r12,#1                  @ht_tmp--
+
+    LDR         r6,[sp,#0x110]              @Loads pi1_sao_offset_v
+    ADDEQ       r14,r14,#2                  @pu1_src_left_cpy += 2
+
+    STR         r0,[sp,#2]                  @Store pu1_src in sp
+    VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+    LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
+ulbl5:
+    add         r2,r2,pc
+
+    MOV         r6,r7                       @move wd to r6 loop_count
+    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
+    CMP         r7,#16                      @Compare wd with 16
+
+    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, where the 8-wide case is handled
+    CMP         r8,#4                       @Compare ht with 4
+    BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+    LDR         r7,[sp,#0x114]              @Loads wd
+    CMP         r6,r7                       @col == wd
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+
+    MOVNE       r8,#-1
+    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    CMP         r6,#16                      @if(col == 16)
+    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+    BNE         SKIP_AU1_MASK_VAL
+    LDRB        r8,[r5,#1]                  @pu1_avail[1]
+    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+    LDRB        r9,[r5,#2]                  @pu1_avail[2]
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+    CMP         r9,#0
+
+    LDR         r4,[sp,#0x118]              @Loads ht
+    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
+
+    LDR         r7,[sp,#0x114]              @Loads wd
+    MOVNE       r8,r3                       @pu1_src_top_cpy
+
+    SUB         r8,r8,#2                    @pu1_src - src_strd - 2
+    ADD         r3,r3,#16
+
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+    SUB         r8,#8
+    SUB         r7,r7,r6                    @(wd - col)
+
+    ADD         r7,r7,#14                   @14 + (wd - col)
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    LDR         r8,[sp,#0x100]              @Loads *pu1_src
+
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+
+AU1_SRC_LEFT_LOOP:
+    LDRH        r8,[r7]                     @load the value and increment by src_strd
+    SUBS        r4,r4,#1                    @decrement the loop count
+
+    STRH        r8,[r5],#2                  @store it in the stack pointer
+    ADD         r7,r7,r1
+
+    BNE         AU1_SRC_LEFT_LOOP
+
+    ADD         r8,r0,r1                    @I *pu1_src + src_strd
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+
+    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+
+    ADD         r8,r8,#16                   @I
+    VMOV.I8     Q9,#0
+    LDRH        r5,[r8]                     @I pu1_src_cpy[src_strd + 16]
+
+    LDR         r10,[sp,#0x108]             @I Loads pu1_avail
+    VMOV.16     D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    LDRB        r10,[r10,#2]                @I pu1_avail[2]
+
+    CMP         r10,#0                      @I
+    VEXT.8      Q9,Q8,Q9,#2                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+    BNE         SIGN_UP_CHANGE_DONE         @I
+
+    LDRB        r11,[r0]                    @I pu1_src_cpy[0]
+    SUB         r4,r12,r7                   @I ht_tmp - row
+
+    LDRB        r10,[r0,#1]                 @I pu1_src_cpy[1]
+    LSL         r4,r4,#1                    @I (ht_tmp - row) * 2
+
+    ADD         r9,r14,r4                   @I pu1_src_left_cpy[(ht_tmp - row) * 2]
+    LDRB        r5,[r9,#-2]                 @I load the value
+
+    SUB         r8,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+    LDRB        r5,[r9,#-1]                 @I load the value
+
+    CMP         r8,#0                       @I
+    SUB         r4,r10,r5                   @I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+
+    MVNLT       r8,#0                       @I
+    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+
+    CMP         r4,#0                       @I
+    VMOV.8      D14[0],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+    MVNLT       r4,#0                       @I
+
+    MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+    VMOV.8      D14[1],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE:
+    VLD1.8      D30,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VTBL.8      D18,{D30},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)
+
+    VTBL.8      D19,{D30},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VEXT.8      Q7,Q7,Q7,#14                @I sign_up = vextq_s8(sign_up, sign_up, 14)
+
+    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VAND        Q11,Q9,Q4                   @I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VUZP.8      D22,D23                     @I
+
+    VTBL.8      D22,{D6},D22                @I
+    VTBL.8      D23,{D7},D23                @I
+    VZIP.8      D22,D23                     @I
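+    @The VUZP/VTBL/VZIP trio above de-interleaves the classified bytes
+    @into U and V lanes, applies the per-plane offset tables D6/D7, and
+    @re-interleaves the offsets for the widening add below.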
+
+    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
+    VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
+
+
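+@ The loop below is software-pipelined across three rows: each pass stores
+@ row I, classifies row II and starts row III, with the @I/@II/@III tags
+@ marking which row an instruction serves.  Per 16-byte vector the
+@ classification is, loosely, in intrinsics form (a sketch mirroring the
+@ inline comments, not the exact C model):
+@     sign_down = (cur < next) - (cur > next);
+@     edge_idx  = vtbl(gi1_table_edge_idx, 2 + sign_up + sign_down);
+@     sign_up   = vext(-sign_down, -sign_down, 14);    //reused for next row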
+PU1_SRC_LOOP:
+    ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
+    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
+    ADD         r11,r8,r1                   @III *pu1_src + src_strd
+
+    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+    VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r11,#8
+
+    ADD         r8,r8,#16                   @II
+    VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
+    LDRH        r5,[r8]                     @II pu1_src_cpy[src_strd + 16]
+
+    ADD         r11,r11,#16                 @III
+    VMOV.16     D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    LDRH        r4,[r11]                    @III pu1_src_cpy[src_strd + 16]
+
+    LDRB        r8,[r0,r1]                  @II pu1_src_cpy[0]
+    VEXT.8      Q14,Q8,Q14,#2               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+    SUB         r5,r12,r7                   @II ht_tmp - row
+
+    LSL         r5,r5,#1                    @II (ht_tmp - row) * 2
+    VMOV.16     D18[0],r4                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    ADD         r9,r14,r5                   @II pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    LDRB        r11,[r9,#-2]                @II load the value
+    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    SUB         r8,r8,r11                   @II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+    CMP         r8,#0                       @II
+    VEXT.8      Q9,Q15,Q9,#2                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+    LDRB        r11,[r0,#1]                 @II pu1_src_cpy[1]
+
+    MVNLT       r8,#0                       @II
+    VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+
+    LDRB        r5,[r9,#-1]                 @II load the value
+    VMOV.8      D14[0],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
+
+    SUB         r11,r11,r5                  @II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+    VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    CMP         r11,#0                      @II
+
+    MVNLT       r11,#0                      @II
+    VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+
+    LDRB        r4,[r0,r1]                  @III pu1_src_cpy[0]
+    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    SUB         r5,r12,r7                   @III ht_tmp - row
+
+    ADD         r10,r0,r1                   @III pu1_src_cpy
+    VMOV.8      D14[1],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+    LSL         r5,r5,#1                    @III (ht_tmp - row) * 2
+
+    ADD         r9,r14,r5                   @III pu1_src_left_cpy[(ht_tmp - row) * 2]
+    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
+    LDRB        r10,[r10,#1]                @III pu1_src_cpy[1]
+
+    LDRB        r5,[r9,#-2]                 @III load the value
+    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+    SUB         r4,r4,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+    CMP         r4,#0                       @III
+    LDRB        r9,[r9,#-1]                 @III load the value
+    VTBL.8      D26,{D22},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
+
+    MVNLT       r4,#0                       @III
+    SUB         r10,r10,r9                  @III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+    VTBL.8      D27,{D22},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VEXT.8      Q7,Q7,Q7,#14                @II sign_up = vextq_s8(sign_up, sign_up, 14)
+
+    MOVGT       r4,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
+    CMP         r10,#0                      @III
+
+    VUZP.8      D26,D27                     @II
+    VMOV.8      d14[0],r4                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+    MVNLT       r10,#0                      @III
+    MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+    VTBL.8      D24,{D6},D26                @II
+    VCGT.U8     Q10,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    VCLT.U8     Q11,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VTBL.8      D25,{D7},D27                @II
+    VSUB.U8     Q11,Q11,Q10                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VMOV.8      D14[1],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+    VZIP.8      D24,D25                     @II
+
+    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
+
+    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)
+    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)
+
+    VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VEXT.8      Q7,Q7,Q7,#14                @III sign_up = vextq_s8(sign_up, sign_up, 14)
+
+    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VUZP.8      D18,D19                     @III
+    VTBL.8      D22,{D6},D18                @III
+    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
+    VTBL.8      D23,{D7},D19                @III
+    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VZIP.8      D22,D23                     @III
+    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
+    VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
+    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    CMP         r7,#1
+
+    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    BGT         PU1_SRC_LOOP                @If more than one row remains, jump to PU1_SRC_LOOP
+    BLT         INNER_LOOP_DONE
+
+    ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
+    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    LDRB        r11,[r0,r1]                 @pu1_src_cpy[0]
+    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+    SUB         r4,r12,r7                   @ht_tmp - row
+
+    ADD         r8,r8,#16
+    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
+    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
+
+    LSL         r4,r4,#1                    @(ht_tmp - row) * 2
+    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    ADD         r9,r14,r4                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    LDRB        r5,[r9,#-2]                 @load the value
+    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+    SUB         r8,r11,r5                   @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+    CMP         r8,#0
+    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    MVNLT       r8,#0
+
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+    VLD1.8      D30,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+
+    LDRB        r11,[r0,#1]                 @pu1_src_cpy[1]
+    VMOV.8      D14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+    LDRB        r5,[r9,#-1]                 @load the value
+
+    SUB         r4,r11,r5                   @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    CMP         r4,#0
+
+    MVNLT       r4,#0
+    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    MOVGT       r4,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+
+    VMOV.8      D14[1],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VTBL.8      D26,{D30},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D27,{D30},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VUZP.8      D26,D27
+
+    VTBL.8      D24,{D6},D26
+    VTBL.8      D25,{D7},D27
+    VZIP.8      D24,D25
+
+    VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VADDW.S8    Q9,Q9,D25                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+INNER_LOOP_DONE:
+    LDR         r8,[sp,#0x118]              @Loads ht
+    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+
+    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
+    VMOVN.I16   D21,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+
+SRC_LEFT_LOOP:
+    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
+    SUBS        r8,r8,#2
+    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP
+
+    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
+    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    CMP         r6,#8                       @Check whether residue remains
+
+    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    LDR         r7,[sp,#0x114]              @Loads wd
+    LDR         r0,[sp,#0x02]               @Loads *pu1_src
+    SUB         r7,r7,r6
+    ADD         r0,r0,r7
+    BGT         WIDTH_LOOP_16               @If more than 8 columns remain, jump to WIDTH_LOOP_16
+    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
+
+
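+@ Three paths cover a block: WIDTH_LOOP_16 above handles 16-wide columns of
+@ taller blocks, WD_16_HT_4_LOOP below handles 16-wide columns when ht <= 4,
+@ and WIDTH_RESIDUE handles a final 8-wide column (4 chroma pairs).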
+WD_16_HT_4_LOOP:
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+    LDR         r7,[sp,#0x114]              @Loads wd
+    CMP         r6,r7                       @col == wd
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+
+    MOVNE       r8,#-1
+    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+    CMP         r6,#16                      @if(col == 16)
+    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
+    LDRB        r8,[r5,#1]                  @pu1_avail[1]
+    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+    LDRB        r8,[r5,#2]                  @pu1_avail[2]
+    CMP         r8,#0
+
+    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
+    MOVNE       r8,r3                       @pu1_src_top_cpy
+    SUB         r8,r8,#2                    @pu1_src - src_strd - 2
+    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+    SUB         r8,#8
+
+    ADD         r3,r3,#16
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+    LDR         r4,[sp,#0x118]              @Loads ht
+    LDR         r7,[sp,#0x114]              @Loads wd
+    SUB         r7,r7,r6                    @(wd - col)
+    ADD         r7,r7,#14                   @14 + (wd - col)
+    LDR         r8,[sp,#0x100]              @Loads *pu1_src
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+    LDRH        r8,[r7]                     @load the value and increment by src_strd
+    STRH        r8,[r5],#2                  @store it in the stack pointer
+    ADD         r7,r7,r1
+
+    SUBS        r4,r4,#1                    @decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VMOV.I8     Q9,#0
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+
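+@ Short-height variant: the same per-row classification as above, but
+@ without the three-row software pipelining, since at most a few rows are
+@ filtered per column here.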
+PU1_SRC_LOOP_WD_16_HT_4:
+    VMOV.I8     Q9,#0
+    ADD         r8,r0,r1                    @*pu1_src + src_strd
+    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+
+    ADD         r8,r8,#16
+    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
+    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+
+    CMP         r7,r12
+    BLT         SIGN_UP_CHANGE_WD_16_HT_4
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+    CMP         r5,#0
+    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+    LDRB        r8,[r0]                     @pu1_src_cpy[0]
+    SUB         r5,r12,r7                   @ht_tmp - row
+    LSL         r5,r5,#1                    @(ht_tmp - row) * 2
+    ADD         r9,r14,r5                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
+    LDRB        r5,[r9,#-2]                 @load the value
+    SUB         r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+    CMP         r8,#0
+    MVNLT       r8,#0
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+    LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
+    LDRB        r5,[r9,#-1]                 @load the value
+    SUB         r8,r8,r5                    @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+    CMP         r8,#0
+    MVNLT       r8,#0
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+    VMOV.8      d14[1],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    VTBL.8      D26,{D22},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D27,{D22},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
+    VEXT.8      Q7,Q7,Q7,#14                @sign_up = vextq_s8(sign_up, sign_up, 14)
+
+    VUZP.8      D26,D27
+    VTBL.8      D24,{D6},D26
+    VTBL.8      D25,{D7},D27
+    VZIP.8      D24,D25
+
+    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVL.U8    Q13,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VADDW.S8    Q13,Q13,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+    VMOVN.I16   D29,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
+    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
+    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+    LDR         r8,[sp,#0x118]              @Loads ht
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
+    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
+
+    SUBS        r8,r8,#2
+    BNE         SRC_LEFT_LOOP_WD_16_HT_4
+
+
+    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
+    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    BGT         WD_16_HT_4_LOOP
+
+
+WIDTH_RESIDUE:
+    LDR         r7,[sp,#0x114]              @Loads wd
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+    CMP         r6,r7                       @wd_residue == wd
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+
+    MOVNE       r8,#-1
+    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+    LDRB        r8,[r5,#1]                  @pu1_avail[1]
+    VMOV.8      d8[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+    LDRB        r8,[r5,#2]                  @pu1_avail[2]
+    CMP         r8,#0
+
+    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
+    MOVNE       r8,r3
+    SUB         r8,r8,#2                    @pu1_src - src_strd - 2
+    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
+    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
+    SUB         r8,#8
+
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+    LDR         r4,[sp,#0x118]              @Loads ht
+    LDR         r7,[sp,#0x114]              @Loads wd
+    LDR         r8,[sp,#0x100]              @Loads *pu1_src
+    SUB         r7,r7,#2                    @(wd - 2)
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+    LDRH        r8,[r7]                     @load the value and increment by src_strd
+    STRH        r8,[r5],#2                  @store it in the stack pointer
+    ADD         r7,r7,r1
+    SUBS        r4,r4,#1                    @decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
+
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+
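+@ Residue rows carry only 8 valid bytes (4 chroma pairs), so only the low
+@ half of the result (D28) is carried through the clip and stored per row.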
+PU1_SRC_LOOP_RESIDUE:
+    VMOV.I8     Q9,#0
+    ADD         r8,r0,r1                    @*pu1_src + src_strd
+    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+
+    ADD         r8,r8,#16
+    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
+    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+
+    CMP         r7,r12
+    BLT         SIGN_UP_CHANGE_RESIDUE
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+    CMP         r5,#0
+    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+    LDRB        r8,[r0]                     @pu1_src_cpy[0]
+    SUB         r5,r12,r7                   @ht_tmp - row
+    LSL         r5,r5,#1                    @(ht_tmp - row) * 2
+    ADD         r9,r14,r5                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
+    LDRB        r5,[r9,#-2]                 @load the value
+    SUB         r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+    CMP         r8,#0
+    MVNLT       r8,#0
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+    LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
+    LDRB        r5,[r9,#-1]                 @load the value
+    SUB         r8,r8,r5                    @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+    CMP         r8,#0
+    MVNLT       r8,#0
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+    VMOV.8      d14[1],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    VTBL.8      D26,{D22},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D27,{D22},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
+    VEXT.8      Q7,Q7,Q7,#14                @sign_up = vextq_s8(sign_up, sign_up, 14)
+
+    VUZP.8      D26,D27
+    VTBL.8      D24,{D6},D26
+    VTBL.8      D25,{D7},D27
+    VZIP.8      D24,D25
+
+    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VST1.8      {D28},[r0],r1               @vst1_u8(pu1_src_cpy, vget_low_u8(pu1_cur_row))
+
+    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
+    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
+    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP_RESIDUE
+
+    LDR         r8,[sp,#0x118]              @Loads ht
+    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+
+SRC_LEFT_LOOP_RESIDUE:
+    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
+    SUBS        r8,r8,#2
+    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
+
+    BNE         SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+    LDR         r8,[sp,#0x118]              @Loads ht
+
+    LDR         r0,[sp,#0x100]              @Loads *pu1_src
+    SUB         r8,r8,#1                    @ht - 1
+
+    LDR         r7,[sp,#0x114]              @Loads wd
+
+    LDRH        r9,[sp,#6]
+    MLA         r6,r8,r1,r7                 @wd + (ht - 1) * src_strd
+
+    STRH        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp_u, pu1_src_org[1] = u1_pos_0_0_tmp_v
+    ADD         r6,r0,r6                    @pu1_src[wd + (ht - 1) * src_strd]
+
+    LDRH        r9,[sp,#8]
+    ADD         r12,sp,#10
+    STRH        r9,[r6,#-2]                 @pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u, pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_v
+
+    LDR         r4,[sp,#0xFC]               @Loads pu1_src_top_left
+    LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
+    STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
+    LDR         r3,[sp,#0x10C]              @Loads pu1_src_top
+
+SRC_TOP_LOOP:
+    VLD1.8      D0,[r12]!                   @load au1_src_top_tmp[col]
+    SUBS        r7,r7,#8                    @Decrement the width
+    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
+    BNE         SRC_TOP_LOOP
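+@ The stores above wind up the block: the buffered top and left context is
+@ written back so that neighbouring blocks still filter against pre-SAO
+@ samples.  Roughly (a sketch using the array names from the comments):
+@     *pu1_src_top_left = u1_src_top_left_tmp;
+@     for(col = 0; col < wd; col += 8)
+@         memcpy(pu1_src_top + col, au1_src_top_tmp + col, 8);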
+
+END_LOOPS:
+    ADD         sp,sp,#0xD4
+    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class3.s b/common/arm/ihevc_sao_edge_offset_class3.s
new file mode 100644
index 0000000..268d4d8
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class3.s
@@ -0,0 +1,854 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_edge_offset_class3.s
+@*
+@* @brief
+@*  Contains function definitions for SAO edge offset of class 3.
+@* Functions are coded using NEON intrinsics and can be compiled using ARM
+@* RVCT
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
+@                              WORD32 src_strd,
+@                              UWORD8 *pu1_src_left,
+@                              UWORD8 *pu1_src_top,
+@                              UWORD8 *pu1_src_top_left,
+@                              UWORD8 *pu1_src_top_right,
+@                              UWORD8 *pu1_src_bot_left,
+@                              UWORD8 *pu1_avail,
+@                              WORD8 *pi1_sao_offset,
+@                              WORD32 wd,
+@                              WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 =>  *pu1_src
+@r1 =>  src_strd
+@r2 =>  *pu1_src_left
+@r3 =>  *pu1_src_top
+@r4 =>  *pu1_src_top_left
+@r5 =>  *pu1_avail
+@r6 =>  *pi1_sao_offset
+@r7 =>  wd
+@r8 =>  ht
+
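+@ Rough reference for the kernel below (class 3 pairs each sample with its
+@ top-right and bottom-left neighbours; an illustrative sketch assuming
+@ 8-bit samples, not the exact C model):
+@     for(row = 0; row < ht; row++)
+@         for(col = 0; col < wd; col++)
+@         {
+@             edge_idx = 2 + SIGN(src[row][col] - src[row - 1][col + 1])
+@                          + SIGN(src[row][col] - src[row + 1][col - 1]);
+@             edge_idx = gi1_table_edge_idx[edge_idx];
+@             if(edge_idx)
+@                 src[row][col] = CLIP3(src[row][col] + pi1_sao_offset[edge_idx], 0, 255);
+@         }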
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class3_a9q
+
+gi1_table_edge_idx_addr_1:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+gi1_table_edge_idx_addr_2:
+.long gi1_table_edge_idx - ulbl2 - 8
+
+gi1_table_edge_idx_addr_3:
+.long gi1_table_edge_idx - ulbl3 - 8
+
+ihevc_sao_edge_offset_class3_a9q:
+
+
+    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
+    LDR         r7,[sp,#0x3C]               @Loads wd
+
+    LDR         r8,[sp,#0x40]               @Loads ht
+    SUB         r9,r7,#1                    @wd - 1
+
+    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
+    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
+
+    MOV         r9,r7                       @Move width to r9 for loop count
+
+    LDR         r5,[sp,#0x34]               @Loads pu1_avail
+    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset
+    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
+
+    SUB         sp,sp,#0x94                 @Decrement the stack pointer to store some temp arr values
+
+    STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
+    SUB         r10,r8,#1                   @ht-1
+    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
+    ADD         r12,sp,#0x02                @temp array
+
+AU1_SRC_TOP_LOOP:
+    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
+    SUBS        r9,r9,#8                    @Decrement the loop count by 8
+    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+    BNE         AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_5_LOOP:
+    LDRB        r9,[r5,#5]                  @pu1_avail[5]
+    CMP         r9,#0
+    SUB         r10,r7,#1                   @[wd - 1]
+    LDRB        r9,[r0,r10]                 @u1_pos_0_0_tmp = pu1_src[wd - 1]
+    BEQ         PU1_AVAIL_6_LOOP
+
+    LDR         r11,[sp,#0xC0]              @Load pu1_src_top_right from sp
+    SUB         r10,r10,#1                  @[wd - 1 - 1]
+
+    LDRB        r11,[r11]                   @pu1_src_top_right[0]
+    SUB         r12,r9,r11                  @pu1_src[wd - 1] - pu1_src_top_right[0]
+
+    ADD         r11,r0,r1                   @pu1_src + src_strd
+
+    LDRB        r14,[r11,r10]               @pu1_src[wd - 1 - 1 + src_strd]
+    CMP         r12,#0
+    MVNLT       r12,#0
+    SUB         r11,r9,r14                  @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]
+
+    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
+    CMP         r11,#0
+    MVNLT       r11,#0
+    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
+    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
+ulbl1:
+    add         r14,r14,pc
+    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
+    ADD         r11,r11,#2                  @edge_idx
+
+    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r12,#0                      @0 != edge_idx
+    BEQ         PU1_AVAIL_6_LOOP
+    LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
+    ADD         r9,r9,r10                   @pu1_src[wd - 1] + pi1_sao_offset[edge_idx]
+    USAT        r9,#8,r9                    @u1_pos_0_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP:
+    LDRB        r10,[r5,#6]                 @pu1_avail[6]
+    SUB         r11,r8,#1                   @ht - 1
+
+    CMP         r10,#0
+    STR         r0,[sp,#0xC0]               @Store pu1_src in sp
+    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
+
+    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
+    BEQ         PU1_AVAIL_3_LOOP
+
+    LDR         r14,[sp,#0xC4]              @Load pu1_src_bot_left from sp
+    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]
+
+    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
+    ADD         r11,r11,#1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+
+    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
+
+    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+    CMP         r11,#0
+    MVNLT       r11,#0
+    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])
+
+    CMP         r14,#0
+    MVNLT       r14,#0
+    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
+
+    ADD         r11,r11,r14                 @Add 2 sign value
+
+    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
+ulbl2:
+    add         r14,r14,pc
+    ADD         r11,r11,#2                  @edge_idx
+
+    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r12,#0
+    BEQ         PU1_AVAIL_3_LOOP
+    LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
+    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_3_LOOP:
+    STR         r2,[sp,#0xC4]               @Store pu1_src_left in sp
+    MOV         r12,r8                      @Move ht
+
+    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
+    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
+    LDRB        r11,[r5,#3]                 @pu1_avail[3]
+
+    CMP         r11,#0
+    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
+    SUBEQ       r12,r12,#1                  @ht_tmp--
+
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    CMP         r5,#0
+
+    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
+    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
+    SUBEQ       r12,r12,#1                  @ht_tmp--
+
+    LDR         r6, gi1_table_edge_idx_addr_3 @table pointer
+ulbl3:
+    add         r6,r6,pc
+    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
+    ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1
+
+    STR         r0,[sp,#0x90]               @Store pu1_src in sp
+    VLD1.8      D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    MOV         r6,r7                       @move wd to r6 loop_count
+
+    CMP         r7,#16                      @Compare wd with 16
+    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, where the 8-wide case is handled
+    CMP         r8,#4                       @Compare ht with 4
+    BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+    LDR         r7,[sp,#0xD0]               @Loads wd
+
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    CMP         r6,r7                       @col == wd
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+    MOVNE       r8,#-1
+    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    CMP         r6,#16                      @if(col == 16)
+    BNE         SKIP_AU1_MASK_VAL
+    LDRB        r8,[r5,#1]                  @pu1_avail[1]
+    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+    LDRB        r8,[r5,#2]                  @pu1_avail[2]
+    CMP         r8,#0
+
+    LDR         r4,[sp,#0xD4]               @Loads ht
+    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
+
+    MOVNE       r8,r3
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
+
+    SUB         r7,r7,r6                    @(wd - col)
+    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+    SUB         r8,#8
+    ADD         r3,r3,#16
+
+    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+    ADD         r7,r7,#15                   @15 + (wd - col)
+
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    SUB         r5,r5,#1
+
+AU1_SRC_LEFT_LOOP:
+    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
+    SUBS        r4,r4,#1                    @decrement the loop count
+    STRB        r8,[r5,#1]!                 @store it in the stack pointer
+    BNE         AU1_SRC_LEFT_LOOP
+
+    VMOV.I8     Q9,#0
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+
+    ADD         r8,r0,r1                    @I *pu1_src + src_strd
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+
+    SUB         r5,r12,r7                   @I ht_tmp - row
+    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+    ADD         r8,r14,r5                   @I pu1_src_left_cpy[ht_tmp - row]
+
+    ADD         r8,r8,#1                    @I pu1_src_left_cpy[ht_tmp - row + 1]
+    LDRB        r8,[r8]                     @I load pu1_src_left_cpy[ht_tmp - row + 1]
+
+    LDR         r5,[sp,#0xC8]               @I Loads pu1_avail
+    VMOV.8      D19[7],r8                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_left_cpy[ht_tmp - row + 1], pu1_next_row_tmp, 15)
+    LDRB        r5,[r5,#2]                  @I pu1_avail[2]
+
+    VEXT.8      Q9,Q9,Q8,#15                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+    CMP         r5,#0                       @I
+    BNE         SIGN_UP_CHANGE_DONE         @I
+
+SIGN_UP_CHANGE:
+    LDRB        r8,[r0,#15]                 @I pu1_src_cpy[15]
+    SUB         r5,r0,r1                    @I pu1_src_cpy[16 - src_strd]
+
+    LDRB        r5,[r5,#16]                 @I load the value
+    SUB         r8,r8,r5                    @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+    CMP         r8,#0                       @I
+    MVNLT       r8,#0                       @I
+    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+    VMOV.8      D15[7],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+
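+@ For the class 3 diagonal the neighbour column shifts the opposite way to
+@ class 2: the boundary lane of sign_up is lane 15 (patched above from the
+@ sample above-right of pu1_src_cpy[15]), and sign_up is rotated forward by
+@ one byte (VEXT #1 below).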
+SIGN_UP_CHANGE_DONE:
+    VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q9,Q9,Q5                    @I edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D18,{D6},D18                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
+
+    VEXT.8      Q7,Q7,Q7,#1                 @I sign_up = vextq_s8(sign_up, sign_up, 1)
+    VTBL.8      D19,{D6},D19                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+    VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOV        Q6,Q8
+    VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
+
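+@ Everything from SIGN_UP_CHANGE_DONE down to this point is the vector form
+@ of the SAO edge-offset formula that the intrinsic comments spell out. Per
+@ pixel, a scalar C sketch under the same table conventions (names are
+@ illustrative, sao_sign as sketched earlier):
+@
+@     int edge_idx = 2 + sao_sign(cur - diag_above)     /* sign_up   */
+@                      + sao_sign(cur - diag_below);    /* sign_down */
+@     edge_idx = edge_idx_tbl[edge_idx] & mask;         /* VTBL/VAND */
+@     int out  = cur + offset_tbl[edge_idx];            /* VADDW.S8  */
+@     out      = out < 0 ? 0 : (out > 255 ? 255 : out); /* VMAX/VMIN */
+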
+PU1_SRC_LOOP:
+    ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
+    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUB         r5,r12,r7                   @II ht_tmp - row
+
+    ADD         r4,r0,r1                    @II pu1_src_cpy[16 - src_strd]
+    VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
+    ADD         r2,r8,r1                    @III *pu1_src + src_strd
+
+    LDRB        r11,[r4,#15]                @II pu1_src_cpy[15]
+    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
+
+    ADD         r8,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
+    VLD1.8      D30,[r2]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D31,[r2]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r2,#8
+    LDRB        r8,[r8,#1]
+
+    LDRB        r4,[r0,#16]                 @II load the value
+    VMOV.8      D19[7],r8                   @II vsetq_lane_u8
+    SUB         r11,r11,r4                  @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+
+    CMP         r11,#0                      @II
+    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    SUB         r5,r12,r7                   @III ht_tmp - row
+
+    MVNLT       r11,#0                      @II
+    VEXT.8      Q9,Q9,Q8,#15                @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+
+    ADD         r8,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
+    VMOV.8      D15[7],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+    CMP         r7,#1                       @III
+
+    BNE         NEXT_ROW_ELSE_2             @III
+    LDR         r5,[sp,#0xC8]               @III Loads pu1_avail
+    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
+    CMP         r5,#0                       @III
+    SUBNE       r8,r2,#2                    @III pu1_src_cpy[src_strd - 1]
+
+NEXT_ROW_ELSE_2:
+    LDRB        r8,[r8,#1]                  @III
+    VCGT.U8     Q12,Q6,Q9                   @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    ADD         r5,r0,r1
+
+    LDRB        r2,[r5,#15]                 @III pu1_src_cpy[15]
+    VCLT.U8     Q13,Q6,Q9                   @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    LDRB        r5,[r0,#16]                 @III load the value
+
+    SUB         r2,r2,r5                    @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+    VSUB.U8     Q12,Q13,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    CMP         r2,#0                       @III
+
+    MVNLT       r2,#0                       @III
+    VMOV.8      D19[7],r8                   @III vsetq_lane_u8
+    MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+
+    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
+    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
+
+    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
+    VEXT.8      Q9,Q9,Q15,#15               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VEXT.8      Q7,Q7,Q7,#1                 @II sign_up = vextq_s8(sign_up, sign_up, 1)
+    VTBL.8      D26,{D6},D26                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    VMOV.8      D15[7],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+    VTBL.8      D27,{D6},D27                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VTBL.8      D24,{D7},D26                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
+
+    VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D25,{D7},D27                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)
+
+    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VEXT.8      Q7,Q7,Q7,#1                 @III sign_up = vextq_s8(sign_up, sign_up, 1)
+    VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VMOVL.U8    Q11,D17                     @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
+    VADDW.S8    Q11,Q11,D11                 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
+    VMAX.S16    Q11,Q11,Q1                  @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    VMOV        Q6,Q15                      @II pu1_cur_row = pu1_next_row
+    VMIN.U16    Q11,Q11,Q2                  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    CMP         r7,#1                       @III
+    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    BGT         PU1_SRC_LOOP                @If row count > 1, jump to PU1_SRC_LOOP
+    BLT         INNER_LOOP_DONE             @If row count < 1, the inner loop is done
+
+    ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
+    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+
+    LDRB        r5,[r5,#3]                  @pu1_avail[3]
+    VMOVN.I16   D21,Q11                     @III vmovn_s16(pi2_tmp_cur_row.val[1])
+    CMP         r5,#0
+
+    ADD         r4,r0,r1                    @pu1_src_cpy[16 - src_strd]
+    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+    LDRB        r5,[r0,#16]                 @load the value
+
+    BEQ         NEXT_ROW_ELSE_3
+    LDRB        r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
+    B           NEXT_ROW_POINTER_ASSIGNED_3
+NEXT_ROW_ELSE_3:
+    SUB         r11,r12,r7                  @ht_tmp - row
+    ADD         r8,r14,r11                  @pu1_src_left_cpy[ht_tmp - row]
+    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
+    LDRB        r8,[r8]
+
+NEXT_ROW_POINTER_ASSIGNED_3:
+    LDRB        r11,[r4,#15]                @pu1_src_cpy[15]
+    VMOV.8      D19[7],r8                   @vsetq_lane_u8
+    SUB         r8,r11,r5                   @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+
+    CMP         r8,#0
+    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+    MVNLT       r8,#0
+
+    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    VCGT.U8     Q12,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+    VCLT.U8     Q13,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+    VSUB.U8     Q12,Q13,Q12                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
+
+    VMOVL.U8    Q11,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+    VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VADDW.S8    Q11,Q11,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VMAX.S16    Q11,Q11,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q11,Q11,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+INNER_LOOP_DONE:
+    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+    LDR         r8,[sp,#0xD4]               @Loads ht
+
+    VMOVN.I16   D21,Q11                     @vmovn_s16(pi2_tmp_cur_row.val[1])
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+
+    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
+SRC_LEFT_LOOP:
+    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
+    SUBS        r8,r8,#4
+    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP
+
+    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
+    CMP         r6,#8                       @Check whether residue remains
+    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    LDR         r0,[sp,#0x90]               @Loads *pu1_src
+    SUB         r7,r7,r6
+    ADD         r0,r0,r7
+    BGT         WIDTH_LOOP_16               @If more than 8 columns remain, jump to WIDTH_LOOP_16
+    BEQ         WIDTH_RESIDUE               @If exactly 8 columns remain, jump to the residue loop
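+
+@ Column bookkeeping for the branches above: r6 holds the remaining width.
+@ Fewer than 8 columns left means this row set is done (branch to
+@ RE_ASSINING_LOOP), more than 8 means another full 16-wide pass, and
+@ exactly 8 is handed to WIDTH_RESIDUE.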
+
+
+
+WD_16_HT_4_LOOP:
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    CMP         r6,r7                       @col == wd
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+    MOVNE       r8,#-1
+    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    CMP         r6,#16                      @if(col == 16)
+    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
+    LDRB        r8,[r5,#1]                  @pu1_avail[1]
+    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+    LDRB        r8,[r5,#2]                  @pu1_avail[2]
+    CMP         r8,#0
+
+    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
+    MOVNE       r8,r3
+    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
+    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+    SUB         r8,#8
+
+    ADD         r3,r3,#16
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+    LDR         r4,[sp,#0xD4]               @Loads ht
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    SUB         r7,r7,r6                    @(wd - col)
+    ADD         r7,r7,#15                   @15 + (wd - col)
+    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
+    SUB         r5,r5,#1
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
+    STRB        r8,[r5,#1]!                 @store it in the temporary stack array
+    SUBS        r4,r4,#1                    @decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    VMOV.I8     Q9,#0
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+    ADD         r8,r0,r1                    @*pu1_src + src_strd
+    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    LDRB        r5,[r5,#3]                  @pu1_avail[3]
+    CMP         r5,#0
+    BEQ         NEXT_ROW_ELSE_WD_16_HT_4
+    CMP         r7,#1
+    LDREQB      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
+    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
+NEXT_ROW_ELSE_WD_16_HT_4:
+    SUB         r5,r12,r7                   @ht_tmp - row
+    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
+    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
+    LDRB        r8,[r8]
+
+NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
+    VMOV.8      D19[7],r8                   @vsetq_lane_u8
+    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+    CMP         r7,r12
+    BNE         SIGN_UP_CHANGE_WD_16_HT_4
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+    CMP         r5,#0
+    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
+    ADD         r5,r0,#16                   @pu1_src_cpy[16]
+    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
+    LDRB        r5,[r5]                     @load the value
+    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+    CMP         r8,#0
+    MVNLT       r8,#0
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
+    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)
+
+    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
+    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
+    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+    LDR         r8,[sp,#0xD4]               @Loads ht
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
+SRC_LEFT_LOOP_WD_16_HT_4:
+    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
+    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
+    SUBS        r8,r8,#4
+    BNE         SRC_LEFT_LOOP_WD_16_HT_4
+
+    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
+    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    BGT         WD_16_HT_4_LOOP             @If columns remain, jump to WD_16_HT_4_LOOP
+
+
+WIDTH_RESIDUE:
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    CMP         r6,r7                       @wd_residue == wd
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+
+    MOVNE       r8,#-1
+    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    LDRB        r8,[r5,#1]                  @pu1_avail[1]
+    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+PU1_AVAIL_2_RESIDUE:
+    LDRB        r8,[r5,#2]                  @pu1_avail[2]
+    CMP         r8,#0
+
+    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
+    MOVNE       r8,r3
+    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
+    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+    SUB         r8,#8
+
+
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+    LDR         r4,[sp,#0xD4]               @Loads ht
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
+    SUB         r7,r7,#1                    @(wd - 1)
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
+    SUB         r5,r5,#1
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
+    STRB        r8,[r5,#1]!                 @store it in the temporary stack array
+    SUBS        r4,r4,#1                    @decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
+
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_RESIDUE:
+    VMOV.I8     Q9,#0
+    ADD         r8,r0,r1                    @*pu1_src + src_strd
+    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r8,#8
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    LDRB        r5,[r5,#3]                  @pu1_avail[3]
+    CMP         r5,#0
+    BEQ         NEXT_ROW_ELSE_RESIDUE
+    CMP         r7,#1
+    LDREQB      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
+    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
+NEXT_ROW_ELSE_RESIDUE:
+    SUB         r5,r12,r7                   @ht_tmp - row
+    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
+    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
+    LDRB        r8,[r8]
+
+NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
+    VMOV.8      D19[7],r8                   @vsetq_lane_u8
+    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+    CMP         r7,r12
+    BNE         SIGN_UP_CHANGE_RESIDUE
+    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+    CMP         r5,#0
+    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
+    ADD         r5,r0,#16                   @pu1_src_cpy[16]
+    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
+    LDRB        r5,[r5]                     @load the value
+    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+    CMP         r8,#0
+    MVNLT       r8,#0
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
+    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)
+
+    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VST1.8      {D30},[r0],r1               @vst1_u8(pu1_src_cpy, pu1_cur_row) - residue path stores only 8 pixels
+    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
+    SUBS        r7,r7,#1
+    BNE         PU1_SRC_LOOP_RESIDUE
+
+    LDR         r8,[sp,#0xD4]               @Loads ht
+    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
+    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
+
+SRC_LEFT_LOOP_RESIDUE:
+    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
+    SUBS        r8,r8,#4
+    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    LDR         r0,[sp,#0xC0]               @Loads *pu1_src
+
+    LDR         r11,[sp,#0xD4]              @Loads ht
+    ADD         r8,r0,r7                    @pu1_src[wd]
+
+    LDR         r4,[sp,#0xBC]               @Loads pu1_src_top_left
+    SUB         r11,r11,#1                  @ht - 1
+
+    STRB        r9,[r8,#-1]                 @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
+    MLA         r6,r11,r1,r0                @pu1_src_org[(ht - 1) * src_strd]
+
+    LDRB        r8,[sp]                     @load u1_src_top_left_tmp from stack pointer
+    ADD         r12,sp,#0x02
+
+    STRB        r10,[r6]                    @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
+    STRB        r8,[r4]                     @*pu1_src_top_left = u1_src_top_left_tmp
+    LDR         r3,[sp,#0xCC]               @Loads pu1_src_top
+
+SRC_TOP_LOOP:
+    VLD1.8      D0,[r12]!                   @load au1_src_top_tmp[col]
+    SUBS        r7,r7,#8                    @Decrement the width
+    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
+    BNE         SRC_TOP_LOOP
+
+END_LOOPS:
+    ADD         sp,sp,#0x94
+    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class3_chroma.s b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
new file mode 100644
index 0000000..2ecabe9
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
@@ -0,0 +1,1052 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_edge_offset_class3_chroma.s
+@*
+@* @brief
+@*  Contains function definitions for SAO edge offset (class 3) on chroma.
+@* Functions are coded using NEON intrinsics and can be compiled using ARM
+@* RVCT
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
+@                              WORD32 src_strd,
+@                              UWORD8 *pu1_src_left,
+@                              UWORD8 *pu1_src_top,
+@                              UWORD8 *pu1_src_top_left,
+@                              UWORD8 *pu1_src_top_right,
+@                              UWORD8 *pu1_src_bot_left,
+@                              UWORD8 *pu1_avail,
+@                              WORD8 *pi1_sao_offset_u,
+@                              WORD8 *pi1_sao_offset_v,
+@                              WORD32 wd,
+@                              WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 =>  *pu1_src
+@r1 =>  src_strd
+@r2 =>  *pu1_src_left
+@r3 =>  *pu1_src_top
+@r4 =>  *pu1_src_top_left
+@r5 =>  *pu1_avail
+@r6 =>  *pi1_sao_offset_u
+@r9 =>  *pi1_sao_offset_v
+@r7 =>  wd
+@r8 =>  ht
+
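+@ For orientation: a heavily simplified C model of what this kernel computes
+@ on the interleaved UVUV... chroma plane. apply_edge_offset is a
+@ hypothetical helper standing in for the classify/offset/clip steps coded
+@ below; the real loops are software-pipelined and border-aware:
+@
+@     for (int row = 0; row < ht; row++)
+@         for (int col = 0; col < wd; col += 2) {   /* U at col, V at col+1 */
+@             apply_edge_offset(&pu1_src[row * src_strd + col],     pi1_sao_offset_u);
+@             apply_edge_offset(&pu1_src[row * src_strd + col + 1], pi1_sao_offset_v);
+@         }
+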
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class3_chroma_a9q
+
+gi1_table_edge_idx_addr_1:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+gi1_table_edge_idx_addr_2:
+.long gi1_table_edge_idx - ulbl2 - 8
+
+gi1_table_edge_idx_addr_3:
+.long gi1_table_edge_idx - ulbl3 - 8
+
+gi1_table_edge_idx_addr_4:
+.long gi1_table_edge_idx - ulbl4 - 8
+
+gi1_table_edge_idx_addr_5:
+.long gi1_table_edge_idx - ulbl5 - 8
+
+ihevc_sao_edge_offset_class3_chroma_a9q:
+
+
+    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
+
+    LDR         r7,[sp,#0x40]               @Loads wd
+    LDR         r8,[sp,#0x44]               @Loads ht
+    SUB         r9,r7,#2                    @wd - 2
+
+    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
+    LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]
+
+    MOV         r9,r7                       @Move width to r9 for loop count
+
+    LDR         r5,[sp,#0x34]               @Loads pu1_avail
+    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset_u
+
+    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
+    SUB         sp,sp,#0xD4                 @Decrement the stack pointer to make room for temporary array values
+
+    STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
+    SUB         r10,r8,#1                   @ht-1
+    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
+    ADD         r12,sp,#10                  @temp array
+
+AU1_SRC_TOP_LOOP:
+    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
+    SUBS        r9,r9,#8                    @Decrement the loop count by 8
+    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+    BNE         AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_5_LOOP_U:
+    LDRB        r9,[r5,#5]                  @pu1_avail[5]
+    CMP         r9,#0
+    SUB         r14,r7,#2                   @[wd - 2]
+    LDRB        r9,[r0,r14]                 @u1_pos_0_0_tmp_u = pu1_src[wd - 2]
+    SUB         r11,r7,#1                   @[wd - 1]
+    LDRB        r10,[r0,r11]                @u1_pos_0_0_tmp_v = pu1_src[wd - 1]
+    BEQ         PU1_AVAIL_6_LOOP_U
+
+    LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
+    LDRB        r11,[r11]                   @pu1_src_top_right[0]
+    SUB         r12,r9,r11                  @pu1_src[wd - 2] - pu1_src_top_right[0]
+    CMP         r12,#0
+    MVNLT       r12,#0
+    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
+    ADD         r11,r0,r1                   @pu1_src + src_strd
+    SUB         r14,r14,#2                  @[wd - 2 - 2]
+    LDRB        r14,[r11,r14]               @pu1_src[wd - 2 - 2 + src_strd]
+    SUB         r11,r9,r14                  @pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
+    CMP         r11,#0
+    MVNLT       r11,#0
+    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
+    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
+    ADD         r11,r11,#2                  @edge_idx
+    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
+ulbl1:
+    add         r14,r14,pc
+
+    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r12,#0                      @0 != edge_idx
+    BEQ         PU1_AVAIL_5_LOOP_V
+    LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
+    ADD         r9,r9,r11                   @pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
+    USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
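+
+@ The scalar sequence above is the corner-pixel form of the same edge
+@ classification, roughly (illustrative C, sao_sign as sketched earlier):
+@
+@     int idx = 2 + sao_sign(pu1_src[wd - 2] - pu1_src_top_right[0])
+@                 + sao_sign(pu1_src[wd - 2] - pu1_src[src_strd + wd - 4]);
+@     idx = gi1_table_edge_idx[idx];
+@     if (idx != 0)
+@         u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[idx], 0, 255);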
+
+PU1_AVAIL_5_LOOP_V:
+
+    LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
+    LDRB        r11,[r11,#1]                @pu1_src_top_right[1]
+    SUB         r12,r10,r11                 @pu1_src[wd - 1] - pu1_src_top_right[1]
+    CMP         r12,#0
+    MVNLT       r12,#0
+    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
+    ADD         r11,r0,r1                   @pu1_src + src_strd
+    SUB         r14,r7,#3                   @[wd - 1 - 2]
+    LDRB        r14,[r11,r14]               @pu1_src[wd - 1 - 2 + src_strd]
+    SUB         r11,r10,r14                 @pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
+    CMP         r11,#0
+    MVNLT       r11,#0
+    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
+    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
+    ADD         r11,r11,#2                  @edge_idx
+    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
+ulbl2:
+    add         r14,r14,pc
+
+    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r12,#0                      @0 != edge_idx
+    BEQ         PU1_AVAIL_6_LOOP_U
+    LDR         r11,[sp,#0x110]             @Loads pi1_sao_offset_v
+    LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
+    ADD         r10,r10,r11                 @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
+    USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP_U:
+    STRB        r9,[sp,#6]
+    STRB        r10,[sp,#7]
+    STR         r0,[sp,#0x100]              @Store pu1_src in sp
+
+    LDRB        r10,[r5,#6]                 @pu1_avail[6]
+    CMP         r10,#0
+    SUB         r11,r8,#1                   @ht - 1
+    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
+    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
+    LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
+    BEQ         PU1_AVAIL_3_LOOP
+
+    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]
+    ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd +  2 - src_strd]
+    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd +  2 - src_strd]
+    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd]
+    CMP         r11,#0
+    MVNLT       r11,#0
+    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd])
+
+    LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
+    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
+    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
+    CMP         r14,#0
+    MVNLT       r14,#0
+    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
+
+    ADD         r11,r11,r14                 @Add 2 sign value
+    ADD         r11,r11,#2                  @edge_idx
+    LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
+ulbl3:
+    add         r14,r14,pc
+
+    LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r14,#0
+    BEQ         PU1_AVAIL_6_LOOP_V
+    LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
+    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP_V:
+    ADD         r12,r12,#1                  @pu1_src[(ht - 1) * src_strd + 1]
+    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+    ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+    SUB         r11,r9,r11                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
+    CMP         r11,#0
+    MVNLT       r11,#0
+    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
+
+    LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
+    LDRB        r14,[r14,#1]                @Load pu1_src_bot_left[1]
+    SUB         r14,r9,r14                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
+    CMP         r14,#0
+    MVNLT       r14,#0
+    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])
+
+    ADD         r11,r11,r14                 @Add 2 sign value
+    ADD         r11,r11,#2                  @edge_idx
+    LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
+ulbl4:
+    add         r14,r14,pc
+
+    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         r12,#0
+    BEQ         PU1_AVAIL_3_LOOP
+    LDR         r14,[sp,#0x110]             @Loads pi1_sao_offset_v
+    LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
+    ADD         r9,r9,r11                   @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+    USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_3_LOOP:
+    STRB        r10,[sp,#8]
+    STRB        r9,[sp,#9]
+    STR         r2,[sp,#0x104]              @Store pu1_src_left in sp
+
+    MOV         r12,r8                      @Move ht
+    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
+    LDRB        r11,[r5,#3]                 @pu1_avail[3]
+    CMP         r11,#0
+    BNE         PU1_AVAIL_2_LOOP
+    SUB         r12,r12,#1                  @ht_tmp--
+
+PU1_AVAIL_2_LOOP:
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+    CMP         r5,#0
+    BNE         PU1_AVAIL_2_LOOP_END
+
+    ADD         r0,r0,r1                    @pu1_src += src_strd
+    SUB         r12,r12,#1                  @ht_tmp--
+    ADD         r14,r14,#2                  @pu1_src_left_cpy += 2
+
+PU1_AVAIL_2_LOOP_END:
+    STR         r0,[sp,#2]                  @Store pu1_src in sp
+    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
+    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
+    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+    LDR         r6,[sp,#0x110]              @Loads pi1_sao_offset_v
+    VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+    LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
+ulbl5:
+    add         r2,r2,pc
+    @VLD1.8     D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
+    MOV         r6,r7                       @move wd to r6 loop_count
+
+    CMP         r7,#16                      @Compare wd with 16
+    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, where the loop handles the 8-pixel case
+    CMP         r8,#4                       @Compare ht with 4
+    BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP
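+
+@ Dispatch: wd >= 16 && ht > 4 takes the software-pipelined WIDTH_LOOP_16
+@ path, wd >= 16 && ht <= 4 takes WD_16_HT_4_LOOP (too few rows to keep the
+@ pipeline filled), and wd < 16 goes straight to WIDTH_RESIDUE, which works
+@ on 8 pixels (4 UV pairs) per row.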
+
+WIDTH_LOOP_16:
+    LDR         r7,[sp,#0x114]              @Loads wd
+    CMP         r6,r7                       @col == wd
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+    MOVNE       r8,#-1
+
+    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    LDRB        r11,[r5,#2]                 @pu1_avail[2]
+
+    CMP         r6,#16                      @if(col == 16)
+    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+    BNE         SKIP_AU1_MASK_VAL
+    LDRB        r8,[r5,#1]                  @pu1_avail[1]
+    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+    CMP         r11,#0
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+
+    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
+    VMOV.I8     Q9,#0
+    MOVNE       r8,r3
+
+    ADD         r8,r8,#2                    @pu1_src - src_strd + 2
+    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    SUB         r8,#8
+    ADD         r3,r3,#16
+
+    LDR         r4,[sp,#0x118]              @Loads ht
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    LDR         r7,[sp,#0x114]              @Loads wd
+
+    SUB         r7,r7,r6                    @(wd - col)
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+    ADD         r7,r7,#14                   @14 + (wd - col)
+
+    LDR         r8,[sp,#0x100]              @Loads *pu1_src
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP:
+    LDRH        r8,[r7]                     @load the UV pair
+    SUBS        r4,r4,#1                    @decrement the loop count
+
+    STRH        r8,[r5],#2                  @store it in the temporary stack array
+    ADD         r7,r7,r1
+    BNE         AU1_SRC_LEFT_LOOP
+
+
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+    VMOV.I8     Q9,#0                       @I
+    ADD         r11,r0,r1                   @I *pu1_src + src_strd
+
+    SUB         r5,r12,r7                   @I ht_tmp - row
+    VLD1.8      D16,[r11]!                  @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r11]                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r11,#8
+    ADD         r8,r14,r5,LSL #1            @I pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    LDRH        r5,[r8,#2]                  @I
+    VMOV.16     D19[3],r5                   @I vsetq_lane_u16
+    LDR         r11,[sp,#0x108]             @I Loads pu1_avail
+
+    LDRB        r11,[r11,#2]                @I pu1_avail[2]
+    VEXT.8      Q9,Q9,Q8,#14                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+    CMP         r11,#0                      @I
+    BNE         SIGN_UP_CHANGE_DONE         @I
+
+    LDRB        r8,[r0,#14]                 @I pu1_src_cpy[14]
+    SUB         r5,r0,r1                    @I
+
+    LDRB        r11,[r5,#16]                @I load the value pu1_src_cpy[16 - src_strd]
+
+    LDRB        r9,[r0,#15]                 @I pu1_src_cpy[15]
+    SUB         r8,r8,r11                   @I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+    LDRB        r10,[r5,#17]                @I load the value pu1_src_cpy[17 - src_strd]
+    CMP         r8,#0                       @I
+
+    MVNLT       r8,#0                       @I
+    SUB         r9,r9,r10                   @I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+    CMP         r9,#0                       @I
+
+    MVNLT       r9,#0                       @I
+    VMOV.8      D15[6],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+    MOVGT       r9,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+
+    VMOV.8      D15[7],r9                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE:
+    VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D18,{D28},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)
+
+    VTBL.8      D19,{D28},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VEXT.8      Q7,Q7,Q7,#2                 @I sign_up = vextq_s8(sign_up, sign_up, 2)
+
+    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VUZP.8      D18,D19                     @I
+    VTBL.8      D22,{D6},D18                @I
+    VTBL.8      D23,{D7},D19                @I
+    VZIP.8      D22,D23                     @I
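+
+@ Chroma samples are interleaved UVUV..., so the edge indices computed above
+@ mix both planes. VUZP.8 separates them (D18 = U indices, D19 = V indices)
+@ so each plane can index its own offset table (D6 = pi1_sao_offset_u,
+@ D7 = pi1_sao_offset_v), and VZIP.8 re-interleaves the looked-up offsets.
+@ Roughly, in C (illustrative):
+@
+@     for (int i = 0; i < 16; i += 2) {
+@         off[i]     = offset_tbl_u[edge_idx[i]];      /* U sample */
+@         off[i + 1] = offset_tbl_v[edge_idx[i + 1]];  /* V sample */
+@     }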
+
+    VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
+    VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
+    VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+PU1_SRC_LOOP:
+    ADD         r11,r0,r1,LSL #1            @II *pu1_src + src_strd
+    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUB         r5,r12,r7                   @II ht_tmp - row
+
+    ADD         r4,r0,r1                    @III *pu1_src + src_strd
+    VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
+    ADD         r8,r14,r5,LSL #1            @II pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    LDRH        r9,[r8,#2]
+    VLD1.8      D16,[r11]!                  @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r11]                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r11,#8
+    LDRB        r10,[r4,#14]                @II pu1_src_cpy[14]
+
+    LDRB        r8,[r4,#15]                 @II pu1_src_cpy[15]
+    VMOV.16     D29[3],r9                   @II vsetq_lane_u16
+    ADD         r4,r11,r1                   @III *pu1_src + src_strd
+
+    LDRB        r5,[r0,#17]                 @II load the value pu1_src_cpy[17 - src_strd]
+    VLD1.8      D30,[r4]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D31,[r4]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r4,#8
+    LDRB        r11,[r0,#16]                @II load the value pu1_src_cpy[16 - src_strd]
+
+    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
+    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    SUB         r10,r10,r11                 @II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+    CMP         r10,#0                      @II
+    VEXT.8      Q14,Q14,Q8,#14              @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+    SUB         r8,r8,r5                    @II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    MVNLT       r10,#0                      @II
+    VLD1.8      D21,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    MOVGT       r10,#1                      @II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+    CMP         r8,#0                       @II
+    VMOV.8      D15[6],r10                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+    MVNLT       r8,#0                       @II
+
+    MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    SUB         r10,r12,r7                  @III ht_tmp - row
+    VMOV.8      D15[7],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+    ADD         r11,r14,r10,LSL #1          @III pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    CMP         r7,#1                       @III
+    VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    BNE         NEXT_ROW_POINTER_ASSIGNED_2 @III
+
+    LDR         r5,[sp,#0x108]              @III Loads pu1_avail
+    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
+    CMP         r5,#0                       @III
+    SUBNE       r11,r4,#4                   @III pu1_src[src_strd - 2]
+
+NEXT_ROW_POINTER_ASSIGNED_2:
+    LDRH        r5,[r11,#2]                 @III
+    VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    ADD         r11,r0,r1                   @III
+
+    LDRB        r9,[r11,#14]                @III pu1_src_cpy[14]
+    VMOV.16     D19[3],r5                   @III vsetq_lane_u16
+    LDRB        r8,[r11,#15]                @III pu1_src_cpy[15]
+
+    LDRB        r11,[r0,#16]                @III load the value pu1_src_cpy[16 - src_strd]
+    VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    LDRB        r10,[r0,#17]                @III load the value pu1_src_cpy[17 - src_strd]
+
+    SUB         r9,r9,r11                   @III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+    VEXT.8      Q9,Q9,Q15,#14               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+    SUB         r10,r8,r10                  @III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    CMP         r9,#0                       @III
+    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
+    MVNLT       r9,#0                       @III
+
+    MOVGT       r9,#1                       @III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+    CMP         r10,#0                      @III
+
+    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
+    VTBL.8      D26,{D21},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    MVNLT       r10,#0                      @III
+    MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+
+    VEXT.8      Q7,Q7,Q7,#2                 @II sign_up = vextq_s8(sign_up, sign_up, 2)
+    VTBL.8      D27,{D21},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VCGT.U8     Q11,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    VMOV.8      D15[6],r9                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 0)
+    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VMOV.8      D15[7],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 1)
+    VUZP.8      D26,D27                     @II
+
+    VCLT.U8     Q10,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VTBL.8      D24,{D6},D26                @II
+    VSUB.U8     Q11,Q10,Q11                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
+    VTBL.8      D25,{D7},D27                @II
+    VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    VZIP.8      D24,D25                     @II
+
+    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)
+
+    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VEXT.8      Q7,Q7,Q7,#2                 @III sign_up = vextq_s8(sign_up, sign_up, 2)
+
+    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VUZP.8      D18,D19                     @III
+
+    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VTBL.8      D22,{D6},D18                @III
+    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VTBL.8      D23,{D7},D19                @III
+    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VZIP.8      D22,D23                     @III
+
+    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
+    VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
+    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
+    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    CMP         r7,#1                       @III
+
+    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
+    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    BGT         PU1_SRC_LOOP                @If more than 1 row remains, jump to PU1_SRC_LOOP
+    BLT         INNER_LOOP_DONE
+
+
+    ADD         r11,r0,r1,LSL #1            @*pu1_src + src_strd
+    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUB         r5,r12,r7                   @ht_tmp - row
+
+    ADD         r8,r14,r5,LSL #1            @pu1_src_left_cpy[(ht_tmp - row) * 2]
+    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
+    CMP         r7,#1
+
+    LDRB        r4,[r0,#16]                 @load the value pu1_src_cpy[16 - src_strd]
+    VLD1.8      D16,[r11]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r11]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r11,#8
+    LDRB        r9,[r0,#17]                 @load the value pu1_src_cpy[17 - src_strd]
+
+    BNE         NEXT_ROW_POINTER_ASSIGNED_3
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+    LDRB        r5,[r5,#3]                  @pu1_avail[3]
+    CMP         r5,#0
+    SUBNE       r8,r11,#4                   @pu1_src[src_strd - 2]
+
+NEXT_ROW_POINTER_ASSIGNED_3:
+    LDRH        r5,[r8,#2]
+    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
+
+    SUB         r8,r8,r4                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+    VMOV.16     D19[3],r5                   @vsetq_lane_u8
+    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
+
+    CMP         r8,#0
+    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+    SUB         r10,r10,r9                  @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    MVNLT       r8,#0
+    VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+    CMP         r10,#0
+    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 0)
+    MVNLT       r10,#0
+
+    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 1)
+    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q11,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q9,Q9,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_down)
+    VTBL.8      D18,{D28},D18               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    VTBL.8      D19,{D28},D19               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VUZP.8      D18,D19
+
+    VTBL.8      D22,{D6},D18
+    VTBL.8      D23,{D7},D19
+
+    VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VZIP.8      D22,D23
+
+    VADDW.S8    Q10,Q10,D22                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VADDW.S8    Q9,Q9,D23                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+INNER_LOOP_DONE:
+
+    LDR         r8,[sp,#0x118]              @Loads ht
+    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+
+    LSL         r8,r8,#1
+    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
+    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
+
+SRC_LEFT_LOOP:
+    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
+    SUBS        r8,r8,#4
+    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP
+
+    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
+    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    CMP         r6,#8                       @Check whether residue remains
+
+    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    LDR         r7,[sp,#0x114]              @Loads wd
+    LDR         r0,[sp,#0x100]              @Loads *pu1_src
+    SUB         r7,r7,r6
+    ADD         r0,r0,r7
+    BGT         WIDTH_LOOP_16               @If more than 8 columns remain, jump to WIDTH_LOOP_16
+    BEQ         WIDTH_RESIDUE               @If exactly 8 columns remain, jump to the residue loop
+
+WD_16_HT_4_LOOP:
+    LDR         r7,[sp,#0x114]              @Loads wd
+
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+    CMP         r6,r7                       @col == wd
+
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+    MOVNE       r8,#-1
+    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    CMP         r6,#16                      @if(col == 16)
+    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
+    LDRB        r8,[r5,#1]                  @pu1_avail[1]
+    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+    LDRB        r11,[r5,#2]                 @pu1_avail[2]
+    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
+
+    CMP         r11,#0
+    MOVNE       r8,r3
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+    ADD         r8,r8,#2                    @pu1_src - src_strd + 2
+
+    ADD         r3,r3,#16
+    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    SUB         r8,#8
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+
+    LDR         r4,[sp,#0x118]              @Loads ht
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+    LDR         r7,[sp,#0x114]              @Loads wd
+
+    SUB         r7,r7,r6                    @(wd - col)
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+    ADD         r7,r7,#14                   @14 + (wd - col)
+
+    LDR         r8,[sp,#0x100]              @Loads *pu1_src
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+    LDRH        r8,[r7]                     @load the value and increment by src_strd
+    SUBS        r4,r4,#1                    @decrement the loop count
+
+    STRH        r8,[r5],#2                  @store it in the stack pointer
+    ADD         r7,r7,r1
+    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+    VMOV.I8     Q9,#0
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+    ADD         r9,r0,r1                    @*pu1_src + src_strd
+
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+    VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r9,#8
+    LDRB        r5,[r5,#3]                  @pu1_avail[3]
+
+    SUB         r11,r12,r7                  @ht_tmp - row
+    ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]
+    ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
+
+    CMP         r5,#0
+    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
+    CMP         r7,#1
+    SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]
+
+NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
+    LDRH        r5,[r8]
+    VMOV.16     D19[3],r5                   @vsetq_lane_u8
+    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+
+    CMP         r7,r12
+    BLT         SIGN_UP_CHANGE_WD_16_HT_4
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+    CMP         r5,#0
+    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
+    SUB         r9,r0,r1
+
+    LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]
+
+    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
+    SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+    LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
+    CMP         r8,#0
+
+    MVNLT       r8,#0
+    SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+    CMP         r10,#0
+    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 0)
+    MVNLT       r10,#0
+
+    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
+    VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+    VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)
+
+    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+
+    VUZP.8      D26,D27
+    VTBL.8      D24,{D6},D26
+    VTBL.8      D25,{D7},D27
+    VZIP.8      D24,D25
+
+    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
+    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
+    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+    LDR         r8,[sp,#0x118]              @Loads ht
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
+    SUBS        r8,r8,#2
+    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP_WD_16_HT_4
+
+    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
+    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    BGT         WD_16_HT_4_LOOP             @If columns remain, jump to WD_16_HT_4_LOOP
+
+WIDTH_RESIDUE:
+    LDR         r7,[sp,#0x114]              @Loads wd
+
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+    CMP         r6,r7                       @wd_residue == wd
+
+    LDREQB      r8,[r5]                     @pu1_avail[0]
+
+    MOVNE       r8,#-1
+    LDRB        r11,[r5,#1]                 @pu1_avail[1]
+
+    LDRB        r9,[r5,#2]                  @pu1_avail[2]
+    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    CMP         r9,#0
+
+    SUBEQ       r10,r0,r1                   @pu1_src - src_strd
+    VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+    MOVNE       r10,r3
+
+    ADD         r10,r10,#2                  @pu1_src - src_strd + 2
+    VMOV.8      d8[6],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+
+    LDR         r4,[sp,#0x118]              @Loads ht
+    VMOV.8      d8[7],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+    LDR         r7,[sp,#0x114]              @Loads wd
+
+    LDR         r8,[sp,#0x100]              @Loads *pu1_src
+    VLD1.8      D10,[r10]!                  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    VLD1.8      D11,[r10]                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    SUB         r10,#8
+    SUB         r7,r7,#2                    @(wd - 2)
+
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+    LDRH        r8,[r7]                     @load the value and increment by src_strd
+    ADD         r7,r7,r1
+    STRH        r8,[r5],#2                  @store it in the stack pointer
+    SUBS        r4,r4,#1                    @decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
+
+    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
+    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
+    SUB         r0,#8
+
+    VMOV.I8     Q9,#0
+    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
+    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         r7,r12                      @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_RESIDUE:
+    ADD         r9,r0,r1                    @*pu1_src + src_strd
+
+    SUB         r11,r12,r7                  @ht_tmp - row
+    VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         r9,#8
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+
+    LDRB        r5,[r5,#3]                  @pu1_avail[3]
+    ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    CMP         r5,#0
+    ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
+
+    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
+    CMP         r7,#1
+    SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]
+
+NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
+    LDRB        r5,[r8]
+
+    LDRB        r8,[r8,#1]
+    VMOV.8      D19[6],r5                   @vsetq_lane_u8
+    CMP         r7,r12
+
+    VMOV.8      D19[7],r8                   @vsetq_lane_u8
+    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+
+    BLT         SIGN_UP_CHANGE_RESIDUE
+    LDR         r5,[sp,#0x108]              @Loads pu1_avail
+    LDRB        r5,[r5,#2]                  @pu1_avail[2]
+    CMP         r5,#0
+    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
+    SUB         r9,r0,r1
+
+    LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]
+
+    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
+    SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+    LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
+    CMP         r8,#0
+
+    MVNLT       r8,#0
+    SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+    CMP         r10,#0
+    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 0)
+    MVNLT       r10,#0
+
+    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
+    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
+    VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+    VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)
+
+    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+
+    VUZP.8      D26,D27
+    VTBL.8      D24,{D6},D26
+    VTBL.8      D25,{D7},D27
+    VZIP.8      D24,D25
+
+    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
+    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
+    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP_RESIDUE
+
+    LDR         r8,[sp,#0x118]              @Loads ht
+    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
+
+    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
+
+SRC_LEFT_LOOP_RESIDUE:
+    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
+    SUBS        r8,r8,#2
+    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+    LDR         r7,[sp,#0x114]              @Loads wd
+    LDR         r8,[sp,#0x118]              @Loads ht
+
+    LDR         r0,[sp,#0x100]              @Loads *pu1_src
+    SUB         r10,r7,#2                   @wd - 2
+
+    LDRH        r9,[sp,#6]                  @load u1_pos_0_0_tmp from stack pointer
+    SUB         r8,r8,#1                    @ht - 1
+
+    STRH        r9,[r0,r10]                 @pu1_src_org[wd - 2] = u1_pos_0_0_tmp
+    MLA         r6,r8,r1,r0                 @pu1_src[(ht - 1) * src_strd]
+
+    LDR         r4,[sp,#0xFC]               @Loads pu1_src_top_left
+
+    LDRH        r9,[sp,#8]                  @load u1_pos_wd_ht_tmp_u from stack pointer
+    ADD         r12,sp,#10                  @*au1_src_top_tmp
+
+    STRH        r9,[r6]                     @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
+
+    LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
+    STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
+    LDR         r3,[sp,#0x10C]              @Loads pu1_src_top
+
+SRC_TOP_LOOP:
+    VLD1.8      D0,[r12]!                   @load au1_src_top_tmp[col]
+    SUBS        r7,r7,#8                    @Decrement the width
+    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
+    BNE         SRC_TOP_LOOP
+
+END_LOOPS:
+    ADD         sp,sp,#0xD4
+    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
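+
+@ For reference, a scalar C sketch of the per-pixel operation the loops
+@ above implement. This is illustrative only: SIGN and CLIP3 are assumed
+@ helper names, the +2/-2 neighbour offsets are for this interleaved
+@ chroma case, and the tables are the edge table (gi1_table_edge_idx)
+@ and the per-component sao offset tables held in D6/D7 above:
+@
+@     edge_idx = 2 + SIGN(pu1_src_cpy[col] - pu1_prev_row[col + 2])
+@                  + SIGN(pu1_src_cpy[col] - pu1_next_row[col - 2]);
+@     edge_idx = gi1_table_edge_idx[edge_idx] & au1_mask[col];
+@     if(edge_idx)
+@         pu1_src_cpy[col] = CLIP3(pu1_src_cpy[col] + pi1_sao_offset[edge_idx],
+@                                  0, 255);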
+
+
+
diff --git a/common/arm/ihevc_weighted_pred_bi.s b/common/arm/ihevc_weighted_pred_bi.s
new file mode 100644
index 0000000..5308423
--- /dev/null
+++ b/common/arm/ihevc_weighted_pred_bi.s
@@ -0,0 +1,269 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_weighted_pred_bi.s
+@*
+@* @brief
+@*  contains function definitions for weighted prediction used in inter
+@* prediction
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*  - ihevc_weighted_pred_bi()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*  does bi-weighted prediction on the arrays pointed by pi2_src1 and
+@* pi2_src2 and stores the result at the location pointed by pu1_dst.
+@* assumptions : the function is optimized assuming width and height are
+@* multiples of 2.
+@*
+@* @par description:
+@*  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
+@* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
+@*
+@* @param[in] pi2_src1
+@*  pointer to source 1
+@*
+@* @param[in] pi2_src2
+@*  pointer to source 2
+@*
+@* @param[out] pu1_dst
+@*  pointer to destination
+@*
+@* @param[in] src_strd1
+@*  source stride 1
+@*
+@* @param[in] src_strd2
+@*  source stride 2
+@*
+@* @param[in] dst_strd
+@*  destination stride
+@*
+@* @param[in] wgt0
+@*  weight to be multiplied to source 1
+@*
+@* @param[in] off0
+@*  offset 0
+@*
+@* @param[in] wgt1
+@*  weight to be multiplied to source 2
+@*
+@* @param[in] off1
+@*  offset 1
+@*
+@* @param[in] shift
+@*  (14 bit depth) + log2_weight_denominator
+@*
+@* @param[in] lvl_shift1
+@*  added before shift and offset
+@*
+@* @param[in] lvl_shift2
+@*  added before shift and offset
+@*
+@* @param[in] ht
+@*  height of the source
+@*
+@* @param[in] wd
+@*  width of the source
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_weighted_pred_bi(word16 *pi2_src1,
+@                            word16 *pi2_src2,
+@                            uword8 *pu1_dst,
+@                            word32 src_strd1,
+@                            word32 src_strd2,
+@                            word32 dst_strd,
+@                            word32 wgt0,
+@                            word32 off0,
+@                            word32 wgt1,
+@                            word32 off1,
+@                            word32 shift,
+@                            word32 lvl_shift1,
+@                            word32 lvl_shift2,
+@                            word32 ht,
+@                            word32 wd)
+
+@**************variables vs registers*****************************************
+@   r0 => *pi2_src1
+@   r1 => *pi2_src2
+@   r2 => *pu1_dst
+@   r3 =>  src_strd1
+@   r4 =>  src_strd2
+@   r5 =>  dst_strd
+@   r6 =>  wgt0
+@   r7 =>  off0
+@   r8 =>  wgt1
+@   r9 =>  off1
+@   r10 =>  shift
+@   r11 =>  lvl_shift1
+@   r12 =>  lvl_shift2
+@   r14 =>  ht
+@   r7  =>  wd
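+
+@ for reference, a scalar C sketch of what the kernel below computes
+@ (CLIP_U8 is an assumed, illustrative helper name for the final clip to
+@ [0, 255], performed below with vqmovun/vqmovn):
+@
+@     for(row = 0; row < ht; row++)
+@         for(col = 0; col < wd; col++)
+@         {
+@             i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0
+@                    + (pi2_src2[col] + lvl_shift2) * wgt1
+@                    + ((off0 + off1 + 1) << (shift - 1));
+@             pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+@         }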
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_weighted_pred_bi_a9q
+
+.type ihevc_weighted_pred_bi_a9q, %function
+
+ihevc_weighted_pred_bi_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r6,[sp,#48]                 @load wgt0
+    ldr         r11,[sp,#68]                @load lvl_shift1
+    ldr         r12,[sp,#72]                @load lvl_shift2
+    vmov.s16    d7[0],r6                    @moved for scalar multiplication
+    mul         r4,r11,r6                   @lvl_shift1 * wgt0
+    ldr         r8,[sp,#56]                 @load wgt1
+    ldr         r7,[sp,#52]                 @load off0
+    vmov.s16    d7[1],r8                    @moved for scalar multiplication
+    mla         r4,r12,r8,r4                @(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
+    ldr         r9,[sp,#60]                 @load off1
+    add         r5,r7,r9                    @off0 + off1
+    ldr         r10,[sp,#64]                @load shift
+    add         r5,r5,#1                    @off0 + off1 + 1
+    sub         r14,r10,#1                  @shift - 1
+    ldr         r7,[sp,#80]                 @load wd
+    lsl         r5,r5,r14                   @((off0 + off1 + 1) << (shift - 1))
+    vdup.u32    q14,r10                     @vmovq_n_s32(0-shift)
+    add         r4,r4,r5                    @tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
+    vdup.u32    q15,r4                      @vmovq_n_s32(tmp_lvl_shift)
+    vneg.s32    q14,q14
+    ldr         r4,[sp,#40]                 @load src_strd2
+    lsl         r9,r7,#1
+    ldr         r5,[sp,#44]                 @load dst_strd
+    lsl         r3,r3,#1
+    ldr         r14,[sp,#76]                @load ht
+    lsl         r4,r4,#1
+
+    cmp         r14,#0                      @check ht == 0
+    beq         end_loops                   @if equal, then end the function
+
+outer_loop:
+    cmp         r7,#0                       @check wd == 0
+    beq         end_loops                   @if equal, then end the function
+
+core_loop:
+    add         r6,r0,r3                    @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
+    add         r8,r1,r4                    @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src2 is a 16 bit pointer)
+    vld1.s16    {d0},[r0]!                  @load and increment the pi2_src1
+    add         r10,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
+    vld1.s16    {d1},[r1]!                  @load and increment the pi2_src2
+    vmull.s16   q2,d0,d7[0]                 @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
+    vld1.s16    {d2},[r6],r3                @load and increment the pi2_src_tmp1 ii iteration
+    vmull.s16   q4,d1,d7[1]                 @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
+    vld1.s16    {d3},[r8],r4                @load and increment the pi2_src_tmp2 ii iteration
+    vadd.s32    q2,q2,q4                    @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
+
+    vld1.s16    {d0},[r6],r3                @load and increment the pi2_src1 iii iteration
+    vmull.s16   q5,d2,d7[0]                 @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
+
+    vld1.s16    {d1},[r8],r4                @load and increment the pi2_src2 iii iteration
+    vadd.s32    q2,q2,q15                   @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+    vmull.s16   q7,d0,d7[0]                 @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
+
+    vld1.s16    {d2},[r6],r3                @load and increment the pi2_src_tmp1 iv iteration
+    vmull.s16   q6,d3,d7[1]                 @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
+    vshl.s32    q2,q2,q14                   @vshlq_s32(i4_tmp1_t1, tmp_shift_t)
+
+    vld1.s16    {d3},[r8],r4                @load and increment the pi2_src_tmp2 iv iteration
+    vadd.s32    q5,q5,q6                    @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
+
+    vqmovun.s32 d4,q2                       @vqmovun_s32(sto_res_tmp1)
+    vmull.s16   q8,d1,d7[1]                 @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration
+
+    vadd.s32    q5,q5,q15                   @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
+    vmov.s32    d5,d4                       @vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+    vadd.s32    q7,q7,q8                    @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration
+
+    vshl.s32    q5,q5,q14                   @vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
+    vmull.s16   q9,d2,d7[0]                 @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
+    vqmovn.u16  d4,q2                       @vqmovn_u16(sto_res_tmp3)
+    vadd.s32    q7,q7,q15                   @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+
+    vqmovun.s32 d10,q5                      @vqmovun_s32(sto_res_tmp1) ii iteration
+    vmull.s16   q10,d3,d7[1]                @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration
+
+    vshl.s32    q7,q7,q14                   @vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
+    vmov.s32    d11,d10                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
+
+    vadd.s32    q9,q9,q10                   @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+    vqmovun.s32 d14,q7                      @vqmovun_s32(sto_res_tmp1) iii iteration
+
+    vadd.s32    q9,q9,q15                   @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
+    vst1.s32    {d4[0]},[r2]!               @store pu1_dst i iteration
+
+    vqmovn.u16  d10,q5                      @vqmovn_u16(sto_res_tmp3) ii iteration
+    vshl.s32    q9,q9,q14                   @vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
+    vst1.s32    {d10[0]},[r10],r5           @store pu1_dst ii iteration
+
+
+    vmov.s32    d15,d14                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
+    vqmovn.u16  d14,q7                      @vqmovn_u16(sto_res_tmp3) iii iteration
+    vqmovun.s32 d18,q9                      @vqmovun_s32(sto_res_tmp1) iv iteration
+    vmov.s32    d19,d18                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+    vst1.s32    {d14[0]},[r10],r5           @store pu1_dst iii iteration
+    vqmovn.u16  d18,q9                      @vqmovn_u16(sto_res_tmp3) iv iteration
+    subs        r7,r7,#4                    @decrement wd by 4 and check for 0
+    vst1.s32    {d18[0]},[r10],r5           @store pu1_dst iv iteration
+
+    bgt         core_loop                   @if greater than 0 repeat the core loop again
+
+end_core_loop:
+    rsb         r11,r9,r3,lsl #2            @4*src_strd1 - wd
+    subs        r14,r14,#4                  @decrement the ht by 4
+    rsb         r12,r9,r4,lsl #2            @4*src_strd2 - wd
+    add         r0,r0,r11                   @pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is a 16 bit pointer, double the increment with double the wd decrement)
+    asr         r7,r9,#1                    @restore wd
+    add         r1,r1,r12                   @pi2_src2 + 4*src_strd2 - 2*wd
+    rsb         r10,r7,r5,lsl #2            @4*dst_strd - wd
+    add         r2,r2,r10                   @pu1_dst + 4*dst_strd - wd
+    bgt         core_loop                   @if ht is greater than 0 goto core_loop
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_weighted_pred_bi_default.s b/common/arm/ihevc_weighted_pred_bi_default.s
new file mode 100644
index 0000000..b560c15
--- /dev/null
+++ b/common/arm/ihevc_weighted_pred_bi_default.s
@@ -0,0 +1,494 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_weighted_pred_bi_default.s
+@*
+@* @brief
+@*  contains function definitions for weighted prediction used in inter
+@* prediction
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*  - ihevc_weighted_pred_bi_default()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*  does default bi-weighted prediction on the arrays pointed by pi2_src1 and
+@* pi2_src2 and stores the result at the location pointed by pu1_dst.
+@* assumptions : the function is optimized assuming width and height are
+@* multiples of 2.
+@*
+@* @par description:
+@*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+@* >> shift,  where shift = 15 - bitdepth
+@*
+@* @param[in] pi2_src1
+@*  pointer to source 1
+@*
+@* @param[in] pi2_src2
+@*  pointer to source 2
+@*
+@* @param[out] pu1_dst
+@*  pointer to destination
+@*
+@* @param[in] src_strd1
+@*  source stride 1
+@*
+@* @param[in] src_strd2
+@*  source stride 2
+@*
+@* @param[in] dst_strd
+@*  destination stride
+@*
+@* @param[in] lvl_shift1
+@*  added before shift and offset
+@*
+@* @param[in] lvl_shift2
+@*  added before shift and offset
+@*
+@* @param[in] ht
+@*  height of the source
+@*
+@* @param[in] wd
+@*  width of the source
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
+@                                    word16 *pi2_src2,
+@                                    uword8 *pu1_dst,
+@                                    word32 src_strd1,
+@                                    word32 src_strd2,
+@                                    word32 dst_strd,
+@                                    word32 lvl_shift1,
+@                                    word32 lvl_shift2,
+@                                    word32 ht,
+@                                    word32 wd)
+
+@**************variables vs registers*****************************************
+@   r0 => *pi2_src1
+@   r1 => *pi2_src2
+@   r2 => *pu1_dst
+@   r3 =>  src_strd1
+@   r4 =>  src_strd2
+@   r5 =>  dst_strd
+@   r6 =>  lvl_shift1
+@   r7 =>  lvl_shift2
+@   r8 =>  ht
+@   r9 =>  wd
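+
+@ for reference, a scalar C sketch of the default bi prediction below;
+@ shift = 15 - bitdepth, so for 8-bit content shift is 7 and the rounding
+@ term 1 << (shift - 1) is the 0x40 moved into q0 (CLIP_U8 is an assumed,
+@ illustrative helper name for the final clip done with vqshrun):
+@
+@     for(row = 0; row < ht; row++)
+@         for(col = 0; col < wd; col++)
+@         {
+@             i4_tmp = (pi2_src1[col] + lvl_shift1)
+@                    + (pi2_src2[col] + lvl_shift2)
+@                    + (1 << (shift - 1));
+@             pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+@         }
+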
+.text
+.align 4
+
+
+
+
+.globl ihevc_weighted_pred_bi_default_a9q
+
+.type ihevc_weighted_pred_bi_default_a9q, %function
+
+ihevc_weighted_pred_bi_default_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+    ldr         r4,[sp,#40]                 @load src_strd2
+    lsl         r3,r3,#1
+    ldr         r5,[sp,#44]                 @load dst_strd
+    ldr         r6,[sp,#48]                 @load lvl_shift1
+    lsl         r4,r4,#1
+    ldr         r7,[sp,#52]                 @load lvl_shift2
+    ldr         r8,[sp,#56]                 @load ht
+    ldr         r9,[sp,#60]                 @load wd
+    vdup.16     q2,r6                       @lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1)
+    vdup.16     q3,r7                       @lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2)
+    vmov.i16    q0,#0x40                    @tmp_lvl_shift = 1 << (shift - 1)
+    vadd.i16    q2,q3
+    vadd.s16    q0,q0,q2
+@   vmvn.i32    q1,#0x6                         @vmovq_n_s32(tmp_shift)
+    lsl         r6,r9,#1
+    rsb         r7,r6,r3,lsl #2             @4*src_strd1 - wd
+    rsb         r10,r6,r4,lsl #2            @4*src_strd2 - wd
+    @asr            r6,#1
+    @rsb            r6,r6,r5,lsl #2             @4*dst_strd - wd
+
+    cmp         r8,#0                       @check ht == 0
+    beq         end_loops                   @if equal, then end the function
+
+chroma_decision:
+    orr         r14,r8,r9
+    cmp         r14,#10
+    beq         outer_loop_chroma_8x2
+
+    cmp         r14,#6
+    beq         outer_loop_chroma_4x2
+
+
+luma_decision:
+    cmp         r9,#24
+    beq         outer_loop_8
+
+    cmp         r9,#16
+    bge         outer_loop_16
+
+    cmp         r9,#12
+    beq         outer_loop_4
+
+    cmp         r9,#8
+    bge         outer_loop_8
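+
+@ note on the dispatch above: for the (ht, wd) pairs this kernel is called
+@ with (both multiples of 2), ht|wd == 10 can only come from the
+@ interleaved-chroma 8x2 block (8|2) and ht|wd == 6 only from the 4x2
+@ block (4|2); in C terms, roughly:
+@
+@     if((ht | wd) == 10)      /* wd == 8, ht == 2 */  /* chroma 8x2 path */
+@     else if((ht | wd) == 6)  /* wd == 4, ht == 2 */  /* chroma 4x2 path */
+@     else                     /* luma: pick the 4-, 8- or 16-wide loop by wd */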
+
+
+
+
+
+
+outer_loop_4:
+    cmp         r9,#0                       @check wd == 0
+    beq         end_loops                   @if equal, then end the function
+
+core_loop_4:
+    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
+    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src2 is a 16 bit pointer)
+    vld1.s16    {d6},[r0]!                  @load and increment the pi2_src1
+    add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
+    vld1.s16    {d7},[r1]!                  @load and increment the pi2_src2
+    vld1.s16    {d8},[r11],r3               @load and increment the pi2_src1 ii iteration
+    vqadd.s16   d18,d6,d7
+    vqadd.s16   d18,d18,d0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+    vld1.s16    {d9},[r12],r4               @load and increment the pi2_src2 ii iteration
+    vqadd.s16   d20,d8,d9                   @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+    vqadd.s16   d19,d20,d0                  @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+    vqshrun.s16 d20,q9,#7
+    vld1.s16    {d22},[r11],r3              @load and increment the pi2_src1 iii iteration
+    vld1.s16    {d23},[r12],r4              @load and increment the pi2_src2 iii iteration
+    vqadd.s16   d30,d22,d23
+    vqadd.s16   d30,d30,d0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+    vld1.s16    {d24},[r11],r3              @load and increment the pi2_src1 iv iteration
+    vld1.s16    {d25},[r12],r4              @load and increment the pi2_src2 iv iteration
+    vqadd.s16   d18,d24,d25                 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+    vqadd.s16   d31,d18,d0
+    vst1.32     {d20[0]},[r2]!              @store pu1_dst i iteration
+    vst1.32     {d20[1]},[r14],r5           @store pu1_dst ii iteration
+    vqshrun.s16 d30,q15,#7
+    vst1.32     {d30[0]},[r14],r5           @store pu1_dst iii iteration
+    subs        r9,r9,#4                    @decrement wd by 4 and check for 0
+    vst1.32     {d30[1]},[r14],r5           @store pu1_dst iv iteration
+    bgt         core_loop_4                 @if greater than 0 repeat the core loop again
+
+end_core_loop_4:
+
+    subs        r8,r8,#4                    @decrement the ht by 4
+
+    add         r0,r0,r7                    @pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is a 16 bit pointer, double the increment with double the wd decrement)
+    asr         r9,r6,#1
+    add         r1,r1,r10                   @pi2_src2 + 4*src_strd2 - 2*wd
+    rsb         r14,r9,r5,lsl #2            @4*dst_strd - wd
+    add         r2,r2,r14
+                                            @pu1_dst + 4*dst_strd - wd
+    bgt         core_loop_4                 @if ht is greater than 0 goto core_loop_4
+
+    b           end_loops
+
+
+@ this is only for chroma module with input 2x2
+outer_loop_chroma_4x2:
+    cmp         r9,#0                       @check wd == 0
+    beq         end_loops                   @if equal, then end the function
+    rsb         r7,r6,r3,lsl #1             @2*src_strd1 - wd
+    rsb         r10,r6,r4,lsl #1            @2*src_strd2 - wd
+core_loop_chroma_4x2:
+    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
+    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src2 is a 16 bit pointer)
+    vld1.s16    {d6},[r0]!                  @load and increment the pi2_src1
+    add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
+    vld1.s16    {d7},[r1]!                  @load and increment the pi2_src2
+    vld1.s16    {d8},[r11],r3               @load and increment the pi2_src1 ii iteration
+    vqadd.s16   d18,d6,d7
+    vqadd.s16   d18,d18,d0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+    vld1.s16    {d9},[r12],r4               @load and increment the pi2_src2 ii iteration
+    vqadd.s16   d20,d8,d9                   @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+    vqadd.s16   d19,d20,d0                  @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+    vqshrun.s16 d20,q9,#7
+    vst1.32     {d20[0]},[r2]!              @store pu1_dst i iteration
+    vst1.32     {d20[1]},[r14],r5           @store pu1_dst ii iteration
+
+    subs        r9,r9,#4                    @decrement wd by 4 and check for 0
+
+    bgt         core_loop_chroma_4x2        @if greater than 0 repeat the core loop again
+
+end_core_loop_chroma_4x2:
+
+    subs        r8,r8,#2                    @decrement the ht by 2
+
+    add         r0,r0,r7                    @pi2_src1 + 2*src_strd1 - 2*wd(since pi2_src1 is a 16 bit pointer, double the increment with double the wd decrement)
+    asr         r9,r6,#1
+    add         r1,r1,r10                   @pi2_src2 + 2*src_strd2 - 2*wd
+    rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
+    add         r2,r2,r14
+                                            @pu1_dst + 2*dst_strd - wd
+    bgt         core_loop_chroma_4x2        @if ht is greater than 0 goto core_loop_chroma_4x2
+
+    b           end_loops
+
+
+
+outer_loop_8:
+    cmp         r9,#0                       @check wd == 0
+    beq         end_loops                   @if equal, then end the function
+    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
+    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src2 is a 16 bit pointer)
+core_loop_8:
+
+    vld1.s16    {q12},[r0]!                 @load and increment the pi2_src1
+    add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
+    vld1.s16    {q13},[r1]!                 @load and increment the pi2_src2
+    vqadd.s16   q12,q12,q13
+    vld1.s16    {q14},[r11],r3              @load and increment the pi2_src1 ii iteration
+    vqadd.s16   q12,q12,q0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+    vld1.s16    {q15},[r12],r4              @load and increment the pi2_src2 ii iteration
+    vld1.s16    {q8},[r11],r3               @load and increment the pi2_src1 iii iteration
+    vqadd.s16   q11,q14,q15                 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+    vld1.s16    {q9},[r12],r4               @load and increment the pi2_src2 iii iteration
+    vqadd.s16   q11,q11,q0                  @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+    vqshrun.s16 d20,q12,#7
+    vld1.s16    {q6},[r11],r3               @load and increment the pi2_src1 iv iteration
+    vqadd.s16   q15,q8,q9
+    vqshrun.s16 d21,q11,#7
+    vld1.s16    {q7},[r12],r4               @load and increment the pi2_src2 iv iteration
+    vqadd.s16   q15,q15,q0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+    vst1.32     {d20},[r2]!                 @store pu1_dst i iteration
+    vqadd.s16   q4,q6,q7                    @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+    vst1.32     {d21},[r14],r5              @store pu1_dst ii iteration
+    vqadd.s16   q4,q4,q0
+    vqshrun.s16 d30,q15,#7
+    vqshrun.s16 d31,q4,#7
+    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
+    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src2 is a 16 bit pointer)
+    vst1.32     {d30},[r14],r5              @store pu1_dst iii iteration
+    subs        r9,r9,#8                    @decrement wd by 8 and check for 0
+    vst1.32     {d31},[r14],r5              @store pu1_dst iv iteration
+    bgt         core_loop_8                 @if greater than 0 repeat the core loop again
+
+end_core_loop_8:
+
+    subs        r8,r8,#4                    @decrement the ht by 4
+
+    add         r0,r0,r7                    @pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is a 16 bit pointer, double the increment with double the wd decrement)
+    asr         r9,r6,#1
+    add         r1,r1,r10                   @pi2_src2 + 4*src_strd2 - 2*wd
+    rsb         r14,r9,r5,lsl #2            @4*dst_strd - wd
+    add         r2,r2,r14
+    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
+    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src2 is a 16 bit pointer)
+
+    bgt         core_loop_8
+    b           end_loops
+
+
+
+@ this is only for chroma module with input 4x2
+outer_loop_chroma_8x2:
+    cmp         r9,#0                       @check wd == 0
+    beq         end_loops                   @if equal, then end the function
+    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
+    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src2 is a 16 bit pointer)
+    rsb         r7,r6,r3,lsl #1             @2*src_strd1 - wd
+    rsb         r10,r6,r4,lsl #1            @2*src_strd2 - wd
+core_loop_chroma_8x2:
+
+    vld1.s16    {q12},[r0]!                 @load and increment the pi2_src1
+    add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
+    vld1.s16    {q13},[r1]!                 @load and increment the pi2_src2
+    vqadd.s16   q12,q12,q13
+    vld1.s16    {q14},[r11],r3              @load and increment the pi2_src1 ii iteration
+    vqadd.s16   q12,q12,q0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+    vld1.s16    {q15},[r12],r4              @load and increment the pi2_src2 ii iteration
+    vld1.s16    {q8},[r11],r3               @load and increment the pi2_src1 iii iteration
+    vqadd.s16   q11,q14,q15                 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+    vqadd.s16   q11,q11,q0                  @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+    vqshrun.s16 d20,q12,#7
+    vqshrun.s16 d21,q11,#7
+    vst1.32     {d20},[r2]!                 @store pu1_dst i iteration
+    vst1.32     {d21},[r14],r5              @store pu1_dst ii iteration
+
+    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
+    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src2 is a 16 bit pointer)
+                                            @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteratio
+    subs        r9,r9,#8                    @decrement wd by 8 and check for 0
+
+    bgt         core_loop_chroma_8x2        @if greater than 0 repeat the core loop again
+
+end_core_loop_chroma_8x2:
+
+    subs        r8,r8,#2                    @decrement the ht by 2
+
+    add         r0,r0,r7                    @pi2_src1 + 2*src_strd1 - 2*wd(since pi2_src1 is a 16 bit pointer, double the increment with double the wd decrement)
+    asr         r9,r6,#1                    @restore wd
+    add         r1,r1,r10                   @pi2_src2 + 2*src_strd2 - 2*wd
+    rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
+    add         r2,r2,r14
+    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
+    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src2 is a 16 bit pointer)
+
+    bgt         core_loop_chroma_8x2
+
+    b           end_loops
+
+
+
+
+outer_loop_16:
+    cmp         r9,#0                       @check wd == 0
+    beq         end_loops                   @if equal, then end the function
+    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
+    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src2 is a 16 bit pointer)
+    rsb         r7,r6,r3,lsl #1             @2*src_strd1 - wd
+    mov         r14,#16
+    sub         r10,r14,r5
+    sub         r11,r3,r14
+    sub         r12,r14,r3
+
+    rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
+
+
+
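+@ the 16-wide path below is software pipelined: prolog_16 issues the first
+@ round of loads and additions, core_loop_16 overlaps the saturate/store of
+@ iteration n with the loads of iteration n+1, and epilog_16 drains the last
+@ results. a minimal scalar sketch of what each lane computes (illustrative
+@ only; OFFSET stands for the rounding constant the caller preloads into q0,
+@ and CLIP_U8 for a hypothetical saturate-to-[0,255] helper):
+@
+@     for (x = 0; x < wd; x++)
+@         pu1_dst[x] = CLIP_U8((pi2_src1[x] + pi2_src2[x] + OFFSET) >> 7);
+@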
+prolog_16:
+
+
+    vld1.s16    {q1},[r0]!                  @load and increment the pi2_src1
+    vld1.s16    {q2},[r1]!                  @load and increment the pi2_src2
+    vld1.s16    {q5},[r0],r11               @load and increment the pi2_src1
+    vld1.s16    {q6},[r1],r11               @load and increment the pi2_src2
+    vld1.s16    {q3},[r0]!                  @load and increment the pi2_src1 ii iteration
+    subs        r9,r9,#16
+    vld1.s16    {q4},[r1]!                  @load and increment the pi2_src2 ii iteration
+    subeq       r8,r8,#2
+    vqadd.s16   q11,q1,q2
+    vld1.s16    {q7},[r0],r12               @load and increment the pi2_src1 ii iteration
+    vqadd.s16   q14,q5,q6
+    vld1.s16    {q8},[r1],r12               @load and increment the pi2_src2 ii iteration
+    addeq       r0,r0,r7
+    addeq       r1,r1,r7
+    vqadd.s16   q12,q3,q4
+    vld1.s16    {q1},[r0]!
+    vqadd.s16   q13,q7,q8
+@ if the input is chroma with 8x2 block size
+    cmp         r8,#0
+    beq         epilog_16
+
+    vld1.s16    {q2},[r1]!                  @load and increment the pi2_src2
+    vqadd.s16   q11,q11,q0
+    vld1.s16    {q5},[r0],r11               @load and increment the pi2_src1
+    vqadd.s16   q14,q14,q0
+    vld1.s16    {q6},[r1],r11               @load and increment the pi2_src2
+    vqadd.s16   q12,q12,q0
+    vld1.s16    {q3},[r0]!                  @load and increment the pi2_src1 ii iteration
+    vqadd.s16   q15,q13,q0
+    vqshrun.s16 d20,q11,#7
+    vld1.s16    {q4},[r1]!                  @load and increment the pi2_src2 ii iteration
+    vqshrun.s16 d21,q14,#7
+    vld1.s16    {q7},[r0],r12               @load and increment the pi2_src1 ii iteration
+    vqshrun.s16 d26,q12,#7
+    vld1.s16    {q8},[r1],r12               @load and increment the pi2_src2 ii iteration
+    vqshrun.s16 d27,q15,#7
+
+
+
+core_loop_16:
+
+    cmp         r9,#0
+    vqadd.s16   q11,q1,q2
+    asreq       r9,r6,#1
+    vst1.32     {q10},[r2],r5
+    vqadd.s16   q14,q5,q6
+    vst1.32     {q13},[r2],r10
+    addeq       r2,r2,r14
+    vqadd.s16   q12,q3,q4
+    subs        r9,r9,#16
+    addeq       r0,r0,r7
+    vqadd.s16   q13,q7,q8
+
+    addeq       r1,r1,r7
+    subeqs      r8,r8,#2                    @decrement the ht by 2
+    beq         epilog_16
+
+
+    vqadd.s16   q11,q11,q0
+    vld1.s16    {q1},[r0]!                  @load and increment the pi2_src1
+    vqadd.s16   q14,q14,q0
+    vld1.s16    {q2},[r1]!                  @load and increment the pi2_src2
+    vqadd.s16   q12,q12,q0
+    vld1.s16    {q5},[r0],r11               @load and increment the pi2_src1
+    vqadd.s16   q15,q13,q0
+    vld1.s16    {q6},[r1],r11               @load and increment the pi2_src2
+    vqshrun.s16 d20,q11,#7
+    vld1.s16    {q3},[r0]!                  @load and increment the pi2_src1 ii iteration
+    vqshrun.s16 d21,q14,#7
+    vld1.s16    {q4},[r1]!                  @load and increment the pi2_src2 ii iteration
+    vqshrun.s16 d26,q12,#7
+    vld1.s16    {q7},[r0],r12               @load and increment the pi2_src1 ii iteration
+    vqshrun.s16 d27,q15,#7
+    vld1.s16    {q8},[r1],r12               @load and increment the pi2_src2 ii iteration
+
+
+    b           core_loop_16
+
+
+epilog_16:
+
+    vqadd.s16   q11,q11,q0
+    vqadd.s16   q14,q14,q0
+    vqadd.s16   q12,q12,q0
+    vqadd.s16   q15,q13,q0
+    vqshrun.s16 d20,q11,#7
+    vqshrun.s16 d21,q14,#7
+    vqshrun.s16 d26,q12,#7
+    vqshrun.s16 d27,q15,#7
+    vst1.32     {q10},[r2],r5
+    vst1.32     {q13},[r2]
+
+
+
+end_core_loop_16:
+
+
+
+
+
+
+
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_weighted_pred_neon_intr.c b/common/arm/ihevc_weighted_pred_neon_intr.c
new file mode 100644
index 0000000..72b5d4f
--- /dev/null
+++ b/common/arm/ihevc_weighted_pred_neon_intr.c
@@ -0,0 +1,979 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_weighted_pred_neon_intr.c
+*
+* @brief
+*  Contains function definitions for weighted prediction used in inter
+* prediction
+*
+* @author
+*  Parthiban V
+*
+* @par List of Functions:
+*  - ihevc_weighted_pred_uni()
+*  - ihevc_weighted_pred_bi()
+*  - ihevc_weighted_pred_bi_default()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_inter_pred.h"
+#include "arm_neon.h"
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does uni-weighted prediction on the array pointed to by pi2_src and
+* stores the result at the location pointed to by pu1_dst. Assumption: the
+* function is optimized assuming the width is a multiple of 4 and the
+* height a multiple of 2.
+*
+* @par Description:
+*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+* offset
+*
+* @param[in] pi2_src
+*  Pointer to the source
+*
+* @param[out] pu1_dst
+*  Pointer to the destination
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight to be multiplied to the source
+*
+* @param[in] off0
+*  offset to be added after rounding and shifting
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 src_strd,
+                                      WORD32 dst_strd,
+                                      WORD32 wgt0,
+                                      WORD32 off0,
+                                      WORD32 shift,
+                                      WORD32 lvl_shift,
+                                      WORD32 ht,
+                                      WORD32 wd)
+{
+    WORD32 row, col;
+    int16x4_t pi2_src_val1;
+    int16x4_t pi2_src_val2;
+    int32x4_t i4_tmp1_t;
+    int32x4_t i4_tmp2_t;
+    int32x4_t sto_res_tmp1;
+    uint16x4_t sto_res_tmp2;
+    uint16x8_t sto_res_tmp3;
+    uint8x8_t sto_res;
+    int32x4_t tmp_lvl_shift_t;
+    WORD32 tmp_shift = 0 - shift;
+    int32x4_t tmp_shift_t;
+    WORD16 *pi2_src_tmp;
+    UWORD8 *pu1_dst_tmp;
+
+    WORD32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift);
+    tmp_lvl_shift += (1 << (shift - 1));
+    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
+    tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.                           */
+    /* The height loop is unrolled, hence 2 rows are processed per iteration                   */
+    /* and the stores likewise handle two rows.                                                */
+    /* vcombine_u16 is used since after narrowing we get a 16x4 value which can't be           */
+    /* saturated and narrowed directly.                                                        */
+
+    for(row = ht; row > 0; row -= 2)
+    {
+        for(col = wd; col > 0; col -= 4)
+        {
+            pi2_src_tmp = pi2_src + src_strd;
+
+            pu1_dst_tmp = pu1_dst + dst_strd;
+
+            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
+            pi2_src += 4;
+
+            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
+            i4_tmp1_t = vmull_n_s16(pi2_src_val1, (int16_t)wgt0);
+
+            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
+            i4_tmp2_t = vmull_n_s16(pi2_src_val2, (int16_t)wgt0);
+
+            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
+            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t);
+
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
+            sto_res = vqmovn_u16(sto_res_tmp3);
+
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+            pu1_dst += 4;
+
+            sto_res = vqmovn_u16(sto_res_tmp3);
+            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+        }
+        pi2_src += 2 * src_strd - wd;
+        pu1_dst += 2 * dst_strd - wd;
+    }
+}
+//WEIGHTED_PRED_UNI
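+
+/* A minimal scalar model of the routine above, assuming CLIP_U8 is a
+   hypothetical helper that saturates an int to [0, 255]:
+
+       WORD32 tmp = (pi2_src[x] + lvl_shift) * wgt0 + (1 << (shift - 1));
+       pu1_dst[x] = CLIP_U8((tmp >> shift) + off0);
+
+   The vector code folds lvl_shift * wgt0, off0 << shift and the rounding
+   term into the single constant tmp_lvl_shift_t up front (valid because
+   (a + (b << s)) >> s == (a >> s) + b), so the per-pixel work reduces to
+   one multiply, one add and one shift. */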
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma uni-weighted prediction on the array pointed to by pi2_src and
+* stores the result at the location pointed to by pu1_dst. Assumption: the
+* function is optimized assuming width and height are multiples of 2.
+*
+* @par Description:
+*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+* offset
+*
+* @param[in] pi2_src
+*  Pointer to the source
+*
+* @param[out] pu1_dst
+*  Pointer to the destination
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight to be multiplied to the source
+*
+* @param[in] off0
+*  offset to be added after rounding and shifting
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
+                                             UWORD8 *pu1_dst,
+                                             WORD32 src_strd,
+                                             WORD32 dst_strd,
+                                             WORD32 wgt0_cb,
+                                             WORD32 wgt0_cr,
+                                             WORD32 off0_cb,
+                                             WORD32 off0_cr,
+                                             WORD32 shift,
+                                             WORD32 lvl_shift,
+                                             WORD32 ht,
+                                             WORD32 wd)
+{
+    WORD32 row, col;
+    int16x4_t pi2_src_val1;
+    int16x4_t pi2_src_val2;
+    int32x4_t i4_tmp1_t;
+    int32x4_t i4_tmp2_t;
+    int32x4_t sto_res_tmp1;
+    uint16x4_t sto_res_tmp2;
+    uint16x8_t sto_res_tmp3;
+    uint8x8_t sto_res;
+    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
+    int32x4x2_t tmp_lvl_shift_t;
+    WORD32 tmp_shift = 0 - shift;
+    int32x4_t tmp_shift_t;
+    int16x4_t tmp_wgt0_u, tmp_wgt0_v;
+    int16x4x2_t wgt0;
+    WORD16 *pi2_src_tmp;
+    UWORD8 *pu1_dst_tmp;
+
+    WORD32 tmp_lvl_shift = lvl_shift * wgt0_cb + (off0_cb << shift);
+    tmp_lvl_shift += (1 << (shift - 1));
+    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);
+
+    tmp_lvl_shift = lvl_shift * wgt0_cr + (off0_cr << shift);
+    tmp_lvl_shift += (1 << (shift - 1));
+    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);
+
+    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);
+
+    tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
+    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
+    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
+
+    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.                           */
+    /* The height loop is unrolled, hence 2 rows are processed per iteration                   */
+    /* and the stores likewise handle two rows.                                                */
+    /* vcombine_u16 is used since after narrowing we get a 16x4 value which can't be           */
+    /* saturated and narrowed directly.                                                        */
+
+    for(row = ht; row > 0; row -= 2)
+    {
+        for(col = 2 * wd; col > 0; col -= 4)
+        {
+            pi2_src_tmp = pi2_src + src_strd;
+
+            pu1_dst_tmp = pu1_dst + dst_strd;
+
+            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
+            pi2_src += 4;
+
+            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
+            i4_tmp1_t = vmull_s16(pi2_src_val1, wgt0.val[0]);
+
+            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t.val[0]);
+            i4_tmp2_t = vmull_s16(pi2_src_val2, wgt0.val[0]);
+
+            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
+            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t.val[0]);
+
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
+            sto_res = vqmovn_u16(sto_res_tmp3);
+
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+            pu1_dst += 4;
+
+            sto_res = vqmovn_u16(sto_res_tmp3);
+            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+        }
+        pi2_src += 2 * src_strd - 2 * wd;
+        pu1_dst += 2 * dst_strd - 2 * wd;
+    }
+}
+//WEIGHTED_PRED_CHROMA_UNI
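+
+/* Cb and Cr samples arrive interleaved, so the routine above zips the two
+   weights and the two folded constants into alternating lanes (vzip_s16 /
+   vzipq_s32); even lanes then carry Cb terms and odd lanes Cr terms through
+   a single vmull_s16/vaddq_s32 pair. A scalar sketch of the same idea, with
+   CLIP_U8 and the per-plane constants tmp_lvl_shift_cb/_cr as hypothetical
+   names for the values computed at the top of the function:
+
+       int is_cr = x & 1;
+       WORD32 w = is_cr ? wgt0_cr : wgt0_cb;
+       WORD32 c = is_cr ? tmp_lvl_shift_cr : tmp_lvl_shift_cb;
+       pu1_dst[x] = CLIP_U8((pi2_src[x] * w + c) >> shift);
+*/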
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst.
+* Assumption: the function is optimized assuming the width is a multiple
+* of 4 and the height a multiple of 2.
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +
+* ( (off0 + off1 + 1) << (shift - 1) ) ) >> shift
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight to be multiplied to source 1
+*
+* @param[in] off0
+*  offset 0
+*
+* @param[in] wgt1
+*  weight to be multiplied to source 2
+*
+* @param[in] off1
+*  offset 1
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
+                                     WORD16 *pi2_src2,
+                                     UWORD8 *pu1_dst,
+                                     WORD32 src_strd1,
+                                     WORD32 src_strd2,
+                                     WORD32 dst_strd,
+                                     WORD32 wgt0,
+                                     WORD32 off0,
+                                     WORD32 wgt1,
+                                     WORD32 off1,
+                                     WORD32 shift,
+                                     WORD32 lvl_shift1,
+                                     WORD32 lvl_shift2,
+                                     WORD32 ht,
+                                     WORD32 wd)
+{
+    WORD32 row, col;
+    int16x4_t pi2_src1_val1;
+    int16x4_t pi2_src1_val2;
+    int16x4_t pi2_src2_val1;
+    int16x4_t pi2_src2_val2;
+    int32x4_t i4_tmp1_t1;
+    int32x4_t i4_tmp1_t2;
+    int32x4_t i4_tmp2_t1;
+    int32x4_t i4_tmp2_t2;
+    int32x4_t sto_res_tmp1;
+    uint16x4_t sto_res_tmp2;
+    uint16x8_t sto_res_tmp3;
+    uint8x8_t sto_res;
+    int32x4_t tmp_lvl_shift_t;
+    WORD32 tmp_shift = 0 - shift;
+    int32x4_t tmp_shift_t;
+    WORD16 *pi2_src_tmp1;
+    WORD16 *pi2_src_tmp2;
+    UWORD8 *pu1_dst_tmp;
+
+    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
+    tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
+    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
+    tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.                           */
+    /* The height loop is unrolled, hence 2 rows are processed per iteration                   */
+    /* and the stores likewise handle two rows.                                                */
+    /* vcombine_u16 is used since after narrowing we get a 16x4 value which can't be           */
+    /* saturated and narrowed directly.                                                        */
+
+    for(row = ht; row > 0; row -= 2)
+    {
+        for(col = wd; col > 0; col -= 4)
+        {
+            pi2_src_tmp1 = pi2_src1 + src_strd1;
+            pi2_src_tmp2 = pi2_src2 + src_strd2;
+
+            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
+            pi2_src1 += 4;
+            pu1_dst_tmp = pu1_dst + dst_strd;
+
+            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
+            pi2_src2 += 4;
+            i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);
+
+            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
+            i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);
+
+            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
+            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
+
+            i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
+            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
+
+            i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
+            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
+
+            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+
+            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
+            sto_res = vqmovn_u16(sto_res_tmp3);
+
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+            pu1_dst += 4;
+
+            sto_res = vqmovn_u16(sto_res_tmp3);
+            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+        }
+        pi2_src1 += 2 * src_strd1 - wd;
+        pi2_src2 += 2 * src_strd2 - wd;
+        pu1_dst += 2 * dst_strd - wd;
+    }
+}
+//WEIGHTED_PRED_BI
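+
+/* Scalar model of the bi-weighted path (CLIP_U8 again a hypothetical
+   saturate-to-[0,255] helper):
+
+       WORD32 tmp = pi2_src1[x] * wgt0 + pi2_src2[x] * wgt1
+                  + (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
+                  + ((off0 + off1 + 1) << (shift - 1));
+       pu1_dst[x] = CLIP_U8(tmp >> shift);
+
+   Everything after the two multiplies is source independent and is
+   precomputed into tmp_lvl_shift_t, leaving two multiplies, two adds and
+   one shift per pixel in the vector loop. */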
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1
+* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
+* Assumption: the function is optimized assuming width and height are
+* multiples of 2.
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +
+* ( (off0 + off1 + 1) << (shift - 1) ) ) >> shift
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight to be multiplied to source 1
+*
+* @param[in] off0
+*  offset 0
+*
+* @param[in] wgt1
+*  weight to be multiplied to source 2
+*
+* @param[in] off1
+*  offset 1
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
+                                            WORD16 *pi2_src2,
+                                            UWORD8 *pu1_dst,
+                                            WORD32 src_strd1,
+                                            WORD32 src_strd2,
+                                            WORD32 dst_strd,
+                                            WORD32 wgt0_cb,
+                                            WORD32 wgt0_cr,
+                                            WORD32 off0_cb,
+                                            WORD32 off0_cr,
+                                            WORD32 wgt1_cb,
+                                            WORD32 wgt1_cr,
+                                            WORD32 off1_cb,
+                                            WORD32 off1_cr,
+                                            WORD32 shift,
+                                            WORD32 lvl_shift1,
+                                            WORD32 lvl_shift2,
+                                            WORD32 ht,
+                                            WORD32 wd)
+{
+    WORD32 row, col;
+    int16x4_t pi2_src1_val1;
+    int16x4_t pi2_src1_val2;
+    int16x4_t pi2_src2_val1;
+    int16x4_t pi2_src2_val2;
+    int32x4_t i4_tmp1_t1;
+    int32x4_t i4_tmp1_t2;
+    int32x4_t i4_tmp2_t1;
+    int32x4_t i4_tmp2_t2;
+    int32x4_t sto_res_tmp1;
+    uint16x4_t sto_res_tmp2;
+    uint16x8_t sto_res_tmp3;
+    uint8x8_t sto_res;
+    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
+    int32x4x2_t tmp_lvl_shift_t;
+    WORD32 tmp_shift = 0 - shift;
+    int32x4_t tmp_shift_t;
+    int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
+    int16x4x2_t wgt0, wgt1;
+    WORD16 *pi2_src_tmp1;
+    WORD16 *pi2_src_tmp2;
+    UWORD8 *pu1_dst_tmp;
+
+    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
+    tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
+    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);
+
+    tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
+    tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
+    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);
+
+    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);
+
+    tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
+    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
+    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
+    tmp_wgt1_u = vdup_n_s16(wgt1_cb);
+    tmp_wgt1_v = vdup_n_s16(wgt1_cr);
+    wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);
+
+    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.                           */
+    /* The height loop is unrolled, hence 2 rows are processed per iteration                   */
+    /* and the stores likewise handle two rows.                                                */
+    /* vcombine_u16 is used since after narrowing we get a 16x4 value which can't be           */
+    /* saturated and narrowed directly.                                                        */
+
+    for(row = ht; row > 0; row -= 2)
+    {
+        for(col = 2 * wd; col > 0; col -= 4)
+        {
+            pi2_src_tmp1 = pi2_src1 + src_strd1;
+            pi2_src_tmp2 = pi2_src2 + src_strd2;
+
+            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
+            pi2_src1 += 4;
+            pu1_dst_tmp = pu1_dst + dst_strd;
+
+            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
+            pi2_src2 += 4;
+            i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);
+
+            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
+            i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);
+
+            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
+            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
+
+            i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
+            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);
+
+            i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
+            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
+
+            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+
+            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
+            sto_res = vqmovn_u16(sto_res_tmp3);
+
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+            pu1_dst += 4;
+
+            sto_res = vqmovn_u16(sto_res_tmp3);
+            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+        }
+        pi2_src1 += 2 * src_strd1 - 2 * wd;
+        pi2_src2 += 2 * src_strd2 - 2 * wd;
+        pu1_dst += 2 * dst_strd - 2 * wd;
+    }
+}
+//WEIGHTED_PRED_CHROMA_BI
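+
+/* The chroma variant is the bi case above with the weight pair selected by
+   lane parity; a scalar sketch (CLIP_U8 and tmp_lvl_shift_cb/_cr are
+   hypothetical names, the latter for the two constants zipped at the top):
+
+       int is_cr = x & 1;
+       WORD32 tmp = pi2_src1[x] * (is_cr ? wgt0_cr : wgt0_cb)
+                  + pi2_src2[x] * (is_cr ? wgt1_cr : wgt1_cb)
+                  + (is_cr ? tmp_lvl_shift_cr : tmp_lvl_shift_cb);
+       pu1_dst[x] = CLIP_U8(tmp >> shift);
+*/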
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does default bi-weighted prediction on the arrays pointed to by pi2_src1
+* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
+* Assumption: the function is optimized assuming the width is a multiple
+* of 4 and the height a multiple of 2.
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1) +  (src2 + lvl_shift2) +  (1 << (shift - 1)) )
+* >> shift  where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
+                                             WORD16 *pi2_src2,
+                                             UWORD8 *pu1_dst,
+                                             WORD32 src_strd1,
+                                             WORD32 src_strd2,
+                                             WORD32 dst_strd,
+                                             WORD32 lvl_shift1,
+                                             WORD32 lvl_shift2,
+                                             WORD32 ht,
+                                             WORD32 wd)
+{
+    WORD32 row, col;
+    int16x4_t pi2_src1_val1;
+    int16x4_t pi2_src1_val2;
+    int16x4_t pi2_src2_val1;
+    int16x4_t pi2_src2_val2;
+    int32x4_t i4_tmp1_t1;
+    int32x4_t i4_tmp1_t2;
+    int32x4_t i4_tmp2_t1;
+    int32x4_t i4_tmp2_t2;
+    int32x4_t sto_res_tmp1;
+    uint16x4_t sto_res_tmp2;
+    uint16x8_t sto_res_tmp3;
+    uint8x8_t sto_res;
+    int32x4_t tmp_lvl_shift_t;
+    int32x4_t tmp_shift_t;
+    WORD16 *pi2_src_tmp1;
+    WORD16 *pi2_src_tmp2;
+    UWORD8 *pu1_dst_tmp;
+    WORD32 shift;
+    WORD32 tmp_shift;
+    WORD32 tmp_lvl_shift;
+    int16x4_t lvl_shift1_t;
+    int16x4_t lvl_shift2_t;
+
+    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+    tmp_shift = 0 - shift;
+    tmp_lvl_shift = 1 << (shift - 1);
+    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
+    tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
+    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);
+
+    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.                           */
+    /* The height loop is unrolled, hence 2 rows are processed per iteration                   */
+    /* and the stores likewise handle two rows.                                                */
+    /* vcombine_u16 is used since after narrowing we get a 16x4 value which can't be           */
+    /* saturated and narrowed directly.                                                        */
+
+    for(row = ht; row > 0; row -= 2)
+    {
+        for(col = wd; col > 0; col -= 4)
+        {
+            pi2_src_tmp1 = pi2_src1 + src_strd1;
+            pi2_src_tmp2 = pi2_src2 + src_strd2;
+
+            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
+            pi2_src1 += 4;
+            pu1_dst_tmp = pu1_dst + dst_strd;
+
+            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
+            pi2_src2 += 4;
+            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);
+
+            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
+            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);
+
+            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
+            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
+
+            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
+            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
+
+            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
+            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
+
+            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+
+            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
+            sto_res = vqmovn_u16(sto_res_tmp3);
+
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+            pu1_dst += 4;
+
+            sto_res = vqmovn_u16(sto_res_tmp3);
+            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+        }
+        pi2_src1 += 2 * src_strd1 - wd;
+        pi2_src2 += 2 * src_strd2 - wd;
+        pu1_dst += 2 * dst_strd - wd;
+    }
+}
+//WEIGHTED_PRED_BI_DEFAULT
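+
+/* Scalar model of the default (equal-weight) case, with shift fixed at
+   SHIFT_14_MINUS_BIT_DEPTH + 1 and CLIP_U8 a hypothetical saturating
+   helper:
+
+       WORD32 tmp = (pi2_src1[x] + lvl_shift1)
+                  + (pi2_src2[x] + lvl_shift2)
+                  + (1 << (shift - 1));
+       pu1_dst[x] = CLIP_U8(tmp >> shift);
+
+   With the weights gone, vaddl_s16 replaces the multiplies, which is why
+   this variant is cheaper than ihevc_weighted_pred_bi_neonintr. */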
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does chroma default bi-weighted prediction on the arrays pointed to by
+* pi2_src1 and pi2_src2 and stores the result at the location pointed to by
+* pu1_dst. Assumption: the function is optimized assuming width and height
+* are multiples of 2.
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1) +  (src2 + lvl_shift2) +  (1 << (shift - 1)) )
+* >> shift  where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
+                                                    WORD16 *pi2_src2,
+                                                    UWORD8 *pu1_dst,
+                                                    WORD32 src_strd1,
+                                                    WORD32 src_strd2,
+                                                    WORD32 dst_strd,
+                                                    WORD32 lvl_shift1,
+                                                    WORD32 lvl_shift2,
+                                                    WORD32 ht,
+                                                    WORD32 wd)
+{
+    WORD32 row, col;
+    int16x4_t pi2_src1_val1;
+    int16x4_t pi2_src1_val2;
+    int16x4_t pi2_src2_val1;
+    int16x4_t pi2_src2_val2;
+    int32x4_t i4_tmp1_t1;
+    int32x4_t i4_tmp1_t2;
+    int32x4_t i4_tmp2_t1;
+    int32x4_t i4_tmp2_t2;
+    int32x4_t sto_res_tmp1;
+    uint16x4_t sto_res_tmp2;
+    uint16x8_t sto_res_tmp3;
+    uint8x8_t sto_res;
+    int32x4_t tmp_lvl_shift_t;
+    int32x4_t tmp_shift_t;
+    WORD16 *pi2_src_tmp1;
+    WORD16 *pi2_src_tmp2;
+    UWORD8 *pu1_dst_tmp;
+    WORD32 shift;
+    WORD32 tmp_shift;
+    WORD32 tmp_lvl_shift;
+    int16x4_t lvl_shift1_t;
+    int16x4_t lvl_shift2_t;
+    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+    tmp_shift = 0 - shift;
+    tmp_lvl_shift = 1 << (shift - 1);
+    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
+    tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
+    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);
+
+    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.                           */
+    /* The height loop is unrolled, hence 2 rows are processed per iteration                   */
+    /* and the stores likewise handle two rows.                                                */
+    /* vcombine_u16 is used since after narrowing we get a 16x4 value which can't be           */
+    /* saturated and narrowed directly.                                                        */
+
+    for(row = ht; row > 0; row -= 2)
+    {
+        for(col = 2 * wd; col > 0; col -= 4)
+        {
+            pi2_src_tmp1 = pi2_src1 + src_strd1;
+            pi2_src_tmp2 = pi2_src2 + src_strd2;
+
+            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
+            pi2_src1 += 4;
+            pu1_dst_tmp = pu1_dst + dst_strd;
+
+            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
+            pi2_src2 += 4;
+            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);
+
+            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
+            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);
+
+            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
+            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
+
+            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
+            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
+
+            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
+            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
+
+            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+
+            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
+            sto_res = vqmovn_u16(sto_res_tmp3);
+
+            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+            pu1_dst += 4;
+
+            sto_res = vqmovn_u16(sto_res_tmp3);
+            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+        }
+        pi2_src1 += 2 * src_strd1 - 2 * wd;
+        pi2_src2 += 2 * src_strd2 - 2 * wd;
+        pu1_dst += 2 * dst_strd - 2 * wd;
+    }
+}
+//WEIGHTED_PRED_CHROMA_BI_DEFAULT
diff --git a/common/arm/ihevc_weighted_pred_uni.s b/common/arm/ihevc_weighted_pred_uni.s
new file mode 100644
index 0000000..e9b69c1
--- /dev/null
+++ b/common/arm/ihevc_weighted_pred_uni.s
@@ -0,0 +1,219 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_weighted_pred_uni.s
+@*
+@* @brief
+@*  contains function definitions for weighted prediction used in inter
+@* prediction
+@*
+@* @author
+@*  parthiban v
+@*
+@* @par list of functions:
+@*  - ihevc_weighted_pred_uni()
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*  does uni-weighted prediction on the array pointed to by pi2_src and
+@* stores the result at the location pointed to by pu1_dst. assumption: the
+@* function is optimized assuming width and height are multiples of 4
+@* (the core loop consumes a 4x4 block per iteration).
+@*
+@* @par description:
+@*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+@* offset
+@*
+@* @param[in] pi2_src
+@*  pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  pointer to the destination
+@*
+@* @param[in] src_strd
+@*  source stride
+@*
+@* @param[in] dst_strd
+@*  destination stride
+@*
+@* @param[in] wgt0
+@*  weight to be multiplied to the source
+@*
+@* @param[in] off0
+@*  offset to be added after rounding and shifting
+@*
+@* @param[in] shift
+@*  (14 - bit depth) + log2_weight_denominator
+@*
+@* @param[in] lvl_shift
+@*  added before shift and offset
+@*
+@* @param[in] ht
+@*  height of the source
+@*
+@* @param[in] wd
+@*  width of the source
+@*
+@* @returns
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_weighted_pred_uni(word16 *pi2_src,
+@                             uword8 *pu1_dst,
+@                             word32 src_strd,
+@                             word32 dst_strd,
+@                             word32 wgt0,
+@                             word32 off0,
+@                             word32 shift,
+@                             word32 lvl_shift,
+@                             word32 ht,
+@                             word32 wd)
+
+@**************variables vs registers*****************************************
+@   r0 => *pi2_src
+@   r1 => *pu1_dst
+@   r2 =>  src_strd
+@   r3 =>  dst_strd
+@   r4 =>  wgt0
+@   r5 =>  off0
+@   r6 =>  shift
+@   r7 =>  lvl_shift
+@   r8 =>   ht
+@   r9  =>  wd
+
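+@ the prologue below folds every source-independent term into one constant,
+@ mirroring the neon-intrinsics version of this routine:
+@
+@     tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift) + (1 << (shift - 1))
+@
+@ q15 then holds that constant, d0[0] the weight, and q14 the negated shift
+@ so that vshl.s32 performs the required right shift.
+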
+.text
+.align 4
+
+
+
+
+.globl ihevc_weighted_pred_uni_a9q
+
+.type ihevc_weighted_pred_uni_a9q, %function
+
+ihevc_weighted_pred_uni_a9q:
+
+    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
+
+    ldr         r4,[sp,#40]                 @load wgt0
+    ldr         r7,[sp,#52]                 @load lvl_shift
+    mov         r11,#1
+    ldr         r5,[sp,#44]                 @load off0
+    mul         r10,r7,r4                   @lvl_shift * wgt0
+    ldr         r6,[sp,#48]                 @load shift
+    ldr         r8,[sp,#56]                 @load ht
+    add         r10,r10,r5,lsl r6           @lvl_shift * wgt0 + (off0 << shift)
+    ldr         r9,[sp,#60]                 @load wd
+    sub         r12,r6,#1
+    vmov.s16    d0[0],r4                    @moved for scalar multiplication
+    lsl         r2,r2,#1
+    vdup.u32    q14,r6                      @vmovq_n_s32(tmp_shift)
+    add         r10,r10,r11,lsl r12         @tmp_lvl_shift += (1 << (shift - 1))
+    vdup.s32    q15,r10                     @vmovq_n_s32(tmp_lvl_shift)
+    vneg.s32    q14,q14
+    lsl         r4,r9,#1
+
+    cmp         r8,#0                       @check ht == 0
+    beq         end_loops                   @if equal, then end the function
+
+outer_loop:
+    cmp         r9,#0                       @check wd == 0
+    beq         end_loops                   @if equal, then end the function
+
+core_loop:
+    add         r5,r0,r2                    @pi2_src_tmp = pi2_src + 2*src_strd (2* because pi2_src is a 16 bit pointer)
+    add         r6,r1,r3                    @pu1_dst_tmp = pu1_dst + dst_strd
+    vld1.s16    {d1},[r0]!                  @load and increment the pi2_src
+    vld1.s16    {d2},[r5],r2                @load and increment the pi2_src_tmp ii iteration
+    vmull.s16   q2,d1,d0[0]                 @vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
+
+    vadd.i32    q2,q2,q15                   @vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
+    vld1.s16    {d8},[r5],r2                @load and increment the pi2_src iii iteration
+
+    vmull.s16   q3,d2,d0[0]                 @vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
+    vld1.s16    {d9},[r5],r2                @load and increment the pi2_src_tmp iv iteration
+
+    vshl.s32    q2,q2,q14                   @vshlq_s32(i4_tmp1_t, tmp_shift_t)
+    vadd.i32    q3,q3,q15                   @vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration
+
+    vmull.s16   q5,d8,d0[0]                 @vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
+    vqmovun.s32 d4,q2                       @vqmovun_s32(sto_res_tmp1)
+
+    vadd.i32    q5,q5,q15                   @vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
+    vmov.s32    d5,d4                       @vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+
+    vshl.s32    q3,q3,q14                   @vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration
+
+    vmull.s16   q6,d9,d0[0]                 @vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
+    vqmovn.u16  d4,q2                       @vqmovn_u16(sto_res_tmp3)
+
+    vshl.s32    q5,q5,q14                   @vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
+    vqmovun.s32 d6,q3                       @vqmovun_s32(sto_res_tmp1) ii iteration
+
+    vadd.i32    q6,q6,q15                   @vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
+    vmov.s32    d7,d6                       @vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
+
+    vqmovun.s32 d10,q5                      @vqmovun_s32(sto_res_tmp1) iii iteration
+
+    vshl.s32    q6,q6,q14                   @vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
+    vst1.32     {d4[0]},[r1]!               @store pu1_dst i iteration
+    vmov.s32    d11,d10                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
+
+    vqmovn.u16  d6,q3                       @vqmovn_u16(sto_res_tmp3) ii iteration
+    vst1.32     {d6[0]},[r6],r3             @store pu1_dst ii iteration
+
+    vqmovn.u16  d10,q5                      @vqmovn_u16(sto_res_tmp3) iii iteration
+    vqmovun.s32 d12,q6                      @vqmovun_s32(sto_res_tmp1) iv iteration
+
+    vmov.s32    d13,d12                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
+    vst1.32     {d10[0]},[r6],r3            @store pu1_dst iii iteration
+    vqmovn.u16  d12,q6                      @vqmovn_u16(sto_res_tmp3) iv iteration
+
+    subs        r9,r9,#4                    @decrement wd by 4 and check for 0
+    vst1.32     {d12[0]},[r6],r3            @store pu1_dst iv iteration
+    bgt         core_loop                   @if greater than 0 repeat the core loop again
+
+end_core_loop:
+    rsb         r11,r4,r2,lsl #2            @4*src_strd - 2*wd (r2 already holds the doubled byte stride)
+    subs        r8,r8,#4                    @decrement the ht by 4
+    add         r0,r0,r11                   @pi2_src + 4*src_strd - 2*wd(since pi2_src is 16 bit pointer double the increment with double the wd decrement)
+    asr         r9,r4,#1
+    rsb         r12,r9,r3,lsl #2            @4*dst_strd - wd
+    add         r1,r1,r12                   @pu1_dst + 4*dst_strd - wd
+    bgt         core_loop                   @if ht is greater than 0 goto core_loop
+
+end_loops:
+    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
+
+
diff --git a/common/arm64/ihevc_deblk_chroma_horz.s b/common/arm64/ihevc_deblk_chroma_horz.s
new file mode 100644
index 0000000..7097142
--- /dev/null
+++ b/common/arm64/ihevc_deblk_chroma_horz.s
@@ -0,0 +1,173 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///*******************************************************************************
+//* @file
+//*  ihevc_deblk_chroma_horz.s
+//*
+//* @brief
+//*  contains function definitions for deblocking of chroma horizontal
+//* edges. functions are coded in neon assembly and can be compiled using
+//* rvct
+//*
+//* @author
+//*  anand s
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************/
+//void ihevc_deblk_chroma_horz(UWORD8 *pu1_src,
+//                             WORD32 src_strd,
+//                             WORD32 quant_param_p,
+//                             WORD32 quant_param_q,
+//                             WORD32 qp_offset_u,
+//                             WORD32 qp_offset_v,
+//                             WORD32 tc_offset_div2,
+//                             WORD32 filter_flag_p,
+//                             WORD32 filter_flag_q)
+//
+
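+// the filter implemented below is the hevc chroma deblocking step; a scalar
+// sketch (clip_u8 is a hypothetical saturate-to-[0,255] helper, p1/p0 the
+// two rows above the edge and q0/q1 the two rows below):
+//
+//     delta = clip3(-tc, tc, ((((q0 - p0) << 2) + p1 - q1 + 4) >> 3));
+//     if (filter_flag_p) p0 = clip_u8(p0 + delta);
+//     if (filter_flag_q) q0 = clip_u8(q0 - delta);
+//
+// tc is looked up per chroma plane from gai4_ihevc_tc_table, which is why
+// two tc values are zipped across the eight lanes (v30/v31 below).
+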
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.extern gai4_ihevc_qp_table
+.extern gai4_ihevc_tc_table
+.globl ihevc_deblk_chroma_horz_av8
+
+.type ihevc_deblk_chroma_horz_av8, %function
+
+ihevc_deblk_chroma_horz_av8:
+    sxtw        x4,w4
+    sxtw        x5,w5
+    sxtw        x6,w6
+    ldr         w9, [sp]
+    sxtw        x9,w9
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    mov         x10, x4
+    mov         x8, x7
+    mov         x7, x5
+    mov         x4, x6
+
+    sub         x12,x0,x1
+    ld1         {v0.8b},[x0]
+    sub         x5,x12,x1
+    add         x6,x0,x1
+    add         x1,x2,x3
+    uxtl        v0.8h, v0.8b
+    ld1         {v2.8b},[x12]
+    add         x2,x1,#1
+    ld1         {v4.8b},[x5]
+    ld1         {v16.8b},[x6]
+    adds        x1,x10,x2,asr #1
+    uxtl        v2.8h, v2.8b
+    adrp        x3, :got:gai4_ihevc_qp_table
+    ldr         x3, [x3, #:got_lo12:gai4_ihevc_qp_table]
+    bmi         l1.3312
+    cmp         x1,#0x39
+    bgt         lbl78
+    ldr         w1, [x3,x1,lsl #2]
+lbl78:
+    sub         x20,x1,#6
+    csel        x1, x20, x1,gt
+l1.3312:
+    adds        x2,x7,x2,asr #1
+    uxtl        v4.8h, v4.8b
+    bmi         l1.3332
+    cmp         x2,#0x39
+    bgt         lbl85
+    ldr         w2, [x3,x2,lsl #2]
+lbl85:
+    sub         x20,x2,#6
+    csel        x2, x20, x2,gt
+l1.3332:
+    add         x1,x1,x4,lsl #1
+    sub         v6.8h,  v0.8h ,  v2.8h
+    add         x3,x1,#2
+    cmp         x3,#0x35
+    mov         x20,#0x35
+    csel        x1, x20, x1,gt
+    shl         v6.8h, v6.8h,#2
+    uxtl        v16.8h, v16.8b
+    bgt         l1.3368
+    adds        x3,x1,#2
+    add         x20,x1,#2
+    csel        x1, x20, x1,pl
+    mov         x20,#0
+    csel        x1, x20, x1,mi
+l1.3368:
+    adrp        x3, :got:gai4_ihevc_tc_table
+    ldr         x3, [x3, #:got_lo12:gai4_ihevc_tc_table]
+    add         v4.8h,  v6.8h ,  v4.8h
+    add         x2,x2,x4,lsl #1
+    sub         v6.8h,  v4.8h ,  v16.8h
+    add         x4,x2,#2
+    ldr         w1, [x3,x1,lsl #2]
+    cmp         x4,#0x35
+    mov         x20,#0x35
+    csel        x2, x20, x2,gt
+    bgt         l1.3412
+    adds        x4,x2,#2
+    add         x20,x2,#2
+    csel        x2, x20, x2,pl
+    mov         x20,#0
+    csel        x2, x20, x2,mi
+l1.3412:
+
+
+    ldr         w2, [x3,x2,lsl #2]
+    cmp         x8,#0
+    dup         v31.8h,w2
+    dup         v30.8h,w1
+    sub         x20,x1,#0
+    neg         x1, x20
+    srshr       v6.8h, v6.8h,#3
+    dup         v28.8h,w1
+    sub         x20,x2,#0
+    neg         x1, x20
+    zip1        v4.8h, v30.8h, v31.8h
+    dup         v29.8h,w1
+
+    zip1        v18.8h, v28.8h, v29.8h
+
+    smin        v16.8h,  v6.8h ,  v4.8h
+    smax        v4.8h,  v18.8h ,  v16.8h
+    add         v2.8h,  v2.8h ,  v4.8h
+    sub         v0.8h,  v0.8h ,  v4.8h
+    sqxtun      v2.8b, v2.8h
+    sqxtun      v0.8b, v0.8h
+    beq         l1.3528
+    st1         {v2.8b},[x12]
+l1.3528:
+    cmp         x9,#0
+    beq         l1.3540
+    st1         {v0.8b},[x0]
+l1.3540:
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
diff --git a/common/arm64/ihevc_deblk_chroma_vert.s b/common/arm64/ihevc_deblk_chroma_vert.s
new file mode 100644
index 0000000..dcb1f25
--- /dev/null
+++ b/common/arm64/ihevc_deblk_chroma_vert.s
@@ -0,0 +1,211 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* @file
+//*  ihevc_deblk_chroma_vert.s
+//*
+//* @brief
+//*  contains function definitions for deblocking of chroma vertical
+//* edges. functions are coded in neon assembly and can be compiled using
+//* rvct
+//*
+//* @author
+//*  anand s
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************/
+//void ihevc_deblk_chroma_vert(UWORD8 *pu1_src,
+//                             WORD32 src_strd,
+//                             WORD32 quant_param_p,
+//                             WORD32 quant_param_q,
+//                             WORD32 qp_offset_u,
+//                             WORD32 qp_offset_v,
+//                             WORD32 tc_offset_div2,
+//                             WORD32 filter_flag_p,
+//                             WORD32 filter_flag_q)
+
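+// same chroma filter as the horizontal case, but for a vertical edge: four
+// rows of 8 bytes around the edge are loaded from pu1_src - 4, transposed
+// with trn1/trn2 so that whole rows of p1, p0, q0 and q1 sit in separate
+// registers, filtered, then transposed back before the two-byte st1 stores.
+// conceptually:
+//
+//     load 4 rows of 8 bytes at pu1_src - 4   // p1 p0 | q0 q1 cb/cr pairs
+//     transpose -> vectors of p1, p0, q0, q1
+//     delta = clip3(-tc, tc, ((((q0 - p0) << 2) + p1 - q1 + 4) >> 3));
+//     p0 += delta; q0 -= delta;               // with saturation to [0, 255]
+//     transpose back and store two bytes per row
+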
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.extern gai4_ihevc_qp_table
+.extern gai4_ihevc_tc_table
+.globl ihevc_deblk_chroma_vert_av8
+
+.type ihevc_deblk_chroma_vert_av8, %function
+
+ihevc_deblk_chroma_vert_av8:
+    sxtw        x4,w4
+    sxtw        x5,w5
+    sxtw        x6,w6
+    mov         x15,x5
+    mov         x5,x6
+    mov         x6,x15
+    mov         x12, x7
+    mov         x7, x4
+    ldr         w4, [sp]
+
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    sub         x8,x0,#4
+    add         x2,x2,x3
+    ld1         {v5.8b},[x8],x1
+    add         x2,x2,#1
+    ld1         {v17.8b},[x8],x1
+    ld1         {v16.8b},[x8],x1
+    ld1         {v4.8b},[x8]
+
+    trn1        v29.8b, v5.8b, v17.8b
+    trn2        v17.8b, v5.8b, v17.8b
+    mov         v5.d[0], v29.d[0]
+    adds        x3,x7,x2,asr #1
+    trn1        v29.8b, v16.8b, v4.8b
+    trn2        v4.8b, v16.8b, v4.8b
+    mov         v16.d[0], v29.d[0]
+    adrp        x7, :got:gai4_ihevc_qp_table
+    ldr         x7, [x7, #:got_lo12:gai4_ihevc_qp_table]
+
+
+    bmi         l1.2944
+    cmp         x3,#0x39
+    bgt         lbl78
+    ldr         w3, [x7,x3,lsl #2]
+    sxtw        x3,w3
+lbl78:
+    sub         x20,x3,#6
+    csel        x3, x20, x3,gt
+l1.2944:
+    trn1        v29.4h, v5.4h, v16.4h
+    trn2        v16.4h, v5.4h, v16.4h
+    mov         v5.d[0], v29.d[0]
+    adds        x2,x6,x2,asr #1
+    trn1        v29.4h, v17.4h, v4.4h
+    trn2        v4.4h, v17.4h, v4.4h
+    mov         v17.d[0], v29.d[0]
+    bmi         l1.2964
+    cmp         x2,#0x39
+    bgt         lbl86
+    ldr         w2, [x7,x2,lsl #2]
+    sxtw        x2,w2
+lbl86:
+    sub         x20,x2,#6
+    csel        x2, x20, x2,gt
+l1.2964:
+    trn1        v29.2s, v5.2s, v17.2s
+    trn2        v17.2s, v5.2s, v17.2s
+    mov         v5.d[0], v29.d[0]
+    add         x3,x3,x5,lsl #1
+    trn1        v29.2s, v16.2s, v4.2s
+    trn2        v4.2s, v16.2s, v4.2s
+    mov         v16.d[0], v29.d[0]
+    add         x6,x3,#2
+    uxtl        v18.8h, v17.8b
+    cmp         x6,#0x35
+    mov         x20,#0x35
+    csel        x3, x20, x3,gt
+    bgt         l1.2996
+    adds        x6,x3,#2
+    add         x20,x3,#2
+    csel        x3, x20, x3,pl
+    mov         x20,#0
+    csel        x3, x20, x3,mi
+l1.2996:
+    usubl       v0.8h, v17.8b, v16.8b
+    adrp        x6, :got:gai4_ihevc_tc_table
+    ldr         x6, [x6, #:got_lo12:gai4_ihevc_tc_table]
+    shl         v0.8h, v0.8h,#2
+    add         x2,x2,x5,lsl #1
+    add         x5,x2,#2
+    uaddw       v0.8h,  v0.8h ,  v5.8b
+    cmp         x5,#0x35
+    ldr         w3, [x6,x3,lsl #2]
+    sxtw        x3,w3
+    usubw       v4.8h,  v0.8h ,  v4.8b
+    mov         x20,#0x35
+    csel        x2, x20, x2,gt
+    bgt         l1.3036
+    adds        x5,x2,#2
+    add         x20,x2,#2
+    csel        x2, x20, x2,pl
+    mov         x20,#0
+    csel        x2, x20, x2,mi
+l1.3036:
+
+
+    srshr       v6.8h, v4.8h,#3
+    dup         v2.4h,w3
+    ldr         w2, [x6,x2,lsl #2]
+    sxtw        x2,w2
+    sub         x20,x3,#0
+    neg         x3, x20
+    cmp         x12,#0
+    dup         v3.4h,w2
+    sub         x20,x2,#0
+    neg         x2, x20
+    dup         v30.4h,w3
+    dup         v31.4h,w2
+
+    mov         v30.d[1],v31.d[0]
+    mov         v2.d[1],v3.d[0]
+
+    smin        v4.8h,  v6.8h ,  v2.8h
+    smax        v2.8h,  v30.8h ,  v4.8h
+
+    uxtl        v6.8h, v16.8b
+
+    add         v0.8h,  v6.8h ,  v2.8h
+    sub         v2.8h,  v18.8h ,  v2.8h
+    sqxtun      v0.8b, v0.8h
+    sub         x2,x0,#2
+    sqxtun      v1.8b, v2.8h
+    trn1        v29.2s, v0.2s, v1.2s
+    trn2        v1.2s, v0.2s, v1.2s
+    mov         v0.d[0], v29.d[0]
+    trn1        v29.8b, v0.8b, v1.8b
+    trn2        v1.8b, v0.8b, v1.8b
+    mov         v0.d[0], v29.d[0]
+    beq         l1.3204
+
+    st1         {v0.h}[0],[x2],x1
+    st1         {v1.h}[0],[x2],x1
+    st1         {v0.h}[1],[x2],x1
+    st1         {v1.h}[1],[x2]
+l1.3204:
+    cmp         x4,#0
+    beq         l1.3228
+    st1         {v0.h}[2],[x0],x1
+    st1         {v1.h}[2],[x0],x1
+    st1         {v0.h}[3],[x0],x1
+    st1         {v1.h}[3],[x0]
+l1.3228:
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/arm64/ihevc_deblk_luma_horz.s b/common/arm64/ihevc_deblk_luma_horz.s
new file mode 100644
index 0000000..a5c314d
--- /dev/null
+++ b/common/arm64/ihevc_deblk_luma_horz.s
@@ -0,0 +1,586 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///*******************************************************************************
+//* @file
+//*  ihevc_deblk_luma_horz.s
+//*
+//* @brief
+//*  contains function definitions for deblocking of luma horizontal edges.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//*  anand s
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************/
+
+.text
+.align 4
+
+
+.extern gai4_ihevc_tc_table
+.extern gai4_ihevc_beta_table
+.globl ihevc_deblk_luma_horz_av8
+
+.type ihevc_deblk_luma_horz_av8, %function
+
+ihevc_deblk_luma_horz_av8:
+    // stmfd sp!, {x3-x12,x14}
+    sxtw        x5,w5
+    sxtw        x6,w6
+    stp         d8,d9,[sp,#-16]!
+    stp         d10,d11,[sp,#-16]!
+    stp         d12,d13,[sp,#-16]!
+    stp         d14,d15,[sp,#-16]!
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+
+    mov         x21,x7
+    ldr         w22,[sp,#96]
+
+    add         x3,x3,x4
+    add         x3,x3,#1
+    asr         x3,x3,#1
+    add         x7,x3,x5,lsl #1
+    add         x3,x3,x6,lsl #1
+    cmp         x7,#0x33
+    mov         x20,#0x33
+    csel        x7, x20, x7,gt
+    bgt         l1.1532
+    cmp         x7,#0x0
+    mov         x20,#0x0
+    csel        x7, x20, x7,lt              // x7 has the beta_index value
+l1.1532:
+    //     bic      x2,x2,#1
+    asr         x2,x2,#1
+
+    add         x3,x3,x2,lsl #1
+    cmp         x3,#0x35
+    mov         x20,#0x35
+    csel        x3, x20, x3,gt
+    bgt         l1.1564
+    cmp         x3,#0x0
+    mov         x20,#0x0
+    csel        x3, x20, x3,lt              // x3 has the tc_index value
+
+    //    qp_luma = (quant_param_p + quant_param_q + 1) >> 1
+    //    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)
+    //    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)
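+    //    in C this amounts to (a sketch; CLIP3 as used in the reference code):
+    //        #define CLIP3(x, lo, hi) ((x) < (lo) ? (lo) : ((x) > (hi) ? (hi) : (x)))
+    //        beta = gai4_ihevc_beta_table[beta_indx];
+    //        tc   = gai4_ihevc_tc_table[tc_indx];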
+
+l1.1564:
+    adrp        x2, :got:gai4_ihevc_beta_table
+    ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]
+
+    adrp        x4, :got:gai4_ihevc_tc_table
+    ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]
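+    // adrp + ldr through the GOT keeps these table references
+    // position-independent; x2 and x4 now hold the runtime addresses of the
+    // beta and tc lookup tables indexed below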
+
+    ldr         w5, [x2,x7,lsl #2]          // beta
+    ldr         w6, [x4,x3,lsl #2]          // tc
+
+
+
+    cmp         x6,#0
+    beq         l1.2404
+    movi        v0.4h, #0x2
+    lsl         x7,x6,#1
+    add         x14,x1,x1,lsl #1
+    neg         x19,x14
+    ldr         w8, [x0,x19]                // -3 value
+    dup         v1.8b,w7
+    lsl         x19,x1,#1
+    neg         x19,x19
+    ldr         w10, [x0,x19]               //-2 value
+    dup         v23.2s,w8                   // -3 value
+    neg         x19,x1
+    ldr         w11, [x0,x19]               //-1 value
+    dup         v24.2s,w10                  // -2 value
+    and         x8,x8,#0xff
+    ldr         w12, [x0,#0]                // 0 value
+    dup         v25.2s,w11                  // -1 value
+    and         x10,x10,#0xff
+    ldr         w9, [x0,x1]                 // 1 value
+    dup         v26.2s,w12                  // 0 value
+    and         x11,x11,#0xff
+    lsl         x19,x1,#1
+    ldr         w2, [x0,x19]                // 2 value
+    dup         v27.2s,w9                   // 1 value
+    and         x12,x12,#0xff
+    dup         v28.2s,w2                   // 2 value
+    and         x9,x9,#0xff
+    and         x2,x2,#0xff
+
+    add         x12,x12,x2
+    subs        x9,x12,x9,lsl #1            // dq0 value is stored in x9
+    csneg       x9,x9,x9,pl
+    //dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )
+
+    add         x8,x8,x11
+    subs        x8,x8,x10,lsl #1
+    csneg       x8,x8,x8,pl                 // dp0 value is stored in x8
+    //  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )
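+    // subs + csneg is the scalar abs() idiom used throughout these files:
+    //     subs  x, a, b        // x = a - b, setting NZCV
+    //     csneg x, x, x, pl    // x = (x >= 0) ? x : -x, i.e. abs(a - b)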
+
+
+
+    add         x3,x1,x1,lsl #1
+    add         x14,x0,#3
+
+
+    neg         x19,x3
+    ldrb        w2,[x14,x19]                // -3 value
+    lsl         x19,x1,#1
+    neg         x19,x19
+    ldrb        w10,[x14,x19]               // -2 value
+    neg         x19,x1
+    ldrb        w11,[x14,x19]               // -1 value
+    ldrb        w12,[x14,#0]                // 0 value
+    ldrb        w3,[x14,x1]                 // 1 value
+    lsl         x19,x1,#1
+    ldrb        w4,[x14,x19]                // 2 value
+
+
+    add         x12,x12,x4
+    subs        x12,x12,x3,lsl #1           // dq3value is stored in x12
+    csneg       x12,x12,x12,pl
+    //    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1] + pu1_src[3 * src_strd + 0] )
+
+
+    add         x2,x2,x11
+    subs        x11,x2,x10,lsl #1
+    csneg       x11,x11,x11,pl              // dp3 value is stored in x11
+    //    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )
+
+
+
+    add         x3,x8,x9                    // x3 has the d0 value
+    add         x4,x11,x12                  // x4 has the d3 value
+
+
+    //    d0 = dp0 + dq0
+    //    d3 = dp3 + dq3
+
+    add         x14,x8,x11                  // x14 has the value dp
+    add         x12,x12,x9                  // x12 has the value dq
+    //    dp = dp0 + dp3
+    //    dq = dq0 + dq3
+
+    add         x11, x3, x4                 // x11 has the value d
+
+    //    d = d0 + d3
+
+
+    cmp         x11,x5
+    bge         l1.2404
+
+    //    if(d < beta)
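+    //    i.e. (reference-style sketch) the edge is filtered only when the
+    //    local activity across it is below beta:
+    //        if( (dp0 + dq0) + (dp3 + dq3) < beta ) { /* filter */ }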
+
+
+    // registers which cannot be altered : x3,x4,x5,x6,x12,x14,x0,x1,x11
+
+    // registers for use: x2,x7,x8,x9,x10,
+
+    asr         x10,x5,#2
+    uqadd       v30.8b,  v26.8b ,  v1.8b
+    cmp         x10,x3,lsl #1
+    uqsub       v31.8b,  v26.8b ,  v1.8b
+    ble         l1.1840
+    add         x10,x1,x1,lsl #1
+    uaddl       v6.8h,  v25.8b ,  v26.8b
+    neg         x19,x1
+    ldr         w2, [x0,x19,lsl #2]         // has the -4 value
+    neg         x19, x1
+    ldrb        w7,[x0,x19]                 // has the -1 value
+    dup         v22.2s,w2                   // -4 value
+    uaddw       v8.8h,  v6.8h ,  v27.8b
+    ldrb        w3,[x0,#0]                  // x4 has the 0 value
+    uqadd       v16.8b,  v27.8b ,  v1.8b
+    and         x2,x2,#0xff
+    mul         v12.8h, v8.8h, v0.4h[0]
+    ldr         w8, [x0,x10]                // has the 3 value
+    uaddl       v10.8h,  v24.8b ,  v28.8b
+    subs        x2,x2,x7
+    uqsub       v17.8b,  v27.8b ,  v1.8b
+    dup         v29.2s,w8                   // 3 value
+    and         x8,x8,#0xff
+    add         v12.8h,  v12.8h ,  v10.8h
+    csneg       x2,x2,x2,pl
+    rshrn       v20.8b, v12.8h,#3
+    subs        x8,x8,x3
+    csneg       x8,x8,x8,pl
+    umin        v18.8b,  v20.8b ,  v30.8b
+    add         x8,x8,x2
+
+    cmp         x8,x5,asr #3
+    bge         l1.1840
+    uaddw       v14.8h,  v8.8h ,  v28.8b
+    subs        x7,x3,x7
+    umax        v4.8b,  v18.8b ,  v31.8b
+    csneg       x7,x7,x7,pl
+    uqadd       v30.8b,  v28.8b ,  v1.8b
+    mov         x10,#5
+    rshrn       v21.8b, v14.8h,#2
+    mul         x10, x10, x6
+    uqsub       v31.8b,  v28.8b ,  v1.8b
+    add         x10, x10,#1
+    cmp         x7,x10,asr #1
+    umin        v18.8b,  v21.8b ,  v16.8b
+    bge         l1.1840
+
+
+    //        if( (2 * d3 < (beta >> 2)) && ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
+    //            && ( abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) ) )
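+    // the interleaved vector code evaluates (a sketch of) the HEVC
+    // strong-filter outputs; for the p side the reference formulas are:
+    //     p0' = clip3( (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3, p0 - 2*tc, p0 + 2*tc )
+    //     p1' = clip3( (p2 + p1 + p0 + q0 + 2) >> 2,            p1 - 2*tc, p1 + 2*tc )
+    //     p2' = clip3( (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3,   p2 - 2*tc, p2 + 2*tc )
+    // (the q side is symmetric; uqadd/uqsub against v1 = 2*tc do the clipping)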
+
+    umax        v5.8b,  v18.8b ,  v17.8b
+    asr         x10,x5,#2
+    uaddl       v16.8h,  v29.8b ,  v28.8b
+    cmp         x10,x4,lsl #1
+    ble         l1.1840
+
+    add         x10,x1,x1,lsl #1
+    mul         v16.8h, v16.8h, v0.4h[0]
+    add         x4,x0,#3
+
+
+    lsl         x19,x1,#2
+    neg         x19,x19
+    ldrb        w2,[x4,x19]
+    add         v16.8h,  v16.8h ,  v14.8h
+    neg         x19,x1
+    ldrb        w7,[x4,x19]
+    rshrn       v19.8b, v16.8h,#3
+    ldrb        w3,[x4,#0]
+    ldrb        w8,[x4,x10]
+    //   ubfx   x7,x2,#24,#8           @ has the -1 value
+    //  and    x2,#0xff               @ has the -4 value
+    //  ubfx   x8,x3,#24,#8           @ has the 3 value
+    //  and    x3,#0xff               @ x4 has the 0 value
+
+
+
+    subs        x8,x8,x3
+    umin        v18.8b,  v19.8b ,  v30.8b
+    csneg       x8,x8,x8,pl
+    uaddl       v6.8h,  v25.8b ,  v24.8b
+    subs        x2,x2,x7
+    umax        v3.8b,  v18.8b ,  v31.8b
+    csneg       x2,x2,x2,pl
+    uaddw       v8.8h,  v6.8h ,  v26.8b
+    add         x8,x8,x2
+    uqadd       v30.8b,  v25.8b ,  v1.8b
+    cmp         x8,x5,asr #3
+    uqsub       v31.8b,  v25.8b ,  v1.8b
+    bge         l1.1840
+    mul         v12.8h, v8.8h, v0.4h[0]
+    subs        x7,x3,x7
+    uqadd       v16.8b,  v24.8b ,  v1.8b
+    csneg       x7,x7,x7,pl
+    uaddl       v10.8h,  v23.8b ,  v27.8b
+    mov         x10,#5
+    uqsub       v17.8b,  v24.8b ,  v1.8b
+    mul         x10, x10, x6
+    add         v12.8h,  v12.8h ,  v10.8h
+    add         x10, x10,#1
+    rshrn       v20.8b, v12.8h,#3
+    cmp         x7,x10,asr #1
+    uaddw       v14.8h,  v8.8h ,  v23.8b
+    bge         l1.1840
+    umin        v18.8b,  v20.8b ,  v30.8b
+    mov         x2,#2
+    uqadd       v30.8b,  v23.8b ,  v1.8b
+    mov         w4,w21
+    umax        v2.8b,  v18.8b ,  v31.8b
+    mov         w5,w22
+    rshrn       v21.8b, v14.8h,#2
+    b           end_dep_deq_decision_horz
+    // x2 has the value of de
+    // x6 has the value of tc
+    // x5 has the value of beta
+    // x14 has the value of dp
+    // x12 has the value of dq
+    // x0 has the value of source address
+    // x1 has the src stride
+
+l1.1840:
+    mov         x2,#1
+
+    mov         x11,x5
+    mov         w4,w21
+    mov         w5,w22
+
+    cmp         x6,#1
+    mov         x20,#0
+    csel        x9, x20, x9,eq
+    mov         x20,#0
+    csel        x10, x20, x10,eq
+    beq         end_dep_deq_decision_horz
+
+    and         x7,x4,x5
+    cmp         x7,#1
+    beq         both_flags_set_horz
+    cmp         x4,#0
+    beq         set_flag_dep_zero_horz
+
+
+    add         x8,x11,x11,asr #1
+    mov         x10,#0
+    asr         x8,x8,#3
+    cmp         x8,x14
+    mov         x20,#1
+    csel        x9, x20, x9,gt
+    mov         x20,#0
+    csel        x9, x20, x9,le
+    b           end_dep_deq_decision_horz
+set_flag_dep_zero_horz:
+
+    add         x8,x11,x11,asr #1
+    mov         x9,#0
+    asr         x8,x8,#3
+    cmp         x8,x12
+    mov         x20,#1
+    csel        x10, x20, x10,gt
+    mov         x20,#0
+    csel        x10, x20, x10,le
+    b           end_dep_deq_decision_horz
+
+both_flags_set_horz:
+    add         x8,x11,x11,asr #1
+    asr         x8,x8,#3
+    cmp         x8,x14
+    mov         x20,#1
+    csel        x9, x20, x9,gt
+    mov         x20,#0
+    csel        x9, x20, x9,le
+    cmp         x8,x12
+    mov         x20,#1
+    csel        x10, x20, x10,gt
+    mov         x20,#0
+    csel        x10, x20, x10,le
+end_dep_deq_decision_horz:
+
+    //x0=source address
+    //x1=stride
+    // x2 =de
+    // x4=flag p
+    //x5= flag q
+    //x6 =tc
+    // x9 =dep
+    // x10=deq
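+
+    // in C the dep/deq decision is (reference-style sketch):
+    //     side_thresh = ( beta + (beta >> 1) ) >> 3;
+    //     dep = (dp < side_thresh);   // weak filter also adjusts p1
+    //     deq = (dq < side_thresh);   // weak filter also adjusts q1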
+
+
+
+    //    add        x14,x1,x1,lsl #1
+    //    lsl        x7,x6,#1
+    //    vdup.8    d1,x7
+    //    vmov.i16  d0,#0x2
+    umin        v18.8b,  v21.8b ,  v16.8b
+    cmp         x2,#1
+    uqsub       v31.8b,  v23.8b ,  v1.8b
+    beq         l1.2408
+    uaddl       v8.8h,  v23.8b ,  v22.8b
+    cmp         x5,#1
+
+    bne         strong_filtering_p
+
+strong_filtering_q:
+    mov         x12,x0
+    st1         {v4.s}[0],[x12],x1
+    st1         {v5.s}[0],[x12],x1
+    st1         {v3.s}[0],[x12]
+    cmp         x4,#1
+    bne         l1.2404
+strong_filtering_p:
+    umax        v5.8b,  v18.8b ,  v17.8b
+    mov         x12,x0
+    mul         v8.8h, v8.8h, v0.4h[0]
+    sub         x20,x1,#0
+    neg         x11, x20
+    add         v16.8h,  v8.8h ,  v14.8h
+    add         x12,x12,x11
+    rshrn       v19.8b, v16.8h,#3
+    st1         {v2.s}[0],[x12],x11
+    umin        v18.8b,  v19.8b ,  v30.8b
+    st1         {v5.s}[0],[x12],x11
+    umax        v3.8b,  v18.8b ,  v31.8b
+    st1         {v3.s}[0],[x12]
+
+l1.2404:
+    // ldmfd sp!, {x3-x12,pc}
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    ldp         d14,d15,[sp],#16
+    ldp         d12,d13,[sp],#16
+    ldp         d10,d11,[sp],#16
+    ldp         d8,d9,[sp],#16
+    ret
+
+    // x4=flag p
+    //x5= flag q
+    //x6 =tc
+    // x9 =dep
+    // x10=deq
+
+
+    //        d22             -4 value
+
+    //d23        @ -3 value
+
+    //    vdup.32    d24,x11            @ -2 value
+
+    //    vdup.32    d25, x11        @-1 value
+
+    //    vdup.32    d26,x11            @ 0 value
+
+    //    vdup.32    d27,x11            @ 1value
+
+    //    vdup.32    d28,x11            @ 2 value
+
+    //    vdup.32    d29,x11            @ 3 value
+
+l1.2408:
+
+    movi        v0.4h, #0x9
+
+    usubl       v10.8h,  v26.8b ,  v25.8b
+
+    mul         v10.8h, v10.8h, v0.4h[0]
+
+    movi        v0.4h, #0x3
+
+    usubl       v12.8h,  v27.8b ,  v24.8b
+    mul         v12.8h, v12.8h, v0.4h[0]
+
+
+    dup         v30.8b,w6                   // duplicating the +tc value
+
+    sub         x20,x6,#0
+    neg         x12, x20
+    dup         v31.8b,w12                  // duplicating the -tc value
+
+
+
+    sub         v10.8h,  v10.8h ,  v12.8h
+
+
+
+    srshr       v10.8h, v10.8h,#4
+    //   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4
+
+    abs         v8.8h, v10.8h
+    xtn         v9.8b,  v8.8h
+    // storing the absolute values of delta in d9
+
+    sqxtn       v10.8b,  v10.8h
+    // storing the saturated values of delta in d10
+
+
+    smin        v11.8b,  v10.8b ,  v30.8b
+    smax        v8.8b,  v31.8b ,  v11.8b    // d8 has the value delta = clip3(delta, -tc, tc)
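+
+    // weak-filter core in C (reference-style sketch; clip_u8 saturates to
+    // [0, 255]):
+    //     delta = ( 9 * (q0 - p0) - 3 * (q1 - p1) + 8 ) >> 4;
+    //     if( abs(delta) < 10 * tc ) {
+    //         delta = clip3( delta, -tc, tc );
+    //         p0' = clip_u8( p0 + delta );
+    //         q0' = clip_u8( q0 - delta );
+    //     }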
+
+
+    uxtl        v6.8h, v25.8b
+
+    saddw       v4.8h,  v6.8h ,  v8.8b
+
+    sqxtun      v12.8b, v4.8h
+    uxtl        v6.8h, v26.8b
+    ssubw       v4.8h,  v6.8h ,  v8.8b
+    sqxtun      v13.8b, v4.8h
+
+
+    mov         x11,#0xa
+    mul         x12, x11, x6
+    dup         v2.8b,w12                   // d2 has the 10*tc value
+    mov         v18.8b, v24.8b
+    dup         v0.8b,w6
+    sshr        v0.8b,v0.8b,#1
+    neg         v1.8b, v0.8b
+
+    cmp         x4,#1
+    bne         l1.2724
+    cmp         x9,#1
+    bne         l1.2700
+
+    // d12 and d13 have the value temp_p0 and temp_q0
+    uaddl       v14.8h,  v23.8b ,  v25.8b
+    rshrn       v14.8b, v14.8h,#1
+    usubl       v14.8h,  v14.8b ,  v24.8b
+    saddw       v14.8h,  v14.8h ,  v8.8b
+    sqshrn      v14.8b, v14.8h,#1
+    smin        v15.8b,  v14.8b ,  v0.8b
+    smax        v14.8b,  v1.8b ,  v15.8b
+
+    // d14 has the delta p value
+    uxtl        v16.8h, v24.8b
+    saddw       v16.8h,  v16.8h ,  v14.8b
+    sqxtun      v14.8b, v16.8h
+
+    //  d14 = tmp_p1 = clip_u8(pu1_src[-2 * src_strd] + delta_p)
+    cmhs        v18.8b,v9.8b,v2.8b
+    bsl         v18.8b,v24.8b,v14.8b
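+
+    // the block above is (a sketch of) the reference p1 adjustment applied
+    // when dep is set:
+    //     delta_p = clip3( (((p2 + p0 + 1) >> 1) - p1 + delta) >> 1,
+    //                      -(tc >> 1), tc >> 1 );
+    //     p1' = clip_u8( p1 + delta_p );
+    // cmhs/bsl keep the original p1 where abs(delta) >= 10 * tc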
+
+l1.2700:
+    mov         x12,x0
+    sub         x20,x1,#0
+    neg         x11, x20
+    add         x12,x12,x11
+    cmhs        v19.8b,v9.8b,v2.8b
+    bsl         v19.8b,v25.8b,v12.8b
+    st1         {v19.s}[0],[x12],x11
+    st1         {v18.s}[0],[x12]
+l1.2724:
+    cmp         x5,#1
+    bne         l1.2404
+    cmp         x10,#1
+    mov         v18.8b, v27.8b
+    bne         l1.2852
+
+    uaddl       v14.8h,  v26.8b ,  v28.8b
+    rshrn       v14.8b, v14.8h,#1
+    usubl       v14.8h,  v14.8b ,  v27.8b
+    ssubw       v14.8h,  v14.8h ,  v8.8b
+    sqshrn      v14.8b, v14.8h,#1
+    smin        v15.8b,  v14.8b ,  v0.8b
+    smax        v14.8b,  v1.8b ,  v15.8b
+// d14 has the delta q value
+    uxtl        v16.8h, v27.8b
+    saddw       v16.8h,  v16.8h ,  v14.8b
+    sqxtun      v14.8b, v16.8h
+    cmhs        v18.8b,v9.8b,v2.8b
+    bsl         v18.8b,v27.8b,v14.8b
+l1.2852:
+    mov         x12,x0
+    cmhs        v19.8b,v9.8b,v2.8b
+    bsl         v19.8b,v26.8b,v13.8b
+    st1         {v19.s}[0],[x12],x1
+    st1         {v18.s}[0],[x12]
+    // ldmfd sp!, {x3-x12,x15}
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    ldp         d14,d15,[sp],#16
+    ldp         d12,d13,[sp],#16
+    ldp         d10,d11,[sp],#16
+    ldp         d8,d9,[sp],#16
+    ret
+
+
diff --git a/common/arm64/ihevc_deblk_luma_vert.s b/common/arm64/ihevc_deblk_luma_vert.s
new file mode 100644
index 0000000..bc3cc6c
--- /dev/null
+++ b/common/arm64/ihevc_deblk_luma_vert.s
@@ -0,0 +1,635 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//*  ihevc_deblk_luma_vert.s
+//*
+//* //brief
+//*  contains function definitions for deblocking of luma vertical edges.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* //author
+//*  anand s
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************/
+
+.text
+.align 4
+
+
+
+.extern gai4_ihevc_tc_table
+.extern gai4_ihevc_beta_table
+
+.globl ihevc_deblk_luma_vert_av8
+
+.type ihevc_deblk_luma_vert_av8, %function
+
+ihevc_deblk_luma_vert_av8:
+
+    sxtw        x5,w5
+    sxtw        x6,w6
+    stp         d8,d9,[sp,#-16]!
+    stp         d10,d11,[sp,#-16]!
+    stp         d12,d13,[sp,#-16]!
+    stp         d14,d15,[sp,#-16]!
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+    mov         x21,x7
+    ldr         w22,[sp,#96]
+    add         x3,x3,x4
+    add         x3,x3,#1
+    asr         x3,x3,#1
+    add         x7,x3,x5,lsl #1
+    add         x3,x3,x6,lsl #1
+    cmp         x7,#0x33
+    mov         x20,#0x33
+    csel        x7, x20, x7,gt
+    bgt         l1.56
+    cmp         x7,#0x0
+    mov         x20,#0x0
+    csel        x7, x20, x7,lt              // x7 has the beta_index value
+l1.56:
+
+//     bic      x2,x2,#1
+    asr         x2,x2,#1
+
+    add         x3,x3,x2,lsl #1
+    cmp         x3,#0x35
+    mov         x20,#0x35
+    csel        x3, x20, x3,gt
+    bgt         l1.88
+    cmp         x3,#0x0
+    mov         x20,#0x0
+    csel        x3, x20, x3,lt              // x3 has the tc_index value
+
+//    qp_luma = (quant_param_p + quant_param_q + 1) >> 1
+//    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)
+//    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)
+
+l1.88:
+    adrp        x2, :got:gai4_ihevc_beta_table
+    ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]
+
+    movi        v18.8b, #0x2
+    adrp        x4, :got:gai4_ihevc_tc_table
+    ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]
+
+    ldr         w5,[x2,x7,lsl #2]           // beta
+    movi        v16.8h, #0x2
+    ldr         w6,[x4,x3,lsl #2]           // tc
+    lsl         x8,x6,#1
+    cmp         x6,#0
+    dup         v19.8b,w8
+    sub         x7,x0,#4
+    movi        v23.8b, #0x3
+    beq         l1.964
+
+
+    sub         x19,x0,#3
+    ld1         {v15.8b},[x7],x1
+    ldrb        w8,[x19]                    // -3 value
+    ld1         {v1.8b},[x7],x1
+    ldrb        w10,[x19,#1]                //-2 value
+    ld1         {v29.8b},[x7],x1
+    ldrb        w11,[x19,#2]                //-1 value
+    ld1         {v0.8b},[x7]
+    ldrb        w12,[x0,#0]                 // 0 value
+    ldrb        w9,[x0,#1]                  // 1 value
+    trn1        v24.8b,v15.8b,v1.8b
+    trn2        v1.8b,v15.8b,v1.8b
+    ldrb        w2,[x0,#2]                  // 2 value
+    trn1        v2.8b,v29.8b,v0.8b
+    trn2        v0.8b,v29.8b,v0.8b
+    add         x12,x12,x2
+    subs        x9,x12,x9,lsl #1            // dq0 value is stored in x9
+    csneg       x9,x9,x9,pl
+//dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )
+    mov         v29.8b,v24.8b
+    trn1        v24.4h,v29.4h,v2.4h
+    trn2        v2.4h,v29.4h,v2.4h
+    add         x8,x8,x11
+    mov         v15.8b,v1.8b
+    trn1        v1.4h,v15.4h,v0.4h
+    trn2        v0.4h,v15.4h,v0.4h
+    subs        x8,x8,x10,lsl #1
+    csneg       x8,x8,x8,pl
+//  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )
+
+
+
+    add         x14,x1,x1,lsl #1
+    add         x14,x0,x14
+
+    sub         x19,x14,#3
+    dup         v4.2s, v24.2s[1]
+    ldrb        w2,[x19]                    // -3 value
+    dup         v7.2s, v2.2s[1]
+    ldrb        w10,[x19,#1]                // -2 value
+    dup         v3.2s, v2.2s[0]
+    ldrb        w11,[x19,#2]                // -1 value
+    dup         v5.2s, v1.2s[1]
+    ldrb        w12,[x14,#0]                // 0 value
+    dup         v6.2s, v1.2s[0]
+    ldrb        w3,[x14,#1]                 // 1 value
+    dup         v2.2s, v0.2s[0]
+    ldrb        w4,[x14,#2]                 // 2 value
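+
+    // for a vertical edge the pixels of one filter line lie across a row,
+    // so the rows loaded above are transposed with trn1/trn2 and the dup
+    // instructions peel out one pixel column per v register; the dp/dq math
+    // below then mirrors the horizontal-edge code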
+
+
+    add         x12,x12,x4
+    subs        x12,x12,x3,lsl #1           // dq3value is stored in x12
+    csneg       x12,x12,x12,pl
+//    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1] + pu1_src[3 * src_strd + 0] )
+
+
+    add         x2,x2,x11
+    subs        x11,x2,x10,lsl #1
+    csneg       x11,x11,x11,pl              // dp3 value is stored in x11
+//    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )
+
+
+
+    add         x3,x8,x9                    // x3 has the d0 value
+    add         x4,x11,x12                  // x4 has the d3 value
+
+
+//    d0 = dp0 + dq0
+//    d3 = dp3 + dq3
+
+    add         x14,x8,x11                  // x14 has the value dp
+    add         x12,x12,x9                  // x12 has the value dq
+//    dp = dp0 + dp3
+//    dq = dq0 + dq3
+
+    add         x11, x3, x4                 // x11 has the value d
+
+//    d = d0 + d3
+
+
+    cmp         x11,x5
+    dup         v22.2s, v0.2s[1]
+    bge         l1.964
+
+//    if(d < beta)
+
+
+    // registers which cannot be altered : x3,x4,x5,x6,x12,x14,x0,x1,x11
+
+    // registers for use: x2,x7,x8,x9,x10,
+    uqsub       v30.8b,v7.8b,v19.8b
+    asr         x10,x5,#2
+    uqadd       v31.8b,v7.8b,v19.8b
+    cmp         x10,x3,lsl #1
+    uaddl       v0.8h,v5.8b,v4.8b
+    ble         l1.336
+
+    sub         x19,x0,4
+    ldrb        w2,[x19]
+    uaddw       v0.8h,  v0.8h ,  v2.8b
+    ldrb        w7,[x19,#3]
+    umull       v20.8h, v7.8b, v23.8b
+    ldrb        w3,[x0,#0]
+    umlal       v20.8h, v22.8b, v18.8b
+    ldrb        w8,[x0,#3]
+//   ubfx   x7,x2,#24,#8           // has the -1 value
+//  and    x2,#0xff               // has the -4 value
+//  ubfx   x8,x3,#24,#8           // has the 3 value
+//  and    x3,#0xff               // x4 has the 0 value
+
+    add         v20.8h,  v20.8h ,  v0.8h
+    subs        x8,x8,x3
+    rshrn       v22.8b,v20.8h,#3
+    csneg       x8,x8,x8,pl
+    subs        x2,x2,x7
+    umin        v21.8b,  v22.8b ,  v31.8b
+    csneg       x2,x2,x2,pl
+    umax        v22.8b,  v21.8b ,  v30.8b
+    add         x8,x8,x2
+    uaddl       v20.8h,v7.8b,v3.8b
+    cmp         x8,x5,asr #3
+    mla         v20.8h, v0.8h, v16.8h
+    bge         l1.336
+    uaddw       v0.8h,  v0.8h ,  v7.8b
+    subs        x7,x3,x7
+    rshrn       v20.8b,v20.8h,#3
+    csneg       x7,x7,x7,pl
+    rshrn       v0.8b,v0.8h,#2
+    mov         x10,#5
+    uqadd       v30.8b,v5.8b,v19.8b
+    mul         x10, x10, x6
+    uqsub       v31.8b,v5.8b,v19.8b
+    add         x10, x10,#1
+    cmp         x7,x10,asr #1
+    bge         l1.336
+
+
+//        if( (2 * d3 < (beta >> 2)) && ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
+//            && ( abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) ) )
+
+
+    asr         x10,x5,#2
+    uqsub       v25.8b,v4.8b,v19.8b
+    cmp         x10,x4,lsl #1
+    uqadd       v21.8b,v4.8b,v19.8b
+    ble         l1.336
+    umin        v26.8b,  v20.8b ,  v21.8b
+    add         x4,x1,x1,lsl #1
+    add         x4,x4,x0
+    umax        v20.8b,  v26.8b ,  v25.8b
+    sub         x19,x4,#4
+    ldrb        w2,[x19]
+    umin        v19.8b,  v0.8b ,  v30.8b
+    ldrb        w7,[x19,#3]
+    umax        v21.8b,  v19.8b ,  v31.8b
+    ldrb        w3,[x4,#0]
+    lsl         x10,x6,#1
+    ldrb        w8,[x4,#3]
+//   ubfx   x7,x2,#24,#8           // has the -1 value
+//  and    x2,#0xff               // has the -4 value
+//  ubfx   x8,x3,#24,#8           // has the 3 value
+//  and    x3,#0xff               // x4 has the 0 value
+    uaddl       v0.8h,v2.8b,v3.8b
+    dup         v19.8b,w10
+    subs        x8,x8,x3
+    uaddw       v0.8h,  v0.8h ,  v4.8b
+    csneg       x8,x8,x8,pl
+    uqadd       v30.8b,v2.8b,v19.8b
+    subs        x2,x2,x7
+    uqsub       v31.8b,v2.8b,v19.8b
+    csneg       x2,x2,x2,pl
+    uaddl       v26.8h,v5.8b,v6.8b
+    add         x8,x8,x2
+    mla         v26.8h, v0.8h, v16.8h
+    cmp         x8,x5,asr #3
+    bge         l1.336
+    rshrn       v26.8b,v26.8h,#3
+    subs        x7,x3,x7
+    uqadd       v27.8b,v3.8b,v19.8b
+    csneg       x7,x7,x7,pl
+    uqsub       v28.8b,v3.8b,v19.8b
+    mov         x10,#5
+    umin        v16.8b,  v26.8b ,  v30.8b
+    mul         x10, x10, x6
+    add         x10, x10,#1
+    cmp         x7,x10,asr #1
+    umax        v26.8b,  v16.8b ,  v31.8b
+    bge         l1.336
+    uqadd       v30.8b,v6.8b,v19.8b
+
+    mov         x2,#2
+    mov         x4,x21
+    uqsub       v31.8b,v6.8b,v19.8b
+    mov         x5,x22
+    b           end_dep_deq_decision
+// x2 has the value of de
+// x6 has the value of tc
+// x5 has the value of beta
+// x14 has the value of dp
+// x12 has the value of dq
+// x0 has the value of source address
+// x1 has the src stride
+
+l1.336:
+    mov         x2,#1
+l1.424:
+    mov         x11,x5
+    mov         x4,x21
+    mov         x5,x22
+
+    cmp         x6,#1
+    mov         x20,#0
+    csel        x9, x20, x9,eq
+    mov         x20,#0
+    csel        x10, x20, x10,eq
+    beq         end_dep_deq_decision
+
+    and         x7,x4,x5
+
+    cmp         x7,#1
+    beq         both_flags_set
+    cmp         x4,#0
+    beq         set_flag_dep_zero
+
+
+    add         x8,x11,x11,asr #1
+    mov         x10,#0
+    asr         x8,x8,#3
+    cmp         x8,x14
+    mov         x20,#1
+    csel        x9, x20, x9,gt
+    mov         x20,#0
+    csel        x9, x20, x9,le
+    b           end_dep_deq_decision
+set_flag_dep_zero:
+
+    add         x8,x11,x11,asr #1
+    mov         x9,#0
+    asr         x8,x8,#3
+    cmp         x8,x12
+    mov         x20,#1
+    csel        x10, x20, x10,gt
+    mov         x20,#0
+    csel        x10, x20, x10,le
+    b           end_dep_deq_decision
+
+both_flags_set:
+    add         x8,x11,x11,asr #1
+    asr         x8,x8,#3
+    cmp         x8,x14
+    mov         x20,#1
+    csel        x9, x20, x9,gt
+    mov         x20,#0
+    csel        x9, x20, x9,le
+    cmp         x8,x12
+    mov         x20,#1
+    csel        x10, x20, x10,gt
+    mov         x20,#0
+    csel        x10, x20, x10,le
+end_dep_deq_decision:
+
+//x0=source address
+//x1=stride
+// x2 =de
+// x4=flag p
+//x5= flag q
+//x6 =tc
+// x9 =dep
+// x10=deq
+//    b    l1.964
+
+
+    cmp         x2,#2
+// x2 has the value of de
+    bne         l1.968
+
+    cmp         x5,#0
+    beq         l1.780
+// x5 has the flag of q
+
+    add         x3,x0,#2
+    st1         {v22.b}[0],[x3],x1
+
+    st1         {v22.b}[1],[x3],x1
+
+    st1         {v22.b}[2],[x3],x1
+
+    st1         {v22.b}[3],[x3]
+    add         x3,x0,x1
+    mov         v29.8b,v20.8b
+    trn1        v20.8b,v29.8b,v21.8b
+    trn2        v21.8b,v29.8b,v21.8b
+
+    st1         {v20.h}[0],[x0]
+    st1         {v21.h}[0],[x3],x1
+    st1         {v20.h}[1],[x3],x1
+    st1         {v21.h}[1],[x3]
+
+
+l1.780:
+    cmp         x4,#0
+    beq         l1.964
+    // x4 has the flag p
+
+
+    dup         v7.2s, v24.2s[0]
+    sub         x3,x0,#1
+    uaddw       v16.8h,  v0.8h ,  v6.8b
+    add         x7,x3,x1
+    rshrn       v2.8b,v16.8h,#2
+    st1         {v26.b}[0],[x3]
+    sub         x0,x0,#3
+    umin        v16.8b,  v2.8b ,  v27.8b
+    st1         {v26.b}[1],[x7],x1
+    umull       v2.8h, v6.8b, v23.8b
+    umlal       v2.8h, v7.8b, v18.8b
+    st1         {v26.b}[2],[x7],x1
+    umax        v5.8b,  v16.8b ,  v28.8b
+    st1         {v26.b}[3],[x7]
+    add         v0.8h,  v2.8h ,  v0.8h
+    rshrn       v0.8b,v0.8h,#3
+
+
+    umin        v1.8b,  v0.8b ,  v30.8b
+    umax        v0.8b,  v1.8b ,  v31.8b
+
+    mov         v29.8b,v0.8b
+    trn1        v0.8b,v29.8b,v5.8b
+    trn2        v5.8b,v29.8b,v5.8b
+    st1         {v0.h}[0],[x0],x1
+    st1         {v5.h}[0],[x0],x1
+    st1         {v0.h}[1],[x0],x1
+    st1         {v5.h}[1],[x0]
+l1.964:
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    ldp         d14,d15,[sp],#16
+    ldp         d12,d13,[sp],#16
+    ldp         d10,d11,[sp],#16
+    ldp         d8,d9,[sp],#16
+    ret
+
+l1.968:
+
+
+    movi        v0.8h, #0x9
+    neg         x11, x6
+    cmp         x4,#0
+    // checks for the flag p
+    movi        v16.8h, #0x3
+    movi        v24.8b, #0x1
+
+
+    dup         v30.8b,w11
+    and         x11,x6,#0xff
+    dup         v31.8b,w11
+
+    usubl       v18.8h,v4.8b,v2.8b
+    mul         v18.8h, v18.8h, v0.8h
+    usubl       v0.8h,v5.8b,v3.8b
+
+
+
+    mul         v16.8h, v0.8h, v16.8h
+    sub         v16.8h,  v18.8h ,  v16.8h
+    srshr       v16.8h,v16.8h,#4
+//   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4
+
+    abs         v0.8h, v16.8h
+    xtn         v0.8b,  v0.8h
+    // storing the absolute values of delta in d0
+
+    sqxtn       v16.8b,v16.8h
+    // storing the clipped values of delta in d16
+
+    movi        v1.8b, #0xa
+    dup         v21.8b,w11
+    mul         v1.8b, v1.8b, v21.8b
+    // d1 stores the value (10 * tc)
+
+//if(abs(delta) < 10 * tc)
+
+    smin        v18.8b,  v16.8b ,  v31.8b
+    smax        v20.8b,  v18.8b ,  v30.8b
+
+// delta = clip3(delta, -tc, tc)
+    sxtl        v16.8h, v20.8b
+    uxtl        v18.8h, v2.8b
+    add         v18.8h,  v18.8h ,  v16.8h
+
+    sqxtun      v22.8b, v18.8h
+    uxtl        v18.8h, v4.8b
+    sub         v16.8h,  v18.8h ,  v16.8h
+    sqxtun      v23.8b, v16.8h
+// tmp_p0 = clip_u8(pu1_src[-1] + delta)
+// tmp_q0 = clip_u8(pu1_src[0] - delta)
+    beq         l1.1272
+
+
+
+    cmp         x9,#1
+    bne         l1.1212
+// checks for the flag dep
+
+    asr         x3,x6,#1
+
+
+    uaddl       v16.8h,v6.8b,v2.8b
+    uaddw       v16.8h,  v16.8h ,  v24.8b
+    dup         v18.8b,w3
+    sub         x20,x3,#0
+    neg         x3, x20
+    dup         v19.8b,w3
+    ushr        v16.8h,v16.8h,#1
+    xtn         v16.8b,  v16.8h
+
+    usubl       v16.8h,v16.8b,v3.8b
+    saddw       v16.8h,  v16.8h ,  v20.8b
+    sshr        v16.8h,v16.8h,#1
+    sqxtn       v16.8b,v16.8h
+
+    smin        v17.8b,  v16.8b ,  v18.8b
+    smax        v16.8b,  v19.8b ,  v17.8b
+
+
+
+
+    uxtl        v18.8h, v3.8b
+    sxtl        v16.8h, v16.8b
+    add         v16.8h,  v18.8h ,  v16.8h
+
+    sqxtun      v16.8b, v16.8h
+    mov         v30.8b,v3.8b
+    cmhs        v3.8b,v0.8b,v1.8b
+
+
+    bsl         v3.8b,v30.8b,v16.8b
+l1.1212:
+    dup         v16.8b,w11
+    sub         x12,x0,#3
+    sub         x3,x0,#1
+//     smul v16.8b, v16.8b, v1.8b
+    mov         v29.8b,v6.8b
+    trn1        v6.8b,v29.8b,v3.8b
+    trn2        v3.8b,v29.8b,v3.8b
+    st1         {v6.h}[0],[x12],x1
+    cmhs        v16.8b,v0.8b,v1.8b
+    st1         {v3.h}[0],[x12],x1
+    bsl         v16.8b,v2.8b,v22.8b
+    st1         {v16.b}[0],[x3],x1
+    st1         {v16.b}[1],[x3],x1
+    st1         {v6.h}[1],[x12],x1
+    st1         {v16.b}[2],[x3],x1
+    st1         {v3.h}[1],[x12]
+    st1         {v16.b}[3],[x3]
+l1.1272:
+    cmp         x5,#0
+    beq         l1.964
+    // checks for the flag q
+    cmp         x10,#1
+    bne         l1.1412
+    // checks for the flag deq
+    mov         v2.8b,v7.8b
+    asr         x3,x6,#1
+
+    dup         v6.8b,w3
+    sub         x20,x3,#0
+    neg         x3, x20
+    dup         v16.8b,w3
+    uaddl       v2.8h,v2.8b,v4.8b
+    uaddw       v2.8h,  v2.8h ,  v24.8b
+    ushr        v2.8h,v2.8h,#1
+    xtn         v2.8b,  v2.8h
+
+    usubl       v2.8h,v2.8b,v5.8b
+    ssubw       v2.8h,  v2.8h ,  v20.8b
+    sshr        v2.8h,v2.8h,#1
+    sqxtn       v3.8b,v2.8h
+
+    smin        v2.8b,  v3.8b ,  v6.8b
+    smax        v3.8b,  v16.8b ,  v2.8b
+    //  dup  v6.8b,w2
+    //   smul v6.8b, v6.8b, v1.8b
+
+
+
+    uxtl        v16.8h, v5.8b
+    sxtl        v2.8h, v3.8b
+    add         v2.8h,  v16.8h ,  v2.8h
+    sqxtun      v3.8b, v2.8h
+    mov         v30.8b,v5.8b
+    cmhs        v5.8b,v0.8b,v1.8b
+
+
+    bsl         v5.8b,v30.8b,v3.8b
+l1.1412:
+    //  dup  v2.8b,w2
+    add         x3,x0,#2
+    add         x11,x3,x1
+    //   smul v1.8b, v2.8b, v1.8b
+    st1         {v7.b}[0],[x3]
+    st1         {v7.b}[1],[x11],x1
+    st1         {v7.b}[2],[x11],x1
+    cmhs        v0.8b,v0.8b,v1.8b
+    st1         {v7.b}[3],[x11]
+    bsl         v0.8b,v4.8b,v23.8b
+    mov         v29.8b,v0.8b
+    trn1        v0.8b,v29.8b,v5.8b
+    trn2        v5.8b,v29.8b,v5.8b
+    st1         {v0.h}[0],[x0],x1
+    st1         {v5.h}[0],[x0],x1
+    st1         {v0.h}[1],[x0],x1
+    st1         {v5.h}[1],[x0]
+
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    ldp         d14,d15,[sp],#16
+    ldp         d12,d13,[sp],#16
+    ldp         d10,d11,[sp],#16
+    ldp         d8,d9,[sp],#16
+    ret
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_copy.s b/common/arm64/ihevc_inter_pred_chroma_copy.s
new file mode 100644
index 0000000..7ac6855
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_copy.s
@@ -0,0 +1,256 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_inter_pred_chroma_copy.s
+//*
+//* @brief
+//*  Contains function definitions for inter prediction  interpolation.
+//* Functions are coded using NEON  intrinsics and can be compiled using ARM
+//* RVCT
+//*
+//* @author
+//*  Yogeswaran RS
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*   Chroma inter prediction filter for copy
+//*
+//* @par Description:
+//*    Copies the array of width 'wd' and height 'ht' from the  location pointed
+//*    by 'src' to the location pointed by 'dst'
+//*
+//* @param[in] pu1_src
+//*  UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] pi1_coeff
+//*  WORD8 pointer to the filter coefficients
+//*
+//* @param[in] ht
+//*  integer height of the array
+//*
+//* @param[in] wd
+//*  integer width of the array
+//*
+//* @returns
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_chroma_copy( UWORD8 *pu1_src,
+//                                   UWORD8 *pu1_dst,
+//                                   WORD32 src_strd,
+//                                   WORD32 dst_strd,
+//                                   WORD8 *pi1_coeff,
+//                                   WORD32 ht,
+//                                   WORD32 wd)
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => *pu1_dst
+//x2 =>  src_strd
+//x3 =>  dst_strd
+//x4 => *pi1_coeff
+//x5 =>  ht
+//x6 =>  wd
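+
+//A minimal C sketch of what this routine vectorizes (pi1_coeff is unused
+//in the copy path; chroma is interleaved UV, hence the 2 * wd columns):
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < 2 * wd; col++)
+//            pu1_dst[row * dst_strd + col] = pu1_src[row * src_strd + col];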
+
+.text
+.align 4
+
+.globl ihevc_inter_pred_chroma_copy_av8
+
+.type ihevc_inter_pred_chroma_copy_av8, %function
+
+ihevc_inter_pred_chroma_copy_av8:
+
+    LSL         x12,x6,#1                   //wd << 1
+    CMP         x5,#0                       //checks ht == 0
+    BLE         END_LOOPS
+    AND         x8,x5,#3                    //check ht for mul of 2
+    SUB         x5,x5,x8                    //check the rounded height value
+    TST         x12,#15                     //checks wd for multiples for 16
+    BEQ         CORE_LOOP_WD_16
+    TST         x12,#7                      //checks wd for multiples for 4 & 8
+    BEQ         CORE_LOOP_WD_8
+    SUB         x11,x12,#4
+    CMP         x5,#0
+    BEQ         OUTER_LOOP_WD_4_HT_2
+
+OUTER_LOOP_WD_4:
+    SUBS        x4,x12,#0                   //checks wd == 0
+    BLE         END_INNER_LOOP_WD_4
+
+INNER_LOOP_WD_4:
+    LD1         {v0.s}[0],[x0]              //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
+    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
+    ST1         {v0.s}[0],[x1]              //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    ADD         x0,x0,#4                    //pu1_src += 4
+    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    SUBS        x4,x4,#4                    //(wd -4)
+    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    ADD         x1,x1,#4                    //pu1_dst += 4
+    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    BGT         INNER_LOOP_WD_4
+
+END_INNER_LOOP_WD_4:
+    SUBS        x5,x5,#4                    //ht - 4
+    SUB         x0,x7,x11                   //pu1_src = pu1_src_tmp
+    SUB         x1,x6,x11                   //pu1_dst = pu1_dst_tmp
+    BGT         OUTER_LOOP_WD_4
+    CMP         x8,#0
+    BGT         OUTER_LOOP_WD_4_HT_2
+
+END_LOOPS:
+    RET
+
+OUTER_LOOP_WD_4_HT_2:
+    SUBS        x4,x12,#0                   //checks wd == 0
+    BLE         END_LOOPS
+
+INNER_LOOP_WD_4_HT_2:
+    LD1         {v0.s}[0],[x0]              //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
+    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
+    ST1         {v0.s}[0],[x1]              //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    ADD         x0,x0,#4                    //pu1_src += 4
+    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    SUBS        x4,x4,#4                    //(wd -4)
+    ADD         x1,x1,#4                    //pu1_dst += 4
+    BGT         INNER_LOOP_WD_4_HT_2
+    B           END_LOOPS
+
+CORE_LOOP_WD_8:
+    SUB         x11,x12,#8
+    CMP         x5,#0
+    BEQ         OUTER_LOOP_WD_8_HT_2
+
+OUTER_LOOP_WD_8:
+    SUBS        x4,x12,#0                   //checks wd
+    BLE         END_INNER_LOOP_WD_8
+
+
+INNER_LOOP_WD_8:
+    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
+    LD1         {v0.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
+    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
+    ST1         {v0.8b},[x1],#8             //vst1_u8(pu1_dst_tmp, tmp_src)
+    LD1         {v1.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
+    ST1         {v1.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
+    SUBS        x4,x4,#8                    //wd - 8(Loop condition)
+    LD1         {v2.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
+    ST1         {v2.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
+    LD1         {v3.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
+    ST1         {v3.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
+    BGT         INNER_LOOP_WD_8
+
+END_INNER_LOOP_WD_8:
+    SUBS        x5,x5,#4                    //ht -= 4
+    SUB         x0,x7,x11                   //pu1_src = pu1_src_tmp
+    SUB         x1,x6,x11                   //pu1_dst = pu1_dst_tmp
+    BGT         OUTER_LOOP_WD_8
+    CMP         x8,#0
+    BGT         OUTER_LOOP_WD_8_HT_2
+    B           END_LOOPS
+
+OUTER_LOOP_WD_8_HT_2:
+    SUBS        x4,x12,#0                   //checks wd
+    BLE         END_LOOPS
+
+INNER_LOOP_WD_8_HT_2:
+    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
+    LD1         {v0.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
+    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
+    ST1         {v0.8b},[x1],#8             //vst1_u8(pu1_dst_tmp, tmp_src)
+    LD1         {v1.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
+    ST1         {v1.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
+    B           END_LOOPS
+
+CORE_LOOP_WD_16:
+    SUB         x11,x12,#16
+    CMP         x5,#0
+    BEQ         OUTER_LOOP_WD_16_HT_2
+
+OUTER_LOOP_WD_16:
+    SUBS        x4,x12,#0                   //checks wd
+    BLE         END_INNER_LOOP_WD_16
+
+INNER_LOOP_WD_16:
+    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
+    LD1         {v0.16b},[x0],#16           //vld1_u8(pu1_src_tmp)
+    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
+    ST1         {v0.16b},[x1],#16           //vst1_u8(pu1_dst_tmp, tmp_src)
+    LD1         {v1.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
+    ST1         {v1.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
+    SUBS        x4,x4,#16                   //wd - 16(Loop condition)
+    LD1         {v2.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
+    ST1         {v2.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
+    LD1         {v3.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
+    ST1         {v3.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
+    BGT         INNER_LOOP_WD_16
+
+END_INNER_LOOP_WD_16:
+    SUBS        x5,x5,#4                    //ht -= 4
+    SUB         x0,x7,x11                   //pu1_src = pu1_src_tmp
+    SUB         x1,x6,x11                   //pu1_dst = pu1_dst_tmp
+    BGT         OUTER_LOOP_WD_16
+    CMP         x8,#0
+    BGT         OUTER_LOOP_WD_16_HT_2
+    B           END_LOOPS
+
+OUTER_LOOP_WD_16_HT_2:
+    SUBS        x4,x12,#0                   //checks wd
+    BLE         END_LOOPS
+
+INNER_LOOP_WD_16_HT_2:
+    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
+    LD1         {v0.16b},[x0],#16           //vld1_u8(pu1_src_tmp)
+    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
+    ST1         {v0.16b},[x1],#16           //vst1_u8(pu1_dst_tmp, tmp_src)
+    LD1         {v1.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
+    ST1         {v1.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
+
+    RET
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
new file mode 100644
index 0000000..e479651
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
@@ -0,0 +1,348 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//*  ihevc_inter_pred_chroma_copy_w16out.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction  interpolation.
+//* functions are coded using neon  intrinsics and can be compiled using
+//* rvct
+//*
+//* //author
+//*  yogeswaran rs
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*   chroma inter prediction filter for copy
+//*
+//* //par description:
+//*    copies the array of width 'wd' and height 'ht' from the  location pointed
+//*    by 'src' to the location pointed by 'dst'
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
+//                                            word16 *pi2_dst,
+//                                            word32 src_strd,
+//                                            word32 dst_strd,
+//                                            word8 *pi1_coeff,
+//                                            word32 ht,
+//                                            word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pu1_src
+//x1 => *pi2_dst
+//x2 =>  src_strd
+//x3 =>  dst_strd
+//x4 => *pi1_coeff
+//x5 =>  ht
+//x6 =>  wd
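+
+//A minimal C sketch of what this routine vectorizes: the 8-bit source is
+//widened to 16 bits and pre-shifted left by 6, as the shl #6 below does:
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < 2 * wd; col++)
+//            pi2_dst[row * dst_strd + col] =
+//                ((WORD16)pu1_src[row * src_strd + col]) << 6;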
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_copy_w16out_av8
+
+.type ihevc_inter_pred_chroma_copy_w16out_av8, %function
+
+ihevc_inter_pred_chroma_copy_w16out_av8:
+
+    // stmfd sp!, {x4-x12, x14}        //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+
+    mov         x12,x17                     //loads wd
+    lsl         x12,x12,#1                  //2*wd
+    mov         x7,x16                      //loads ht
+    cmp         x7,#0                       //ht condition(ht == 0)
+    ble         end_loops                   //loop
+    and         x8,x7,#3                    //check ht for mul of 2
+    sub         x9,x7,x8                    //check the rounded height value
+    and         x11,x7,#6
+    cmp         x11,#6
+    beq         loop_ht_6
+    tst         x12,#7                      //conditional check for wd (multiples)
+    beq         core_loop_wd_8
+
+loop_ht_6:
+    sub         x11,x12,#4
+    lsl         x6, x3,#1
+    adds        x6, x6,#0
+    cmp         x9,#0
+    beq         outer_loop_wd_4_ht_2
+
+outer_loop_wd_4:
+    subs        x4,x12,#0                   //wd conditional subtract
+    ble         end_inner_loop_wd_4
+
+inner_loop_wd_4:
+    ld1         {v0.8b},[x0]                //vld1_u8(pu1_src_tmp)
+    add         x5,x0,x2                    //pu1_src +src_strd
+    uxtl        v0.8h, v0.8b                //vmovl_u8(vld1_u8(pu1_src_tmp)
+    add         x10,x1,x6
+    subs        x4,x4,#4                    //wd - 4
+    shl         v0.2d, v0.2d,#6             //vshlq_n_s64(temp, 6)
+    ld1         {v22.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
+    add         x0,x0,#4                    //pu1_src += 4
+    st1         {v0.1d},[x1]                //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    add         x1,x1,#8
+    uxtl        v22.8h, v22.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    ld1         {v24.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
+    shl         v22.2d, v22.2d,#6           //vshlq_n_s64(temp, 6)
+    uxtl        v24.8h, v24.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    st1         {v22.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    shl         v24.2d, v24.2d,#6           //vshlq_n_s64(temp, 6)
+    ld1         {v26.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
+    st1         {v24.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    uxtl        v26.8h, v26.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    shl         v26.2d, v26.2d,#6           //vshlq_n_s64(temp, 6)
+    st1         {v26.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    bgt         inner_loop_wd_4
+
+end_inner_loop_wd_4:
+    subs        x9,x9,#4                    //ht - 4
+    sub         x0,x5,x11
+    sub         x1,x10,x11,lsl #1
+    bgt         outer_loop_wd_4
+    cmp         x8,#0
+    bgt         outer_loop_wd_4_ht_2
+
+
+end_loops:
+    // ldmfd sp!,{x4-x12,x15}        //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+outer_loop_wd_4_ht_2:
+    subs        x4,x12,#0                   //wd conditional subtract
+    ble         end_inner_loop_wd_4
+
+inner_loop_wd_4_ht_2:
+    ld1         {v0.8b},[x0]                //vld1_u8(pu1_src_tmp)
+    add         x5,x0,x2                    //pu1_src +src_strd
+    uxtl        v0.8h, v0.8b                //vmovl_u8(vld1_u8(pu1_src_tmp)
+    add         x10,x1,x6
+    subs        x4,x4,#4                    //wd - 4
+    shl         v0.2d, v0.2d,#6             //vshlq_n_s64(temp, 6)
+    ld1         {v22.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
+    add         x0,x0,#4                    //pu1_src += 4
+    st1         {v0.1d},[x1]                //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    add         x1,x1,#8
+    uxtl        v22.8h, v22.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    ld1         {v24.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
+    shl         v22.2d, v22.2d,#6           //vshlq_n_s64(temp, 6)
+    uxtl        v24.8h, v24.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    st1         {v22.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    bgt         inner_loop_wd_4_ht_2
+    b           end_loops
+
+
+core_loop_wd_8:
+    //sub            x11,x12,#8
+    lsl         x5, x3,#1
+    adds        x5, x5,#0
+    sub         x20,x12,x3, lsl #2          // x11 = (dst_strd * 4) - width
+    neg         x11, x20
+    sub         x20,x12,x2,lsl #2           //x2->src_strd
+    neg         x8, x20
+    lsr         x4, x12, #3                 // divide by 8
+    mov         x7,x9
+    mul         x7, x7, x4
+    sub         x4,x12,#0                   //wd conditional check
+    sub         x7,x7,#4                    //reserve one 4-row iteration for the epilog
+    cmp         x9,#0
+    beq         core_loop_wd_8_ht_2
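+
+    // the wd >= 8 path is software-pipelined: 'prolog' issues the first
+    // round of loads, 'outer_loop_wd_8' overlaps the stores of one 4-row
+    // block with the loads of the next, and 'epilog'/'epilog_end' drain the
+    // last in-flight block, hence the iteration reserved from x7 above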
+
+prolog:
+    add         x6,x0,x2                    //pu1_src_tmp += src_strd
+    add         x10,x1,x5
+    ld1         {v8.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
+    ld1         {v10.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    ld1         {v12.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    ld1         {v14.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    uxtl        v16.8h, v8.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
+    uxtl        v18.8h, v10.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    uxtl        v20.8h, v12.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    uxtl        v22.8h, v14.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    subs        x4,x4,#8                    //wd decrements by 8
+    shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
+    shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
+    shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)
+    shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)
+    add         x20,x0,x8
+    csel        x0, x20, x0,le
+    add         x6,x0,x2                    //pu1_src_tmp += src_strd
+    ld1         {v8.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
+    ld1         {v10.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    ld1         {v12.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    ld1         {v14.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+
+    st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
+    add         x20,x1,x11,lsl #1
+    csel        x1, x20, x1,le
+    sub         x20,x12,#0                  //wd conditional check
+    csel        x4, x20, x4,le
+
+    subs        x7,x7,#4                    //ht - 4
+
+    blt         epilog_end                  //jumps to epilog_end
+    beq         epilog                      //jumps to epilog
+
+
+
+outer_loop_wd_8:
+
+    st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v16.8h, v8.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v18.8h, v10.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v20.8h, v12.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    uxtl        v22.8h, v14.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    subs        x4,x4,#8                    //wd decrements by 8
+    add         x20,x0,x8
+    csel        x0, x20, x0,le
+
+    add         x6,x0,x2                    //pu1_src_tmp += src_strd
+
+    ld1         {v8.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
+    shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
+
+    ld1         {v10.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
+
+    ld1         {v12.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)
+
+    ld1         {v14.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    add         x10,x1,x5
+
+    shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)
+
+    st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
+
+    add         x20,x1,x11,lsl #1
+    csel        x1, x20, x1,le
+    sub         x20,x12,#0                  //wd conditional check
+    csel        x4, x20, x4,le
+
+    subs        x7,x7,#4                    //ht - 4
+    bgt         outer_loop_wd_8
+
+epilog:
+    st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v16.8h, v8.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v18.8h, v10.8b              //vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v20.8h, v12.8b              //vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    uxtl        v22.8h, v14.8b              //vmovl_u8(vld1_u8(pu1_src_tmp))
+    //add          x6,x0,x2                //pu1_src_tmp += src_strd
+
+    shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
+    shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
+    shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)
+    add         x10,x1,x5
+    shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)
+
+    st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
+epilog_end:
+    st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    b           end_loops
+
+core_loop_wd_8_ht_2:
+    add         x6,x0,x2                    //pu1_src_tmp += src_strd
+    add         x10,x1,x5
+    ld1         {v8.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
+    ld1         {v10.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    uxtl        v16.8h, v8.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
+    uxtl        v18.8h, v10.8b              //vmovl_u8(vld1_u8(pu1_src_tmp))
+    subs        x12,x12,#8                  //wd decrements by 8
+    shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
+    shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
+    st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
+    st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    bgt         core_loop_wd_8_ht_2
+
+    // ldmfd sp!,{x4-x12,x15}         //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
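
The tail above closes the copy-with-widening path: each iteration zero-extends a run of 8-bit pixels (uxtl), shifts left by 6 (shl #6) and stores the 16-bit results. A minimal scalar C sketch of the per-pixel operation the loops vectorize; the function name and the plain-C form are illustrative, not part of the library:

#include <stdint.h>

/* scalar form of the copy: widen each uint8 pixel and shift left by 6
 * into the int16 intermediate buffer used by later filter stages */
static void copy_w16out_sketch(const uint8_t *pu1_src, int16_t *pi2_dst,
                               int src_strd, int dst_strd, int ht, int wd)
{
    for (int row = 0; row < ht; row++)
        for (int col = 0; col < wd; col++)
            pi2_dst[row * dst_strd + col] =
                (int16_t)(pu1_src[row * src_strd + col] << 6);
}
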
diff --git a/common/arm64/ihevc_inter_pred_chroma_horz.s b/common/arm64/ihevc_inter_pred_chroma_horz.s
new file mode 100644
index 0000000..cf4f0f9
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_horz.s
@@ -0,0 +1,771 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//*  ihevc_inter_pred_chroma_horz.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* //author
+//*  yogeswaran rs / akshaya mukund
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*    chroma inter prediction filter for horizontal input
+//*
+//* //par description:
+//*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+//*    to the elements pointed to by 'pu1_src' and writes to the location
+//*    pointed to by 'pu1_dst'. the output is downshifted by 6 and clipped
+//*    to 8 bits.
+//*    assumptions: the function is optimized assuming the width is a
+//*    multiple of 2, 4 or 8. if the width is 2, the height must be a
+//*    multiple of 2. widths of 4 and 8 are optimized further
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
+//                                   uword8 *pu1_dst,
+//                                   word32 src_strd,
+//                                   word32 dst_strd,
+//                                   word8 *pi1_coeff,
+//                                   word32 ht,
+//                                   word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pu1_src
+//x1 => *pu1_dst
+//x2 =>  src_strd
+//x3 =>  dst_strd
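
Before the assembly body, a scalar reference of what the routine computes may help. The chroma samples are interleaved (Cb/Cr), so neighbouring taps of the same component sit two bytes apart and a row spans 2*wd bytes (the lsl x5, x10, #1 below). A hedged C sketch, assuming the standard 4-tap HEVC chroma filter; the names are illustrative, and the negative taps are folded into the signed coefficients rather than split into umlsl/umlal as the assembly does:

#include <stdint.h>

static uint8_t clip_u8(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }

/* 4-tap horizontal chroma filter: taps reach one chroma sample (two
 * bytes) before the current one, matching the pu1_src - 2 adjustment
 * in the assembly; the caller guarantees that margin is readable */
static void chroma_horz_sketch(const uint8_t *pu1_src, uint8_t *pu1_dst,
                               int src_strd, int dst_strd,
                               const int8_t *pi1_coeff, int ht, int wd)
{
    for (int row = 0; row < ht; row++)
        for (int col = 0; col < 2 * wd; col++) {
            int sum = 0;
            for (int k = 0; k < 4; k++)
                sum += pi1_coeff[k] *
                       pu1_src[row * src_strd + col + 2 * (k - 1)];
            /* sqrshrun #6: add the rounding constant, shift and clip */
            pu1_dst[row * dst_strd + col] = clip_u8((sum + 32) >> 6);
        }
}
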
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_horz_av8
+
+.type ihevc_inter_pred_chroma_horz_av8, %function
+
+ihevc_inter_pred_chroma_horz_av8:
+
+    // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+
+    mov         x4,x15                      //loads pi1_coeff
+    mov         x7,x16                      //loads ht
+    mov         x10,x17                     //loads wd
+
+    ld1         {v0.8b},[x4]                //coeff = vld1_s8(pi1_coeff)
+    subs        x14,x7,#0                   //checks for ht == 0
+    abs         v2.8b, v0.8b                //vabs_s8(coeff)
+    mov         x11,#2
+    ble         end_loops
+
+    dup         v24.8b, v2.8b[0]            //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+    sub         x12,x0,#2                   //pu1_src - 2
+    dup         v25.8b, v2.8b[1]            //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+    add         x4,x12,x2                   //pu1_src_tmp2_8 = pu1_src + src_strd
+    dup         v26.8b, v2.8b[2]            //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+
+    tst         x10,#3                      //checks if wd is a multiple of 4
+    lsl         x5, x10, #1
+
+    dup         v27.8b, v2.8b[3]            //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+
+    bne         outer_loop_4
+    cmp         x10,#12
+    beq         skip_16
+
+    cmp         x10,#8
+    bge         outer_loop_16
+skip_16:
+    tst         x7,#3
+
+    sub         x9,x0,#2
+    beq         outer_loop_ht_4             //taken when ht is a multiple of 4
+
+    b           outer_loop_8
+
+
+outer_loop_16:
+    mov         x10,x5                      //2wd
+    mul         x14, x14 , x10
+
+    sub         x20,x3,#16
+    neg         x6, x20
+
+    add         x4,x12,x2
+    mov         x9,#10
+    and         x0, x12, #31
+    sub         x20,x5,x3,lsl #1
+    neg         x8, x20
+    add         x20,x12, x2 , lsl #1
+    prfm        PLDL1KEEP,[x20]
+
+
+
+    add         x19,x12,#8
+    ld1         { v0.2s},[x12],x11          //vector load pu1_src
+    ld1         { v1.2s},[x19],x11          //vector load pu1_src
+    add         x20,x4, x2 , lsl #1
+    prfm        PLDL1KEEP,[x20]
+
+    ld1         { v2.2s},[x12],x11          //vector load pu1_src
+    ld1         { v3.2s},[x19],x11          //vector load pu1_src
+
+    ld1         { v4.2s},[x12],x11          //vector load pu1_src
+    ld1         { v5.2s},[x19],x11          //vector load pu1_src
+
+    ld1         { v6.2s},[x12],x9           //vector load pu1_src
+    ld1         { v7.2s},[x19],x9           //vector load pu1_src
+
+
+    add         x19,x4,#8
+    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         { v8.2s},[x4],x11           //vector load pu1_src
+    ld1         { v9.2s},[x19],x11          //vector load pu1_src
+
+    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         { v10.2s},[x4],x11          //vector load pu1_src
+    ld1         { v11.2s},[x19],x11         //vector load pu1_src
+
+    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    ld1         { v12.2s},[x4],x11          //vector load pu1_src
+    ld1         { v13.2s},[x19],x11         //vector load pu1_src
+
+    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    ld1         { v14.2s},[x4],x9           //vector load pu1_src
+    ld1         { v15.2s},[x19],x9          //vector load pu1_src
+
+    umull       v28.8h, v3.8b, v25.8b
+
+    umlsl       v28.8h, v1.8b, v24.8b
+
+
+    umlal       v28.8h, v5.8b, v26.8b
+
+    umlsl       v28.8h, v7.8b, v27.8b
+
+
+    cmp         x14,#32
+    beq         epilog_end
+    sub         x14, x14,#64
+
+inner_loop_16:
+
+
+
+
+//     bgt            l_2
+
+//    add x20,x12, x2 , lsl #1
+    prfm        PLDL1KEEP,[x20]
+//    add x20,x4, x2 , lsl #1
+    prfm        PLDL1KEEP,[x20]
+
+
+
+    subs        x10,x10,#16
+
+    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+
+    add         x20,x12,x8
+    csel        x12, x20, x12,eq
+    add         x20,x12,x2
+    csel        x4, x20, x4,eq
+    umlsl       v22.8h, v8.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+
+
+    add         x20,x12, x2 , lsl #2
+    prfm        PLDL1KEEP,[x20]
+    sqrshrun    v30.8b, v30.8h,#6
+
+    add         x19,x12,#8
+    ld1         { v0.2s},[x12],x11          //vector load pu1_src
+    ld1         { v1.2s},[x19],x11          //vector load pu1_src
+
+    sqrshrun    v31.8b, v28.8h,#6
+
+
+
+    ld1         { v2.2s},[x12],x11          //vector load pu1_src
+    ld1         { v3.2s},[x19],x11          //vector load pu1_src
+    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+
+
+
+    ld1         { v4.2s},[x12],x11          //vector load pu1_src
+    ld1         { v5.2s},[x19],x11          //vector load pu1_src
+    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+    ld1         { v6.2s},[x12],x9           //vector load pu1_src
+    ld1         { v7.2s},[x19],x9           //vector load pu1_src
+    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    add         x20,x4, x2 , lsl #2
+    prfm        PLDL1KEEP,[x20]
+    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    //mov       v30.s[1],v31.s[0]
+    add         x13,x1,#8
+    st1         { v30.4h}, [x1],x3
+    st1         { v31.4h}, [x13],x3
+    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    add         x19,x4,#8
+    ld1         { v8.2s},[x4],x11           //vector load pu1_src
+    ld1         { v9.2s},[x19],x11          //vector load pu1_src
+    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+    ld1         { v10.2s},[x4],x11          //vector load pu1_src
+    ld1         { v11.2s},[x19],x11         //vector load pu1_src
+    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         { v12.2s},[x4],x11          //vector load pu1_src
+    ld1         { v13.2s},[x19],x11         //vector load pu1_src
+    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         { v14.2s},[x4],x9           //vector load pu1_src
+    ld1         { v15.2s},[x19],x11         //vector load pu1_src
+    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    cmp         x10,#0
+    sqrshrun    v22.8b, v22.8h,#6
+    sqrshrun    v23.8b, v20.8h,#6
+
+
+
+    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    csel        x10, x5, x10,eq             //2wd
+    umull       v28.8h, v3.8b, v25.8b
+
+
+    //add       x13,x1,#8
+    //mov       v22.s[1],v23.s[0]
+    st1         { v22.4h},[x1],x6           //store the result pu1_dst
+    st1         { v23.4h},[x13],x6          //store the result pu1_dst
+    umlsl       v28.8h, v1.8b, v24.8b
+
+
+    add         x20,x1,x8
+    csel        x1, x20, x1,eq
+    umlal       v28.8h, v5.8b, v26.8b
+
+    subs        x14,x14,#32                 //decrement the ht loop
+    umlsl       v28.8h, v7.8b, v27.8b
+
+//      mov            x0, x7
+
+    bgt         inner_loop_16
+
+
+
+    add         x14,x14,#64
+    cmp         x14,#32
+    beq         epilog_end
+
+epilog:
+    sqrshrun    v30.8b, v30.8h,#6
+    sqrshrun    v31.8b, v28.8h,#6
+
+
+
+    add         x13,x1,#8
+    //mov       v30.s[1],v31.s[0]
+    st1         { v30.4h}, [x1],x3
+    st1         { v31.4h}, [x13],x3
+
+    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+
+
+
+    umlsl       v22.8h, v8.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    subs        x10,x10,#16                 //decrement the wd loop
+    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    add         x20,x12,x8
+    csel        x12, x20, x12,eq
+    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    csel        x10, x5, x10,eq             //2wd
+
+
+    add         x20,x12,x2
+    csel        x4, x20, x4,eq
+    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    add         x19,x12,#8
+    ld1         { v0.2s},[x12],x11          //vector load pu1_src
+    ld1         { v1.2s},[x19],x11          //vector load pu1_src
+
+    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    ld1         { v2.2s},[x12],x11          //vector load pu1_src
+    ld1         { v3.2s},[x19],x11          //vector load pu1_src
+    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    ld1         { v4.2s},[x12],x11          //vector load pu1_src
+    ld1         { v5.2s},[x19],x11          //vector load pu1_src
+
+    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    ld1         { v6.2s},[x12],x9           //vector load pu1_src
+    ld1         { v7.2s},[x19],x9           //vector load pu1_src
+    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+
+    add         x19,x4,#8
+    ld1         { v8.2s},[x4],x11           //vector load pu1_src
+    ld1         { v9.2s},[x19],x11          //vector load pu1_src
+    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    ld1         { v10.2s},[x4],x11          //vector load pu1_src
+    ld1         { v11.2s},[x19],x11         //vector load pu1_src
+    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    ld1         { v12.2s},[x4],x11          //vector load pu1_src
+    ld1         { v13.2s},[x19],x11         //vector load pu1_src
+    umull       v28.8h, v3.8b, v25.8b
+    ld1         { v14.2s},[x4],x9           //vector load pu1_src
+    ld1         { v15.2s},[x19],x9          //vector load pu1_src
+    umlsl       v28.8h, v1.8b, v24.8b
+    sqrshrun    v22.8b, v22.8h,#6
+    sqrshrun    v23.8b, v20.8h,#6
+
+    //mov       v22.s[1],v23.s[0]
+    st1         { v22.4h},[x1],x6           //store the result pu1_dst
+    st1         { v23.4h},[x13],x6          //store the result pu1_dst
+    umlal       v28.8h, v5.8b, v26.8b
+
+    umlsl       v28.8h, v7.8b, v27.8b
+    add         x20,x1,x8
+    csel        x1, x20, x1,eq
+
+
+
+epilog_end:
+    sqrshrun    v30.8b, v30.8h,#6
+    sqrshrun    v31.8b, v28.8h,#6
+
+
+    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    umlsl       v22.8h, v8.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    sqrshrun    v22.8b, v22.8h,#6
+    sqrshrun    v23.8b, v20.8h,#6
+
+    add         x13,x1,#8
+
+    //mov       v30.s[1],v31.s[0]
+    st1         { v30.4h}, [x1],x3
+    st1         { v31.4h}, [x13],x3
+
+    //mov       v22.s[1],v23.s[0]
+    st1         { v22.4h},[x1]              //store the result pu1_dst
+    st1         { v23.4h},[x13]             //store the result pu1_dst
+
+
+
+    b           end_loops
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+outer_loop_8:
+
+
+    add         x6,x1,x3                    //pu1_dst + dst_strd
+    mov         x7,x5
+    add         x4,x12,x2                   //pu1_src + src_strd
+
+
+inner_loop_8:
+    //ld1 {v0.2s, v1.2s},[x12],x11                //vector load pu1_src
+    ld1         {v0.2s},[x12],x11           //vector load pu1_src
+    ld1         {v1.2s},[x12],x11           //vector load pu1_src
+    ld1         {v2.2s},[x12],x11           //vector load pu1_src
+    ld1         {v3.2s},[x12],x11           //vector load pu1_src
+
+    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
+    umull       v8.8h, v1.8b, v25.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
+    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
+    umlal       v8.8h, v2.8b, v26.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    umlsl       v8.8h, v3.8b, v27.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    ld1         {v4.2s},[x4],x11            //vector load pu1_src
+    ld1         {v5.2s},[x4],x11            //vector load pu1_src
+    ld1         {v6.2s},[x4],x11            //vector load pu1_src
+    ld1         {v7.2s},[x4],x11            //vector load pu1_src
+    //ld1 {v12.2s, v13.2s},[x4],x11                //vector load pu1_src + src_strd
+    //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
+    umull       v10.8h, v5.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    umlsl       v10.8h, v4.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
+    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
+    sqrshrun    v8.8b, v8.8h,#6             //right shift and saturating narrow result 1
+    umlal       v10.8h, v6.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    umlsl       v10.8h, v7.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    st1         {v8.8b},[x1],#8             //store the result pu1_dst
+
+    sqrshrun    v10.8b, v10.8h,#6           //right shift and saturating narrow result 2
+    subs        x7,x7,#8                    //decrement the wd loop
+    st1         {v10.8b},[x6],#8            //store the result pu1_dst
+    bgt         inner_loop_8
+
+    sub         x12,x12,x5
+    subs        x14,x14,#2                  //decrement the ht loop
+    sub         x1,x1,x5
+    add         x12,x12,x2,lsl #1
+    add         x1,x1,x3,lsl #1
+    bgt         outer_loop_8
+    b           end_loops
+
+//handles the case where the height is a multiple of 4
+outer_loop_ht_4:
+
+    mov         x7,x5
+
+prologue_ht_4:
+
+inner_loop_ht_4:
+
+    mov         x12,x9
+    mov         x4,x1
+
+    sub         x8, x2, #6
+
+    ld1         {v0.2s},[x12],x11           //(1)vector load pu1_src
+    ld1         {v1.2s},[x12],x11           //(1)vector load pu1_src
+    ld1         {v2.2s},[x12],x11           //(1)vector load pu1_src
+    //ld1 {v3.2s},[x12],x2                //(1)vector load pu1_src
+    ld1         {v3.2s},[x12],x8            //(1)vector load pu1_src
+
+    //sub        x12, x12, #6                //(1)
+
+    ld1         {v4.2s},[x12],x11           //(2)vector load pu1_src
+    ld1         {v5.2s},[x12],x11           //(2)vector load pu1_src
+    ld1         {v6.2s},[x12],x11           //(2)vector load pu1_src
+    //ld1 {v7.2s},[x12],x2                //(2)vector load pu1_src
+    ld1         {v7.2s},[x12],x8            //(2)vector load pu1_src
+
+    //sub        x12, x12, #6                //(2)
+
+    ld1         {v14.2s},[x12],x11          //(3)vector load pu1_src
+    umull       v8.8h, v1.8b, v25.8b        //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         {v15.2s},[x12],x11          //(3)vector load pu1_src
+    umlsl       v8.8h, v0.8b, v24.8b        //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v16.2s},[x12],x11          //(3)vector load pu1_src
+    umlal       v8.8h, v2.8b, v26.8b        //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    //ld1 {v17.2s},[x12],x2                //(3)vector load pu1_src
+    ld1         {v17.2s},[x12],x8           //(3)vector load pu1_src
+    umlsl       v8.8h, v3.8b, v27.8b        //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    //sub        x12, x12, #6                //(3)
+    umull       v10.8h, v5.8b, v25.8b       //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         {v18.2s},[x12],x11          //(4)vector load pu1_src
+    umlsl       v10.8h, v4.8b, v24.8b       //(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v19.2s},[x12],x11          //(4)vector load pu1_src
+    umlal       v10.8h, v6.8b, v26.8b       //(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    ld1         {v20.2s},[x12],x11          //(4)vector load pu1_src
+    umlsl       v10.8h, v7.8b, v27.8b       //(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    ld1         {v21.2s},[x12],x2           //(4)vector load pu1_src
+    sqrshrun    v8.8b, v8.8h,#6             //(1)right shift and saturating narrow result 1
+
+    add         x9,x9,#8                    //(core loop)
+
+    subs        x7,x7,#8                    //(prologue)decrement the wd loop
+    beq         epilogue
+
+core_loop:
+    mov         x12,x9
+
+    ld1         {v0.2s},[x12],x11           //(1_1)vector load pu1_src
+    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         {v1.2s},[x12],x11           //(1_1)vector load pu1_src
+    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v2.2s},[x12],x11           //(1_1)vector load pu1_src
+    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    //ld1 {v3.2s},[x12],x2                //(1_1)vector load pu1_src
+    ld1         {v3.2s},[x12],x8            //(1_1)vector load pu1_src
+    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    //sub        x12, x12, #6                //(1_1)
+
+    st1         {v8.8b},[x4],x3             //(1)store the result pu1_dst
+    sqrshrun    v10.8b, v10.8h,#6           //(2)right shift and saturating narrow result 2
+
+    ld1         {v4.2s},[x12],x11           //(2_1)vector load pu1_src
+    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         {v5.2s},[x12],x11           //(2_1)vector load pu1_src
+    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v6.2s},[x12],x11           //(2_1)vector load pu1_src
+    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    //ld1 {v7.2s},[x12],x2                //(2_1)vector load pu1_src
+    ld1         {v7.2s},[x12],x8            //(2_1)vector load pu1_src
+    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    //sub        x12, x12, #6                //(2_1)
+
+    st1         {v10.8b},[x4],x3            //(2)store the result pu1_dst
+    sqrshrun    v12.8b, v12.8h,#6           //(3)right shift and saturating narrow result 1
+
+    ld1         {v14.2s},[x12],x11          //(3_1)vector load pu1_src
+    umull       v8.8h, v1.8b, v25.8b        //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         {v15.2s},[x12],x11          //(3_1)vector load pu1_src
+    umlsl       v8.8h, v0.8b, v24.8b        //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v16.2s},[x12],x11          //(3_1)vector load pu1_src
+    umlal       v8.8h, v2.8b, v26.8b        //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    //ld1 {v17.2s},[x12],x2                //(3_1)vector load pu1_src
+    ld1         {v17.2s},[x12],x8           //(3_1)vector load pu1_src
+    umlsl       v8.8h, v3.8b, v27.8b        //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    //sub        x12, x12, #6                //(3_1)
+
+    st1         {v12.8b},[x4],x3            //(3)store the result pu1_dst
+    sqrshrun    v22.8b, v22.8h,#6           //(4)right shift and saturating narrow result 2
+
+    add         x9,x9,#8                    //(core loop)
+
+    umull       v10.8h, v5.8b, v25.8b       //(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         {v18.2s},[x12],x11          //(4_1)vector load pu1_src
+
+    ld1         {v19.2s},[x12],x11          //(4_1)vector load pu1_src
+    umlsl       v10.8h, v4.8b, v24.8b       //(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v20.2s},[x12],x11          //(4_1)vector load pu1_src
+    umlal       v10.8h, v6.8b, v26.8b       //(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    ld1         {v21.2s},[x12],x2           //(4_1)vector load pu1_src
+    umlsl       v10.8h, v7.8b, v27.8b       //(2_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    add         x1,x1,#8                    //(core loop)
+
+    subs        x7,x7,#8                    //(core loop)
+
+    st1         {v22.8b},[x4], x3           //(4)store the result pu1_dst
+    sqrshrun    v8.8b, v8.8h,#6             //(1_1)right shift and saturating narrow result 1
+
+    mov         x4, x1                      //(core loop)
+
+    bgt         core_loop                   //loopback
+
+epilogue:
+    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    st1         {v8.8b},[x4],x3             //(1)store the result pu1_dst
+    sqrshrun    v10.8b, v10.8h,#6           //(2)right shift and saturating narrow result 2
+
+    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    st1         {v10.8b},[x4],x3            //(2)store the result pu1_dst
+    sqrshrun    v12.8b, v12.8h,#6           //(3)right shift and saturating narrow result 1
+
+    st1         {v12.8b},[x4],x3            //(3)store the result pu1_dst
+
+    add         x1,x1,#8                    //(core loop)
+
+    sqrshrun    v22.8b, v22.8h,#6           //(4)right shift and saturating narrow result 2
+
+
+    st1         {v22.8b},[x4], x3           //(4)store the result pu1_dst
+
+    sub         x9,x9,x5
+    subs        x14,x14,#4                  //decrement the ht loop
+    sub         x1,x1,x5
+    add         x9,x9,x2,lsl #2
+    add         x1,x1,x3,lsl #2
+    bgt         outer_loop_ht_4
+    b           end_loops
+
+outer_loop_4:
+    add         x6,x1,x3                    //pu1_dst + dst_strd
+    mov         x7,x5
+    add         x4,x12,x2                   //pu1_src + src_strd
+
+inner_loop_4:
+    //ld1 {v0.2s, v1.2s},[x12]                    //vector load pu1_src
+
+    ld1         {v20.2s},[x12],x11          //vector load pu1_src
+    ld1         {v21.2s},[x12],x11          //vector load pu1_src
+    ld1         {v22.2s},[x12],x11          //vector load pu1_src
+    ld1         {v23.2s},[x12]              //vector load pu1_src
+
+    sub         x12,x12,#2                  //rewind the input pointer (net advance of 4 after the post-increments)
+    ld1         {v16.2s},[x4],x11           //vector load pu1_src
+    ld1         {v17.2s},[x4],x11           //vector load pu1_src
+    ld1         {v18.2s},[x4],x11           //vector load pu1_src
+    ld1         {v19.2s},[x4]               //vector load pu1_src
+    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
+    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
+    //ld1 {v12.2s, v13.2s},[x4]                    //vector load pu1_src + src_strd
+    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
+
+    sub         x4,x4,#2                    //rewind the input pointer (net advance of 4 after the post-increments)
+    //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
+    //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
+    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
+
+    zip1        v0.2s, v20.2s, v16.2s
+    zip2        v4.2s, v20.2s, v16.2s       //vector zip the i iteration and ii iteration into a single register
+    zip1        v1.2s, v21.2s, v17.2s
+    zip2        v5.2s, v21.2s, v17.2s
+    zip1        v2.2s, v22.2s, v18.2s
+    zip2        v6.2s, v22.2s, v18.2s
+    zip1        v3.2s, v23.2s, v19.2s
+    zip2        v7.2s, v23.2s, v19.2s
+
+    umull       v8.8h, v1.8b, v25.8b        //filter the i and ii iterations at the same time
+    umlsl       v8.8h, v0.8b, v24.8b
+    umlal       v8.8h, v2.8b, v26.8b
+    umlsl       v8.8h, v3.8b, v27.8b
+
+    sqrshrun    v8.8b, v8.8h,#6             //rounding right shift and saturating narrow of the result
+    st1         {v8.s}[0],[x1],#4           //store the i iteration result from the lower part of the register
+    subs        x7,x7,#4                    //decrement the wd by 4
+
+    st1         {v8.s}[1],[x6],#4           //store the ii iteration result from the upper part of the register
+
+    bgt         inner_loop_4
+
+    sub         x12,x12,x5
+    subs        x14,x14,#2                  //decrement the ht by 2
+    sub         x1,x1,x5
+    add         x12,x12,x2,lsl #1
+    add         x1,x1,x3,lsl #1
+    bgt         outer_loop_4
+
+end_loops:
+
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
+
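
A note on the loop structure used in this file: the prologue_ht_4 / core_loop / epilogue labels implement software pipelining, issuing the loads for iteration N+1 while the multiplies and stores of iteration N complete, then draining the last iteration separately. A schematic C rendering of the idea; the buffer size, names and per-row granularity are illustrative (the assembly overlaps individual instructions, not whole rows):

#include <stdint.h>
#include <string.h>

/* double-buffered row loop: fetch row i+1 while computing row i */
static void pipelined_rows_sketch(const uint8_t *src, int16_t *dst,
                                  int stride, int rows, int wd)
{
    uint8_t bufa[64], bufb[64];              /* assumes wd <= 64 here */
    uint8_t *cur = bufa, *nxt = bufb;

    memcpy(cur, src, wd);                    /* prologue: load row 0 */
    for (int i = 0; i < rows; i++) {
        if (i + 1 < rows)                    /* core loop: prefetch next row */
            memcpy(nxt, src + (size_t)(i + 1) * stride, wd);
        for (int col = 0; col < wd; col++)   /* compute and store row i */
            dst[i * wd + col] = (int16_t)(cur[col] << 6);
        uint8_t *t = cur; cur = nxt; nxt = t;   /* swap buffers */
    }                                        /* epilogue folds into last pass */
}
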
diff --git a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
new file mode 100644
index 0000000..a35fdaa
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
@@ -0,0 +1,798 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//*  ihevc_inter_pred_chroma_horz_w16out.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* //author
+//*  yogeswaran rs / akshaya mukund
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*    chroma inter prediction filter to store horizontal 16-bit output
+//*
+//* //par description:
+//*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+//*    to the elements pointed to by 'pu1_src' and writes to the location
+//*    pointed to by 'pi2_dst'. no downshifting or clipping is done and the
+//*    output is used as an input for vertical filtering or weighted prediction
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[out] pi2_dst
+//*  word16 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
+//                                          word16 *pi2_dst,
+//                                          word32 src_strd,
+//                                          word32 dst_strd,
+//                                          word8 *pi1_coeff,
+//                                          word32 ht,
+//                                          word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pu1_src
+//x1 => *pi2_dst
+//x2 =>  src_strd
+//x3 =>  dst_strd
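
This variant differs from ihevc_inter_pred_chroma_horz only in the store stage: the raw filtered sum is written out as a 16-bit value with no rounding, downshift or clipping, so it can feed vertical filtering or weighted prediction. A hedged scalar sketch under the same assumptions as before (4-tap filter, interleaved chroma, illustrative names):

#include <stdint.h>

/* same 4-tap interleaved filter, but the intermediate sum is kept at
 * 16-bit precision instead of being narrowed back to 8 bits */
static void chroma_horz_w16out_sketch(const uint8_t *pu1_src,
                                      int16_t *pi2_dst,
                                      int src_strd, int dst_strd,
                                      const int8_t *pi1_coeff,
                                      int ht, int wd)
{
    for (int row = 0; row < ht; row++)
        for (int col = 0; col < 2 * wd; col++) {
            int sum = 0;
            for (int k = 0; k < 4; k++)
                sum += pi1_coeff[k] *
                       pu1_src[row * src_strd + col + 2 * (k - 1)];
            pi2_dst[row * dst_strd + col] = (int16_t)sum;  /* no sqrshrun */
        }
}
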
+
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_horz_w16out_av8
+
+
+.type ihevc_inter_pred_chroma_horz_w16out_av8, %function
+
+ihevc_inter_pred_chroma_horz_w16out_av8:
+
+    // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+    mov         x4,x15                      //loads pi1_coeff
+    mov         x6,x16                      //loads ht
+    mov         x10,x17                     //loads wd
+
+    ld1         {v0.8b},[x4]                //coeff = vld1_s8(pi1_coeff)
+    subs        x14,x6,#0                   //checks for ht == 0
+    abs         v2.8b, v0.8b                //vabs_s8(coeff)
+
+//******* added
+    mov         x11, #2
+//******* added ends
+
+    ble         end_loops
+
+    dup         v24.8b, v2.8b[0]            //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+    sub         x12,x0,#2                   //pu1_src - 2
+    dup         v25.8b, v2.8b[1]            //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+    add         x4,x12,x2                   //pu1_src_tmp2_8 = pu1_src + src_strd
+    dup         v26.8b, v2.8b[2]            //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+
+    tst         x10,#3                      //checks wd for multiples of 4
+    lsl         x5, x10, #1                 //2wd
+
+    dup         v27.8b, v2.8b[3]            //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+
+    and         x7,x14,#1                   //added                //ht_residue = (ht & 1)
+    sub         x14,x14,x7                  //added                //decrement height by ht_residue (the residue row is handled separately at the end)
+
+    bne         outer_loop_4                // this branching happens when the width is 2 or 6
+
+    cmp         x10,#12
+    beq         skip_16
+
+    cmp         x10,#8
+    bge         outer_loop_16
+
+skip_16:
+    tst         x6,#3
+
+//******* removal
+    //mov        x11,#8
+//******* removal ends
+
+    sub         x9,x0,#2
+    beq         outer_loop_ht_4             //this branching happens when the height is a multiple of 4
+
+
+
+//     cmp        x10,#12
+//     beq     outer_loop_8
+//     cmp        x10,#16
+//     bge    outer_loop_16
+    b           outer_loop_8
+
+
+
+outer_loop_16:
+    add         x4,x12,x2
+
+
+    and         x0, x12, #31
+    add         x20,x12, x2 , lsl #1
+    prfm        PLDL1KEEP,[x20]
+
+
+
+
+
+
+    add         x19,x12,#8
+    ld1         { v0.2s},[x12],x11          //vector load pu1_src
+    ld1         { v1.2s},[x19],x11          //vector load pu1_src
+    mov         x10,x5                      //2wd
+    mul         x14, x14 , x10
+    ld1         { v2.2s},[x12],x11          //vector load pu1_src
+    ld1         { v3.2s},[x19],x11          //vector load pu1_src
+    add         x20,x4, x2 , lsl #1
+    prfm        PLDL1KEEP,[x20]
+    mov         x9,#10
+    ld1         { v4.2s},[x12],x11          //vector load pu1_src
+    ld1         { v5.2s},[x19],x11          //vector load pu1_src
+    sub         x20,x3,#8
+    neg         x6, x20
+    sub         x8,x3,#8
+    ld1         { v6.2s},[x12],x9           //vector load pu1_src
+    ld1         { v7.2s},[x19],x9           //vector load pu1_src
+
+
+    add         x19,x4,#8
+    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         { v8.2s},[x4],x11           //vector load pu1_src
+    ld1         { v9.2s},[x19],x11          //vector load pu1_src
+
+    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         { v10.2s},[x4],x11          //vector load pu1_src
+    ld1         { v11.2s},[x19],x11         //vector load pu1_src
+
+    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    ld1         { v12.2s},[x4],x11          //vector load pu1_src
+    ld1         { v13.2s},[x19],x11         //vector load pu1_src
+
+    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    ld1         { v14.4s},[x4],x9           //vector load pu1_src
+    ld1         { v15.2s},[x19],x9          //vector load pu1_src
+
+    umull       v28.8h, v3.8b, v25.8b
+    lsl         x6,x6,#1
+    sub         x20,x5,x3,lsl #1
+    neg         x3, x20
+    umlsl       v28.8h, v1.8b, v24.8b
+    lsl         x8,x8,#1
+    sub         x20,x5,x2,lsl #1
+    neg         x7, x20
+    umlal       v28.8h, v5.8b, v26.8b
+
+    umlsl       v28.8h, v7.8b, v27.8b
+    cmp         x14,#32
+    beq         epilog_end
+    sub         x14, x14,#64
+
+inner_loop_16:
+
+    // and            x7, x12, #31                    //decrement the wd loop
+    // cmp            x7, x0
+    add         x20,x12, x2 , lsl #2
+    prfm        PLDL1KEEP,[x20]
+    add         x20,x4, x2 , lsl #2
+    prfm        PLDL1KEEP,[x20]
+
+
+    subs        x10,x10,#16
+
+    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+
+
+//     add x20,x12,x2,lsl #1
+    //csel x12, x20, x12,eq
+//     sub x20,x12,x5
+    //csel x12, x20, x12,eq
+    add         x20,x12,x7
+    csel        x12, x20, x12,eq
+    add         x20,x12,x2
+    csel        x4, x20, x4,eq
+
+
+    st1         { v30.8h}, [x1],#16
+    umlsl       v22.8h, v8.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+
+
+
+    add         x19,x12,#8
+    ld1         { v0.2s},[x12],x11          //vector load pu1_src
+    ld1         { v1.2s},[x19],x11          //vector load pu1_src
+    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+
+
+
+    ld1         { v2.2s},[x12],x11          //vector load pu1_src
+    ld1         { v3.2s},[x19],x11          //vector load pu1_src
+    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+    ld1         { v4.2s},[x12],x11          //vector load pu1_src
+    ld1         { v5.2s},[x19],x11          //vector load pu1_src
+    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    st1         { v28.8h}, [x1],x8
+    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         { v6.2s},[x12],x9           //vector load pu1_src
+    ld1         { v7.2s},[x19],x9           //vector load pu1_src
+    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    add         x19,x4,#8
+    ld1         { v8.2s},[x4],x11           //vector load pu1_src
+    ld1         { v9.2s},[x19],x11          //vector load pu1_src
+    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+    ld1         { v10.2s},[x4],x11          //vector load pu1_src
+    ld1         { v11.2s},[x19],x11         //vector load pu1_src
+    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         { v12.2s},[x4],x11          //vector load pu1_src
+    ld1         { v13.2s},[x19],x11         //vector load pu1_src
+    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         { v14.2s},[x4],x9           //vector load pu1_src
+    ld1         { v15.2s},[x19],x9          //vector load pu1_src
+    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    st1         { v22.8h},[x1],#16          //store the result pu1_dst
+    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    csel        x10, x5, x10,eq             //2wd
+    umull       v28.8h, v3.8b, v25.8b
+
+
+
+    umlsl       v28.8h, v1.8b, v24.8b
+    st1         { v20.8h},[x1],x6           //store the result pu1_dst
+
+
+    add         x20,x1,x3,lsl #1
+    csel        x1, x20, x1,eq
+    umlal       v28.8h, v5.8b, v26.8b
+
+    subs        x14,x14,#32                 //decrement the ht loop
+    umlsl       v28.8h, v7.8b, v27.8b
+
+
+
+//     mov            x0, x7
+    bgt         inner_loop_16
+
+
+
+    add         x14,x14,#64
+    cmp         x14,#32
+    beq         epilog_end
+
+epilog:
+
+    st1         { v30.8h}, [x1],#16
+    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    st1         { v28.8h}, [x1],x8
+
+
+
+    umlsl       v22.8h, v8.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    subs        x10,x10,#16                 //decrement the wd loop
+    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+//     add x20,x12,x2,lsl #1
+    //csel x12, x20, x12,eq
+    add         x20,x12,x7
+    csel        x12, x20, x12,eq
+    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    // sub x20,x12,x5
+    //csel x12, x20, x12,eq
+    csel        x10, x5, x10,eq             //2wd
+    add         x20,x12,x2
+    csel        x4, x20, x4,eq
+    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    add         x19,x12,#8
+    ld1         { v0.2s},[x12],x11          //vector load pu1_src
+    ld1         { v1.2s},[x19],x11          //vector load pu1_src
+
+    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         { v2.2s},[x12],x11          //vector load pu1_src
+    ld1         { v3.2s},[x19],x11          //vector load pu1_src
+
+    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    ld1         { v4.2s},[x12],x11          //vector load pu1_src
+    ld1         { v5.2s},[x19],x11          //vector load pu1_src
+
+    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    ld1         { v6.2s},[x12],x9           //vector load pu1_src
+    ld1         { v7.2s},[x19],x9           //vector load pu1_src
+    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    add         x19,x4,#8
+    ld1         { v8.2s},[x4],x11           //vector load pu1_src
+    ld1         { v9.2s},[x19],x11          //vector load pu1_src
+    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         { v10.2s},[x4],x11          //vector load pu1_src
+    ld1         { v11.2s},[x19],x11         //vector load pu1_src
+    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    ld1         { v12.2s},[x4],x11          //vector load pu1_src
+    ld1         { v13.2s},[x19],x11         //vector load pu1_src
+    umull       v28.8h, v3.8b, v25.8b
+
+    ld1         { v14.2s},[x4],x9           //vector load pu1_src
+    ld1         { v15.2s},[x19],x9          //vector load pu1_src
+
+    umlsl       v28.8h, v1.8b, v24.8b
+    st1         { v22.8h},[x1],#16          //store the result pu1_dst
+    umlal       v28.8h, v5.8b, v26.8b
+    st1         { v20.8h},[x1],x6           //store the result pu1_dst
+    umlsl       v28.8h, v7.8b, v27.8b
+    add         x20,x1,x3,lsl #1
+    csel        x1, x20, x1,eq
+
+
+epilog_end:
+
+    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    umlsl       v22.8h, v8.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+    st1         { v30.8h}, [x1],#16
+    st1         { v28.8h}, [x1],x8
+    st1         { v22.8h},[x1],#16          //store the result pu1_dst
+    st1         { v20.8h},[x1],x6           //store the result pu1_dst
+
+
+    mov         x6,x16                      //loads ht
+
+    and         x7,x6,#1
+
+    cmp         x7,#0
+    mov         x10,x5
+    add         x20,x12,x2,lsl #1
+    csel        x12, x20, x12,ne
+    sub         x20,x12,x5
+    csel        x12, x20, x12,ne
+    add         x20,x1,x3,lsl #1
+    csel        x1, x20, x1,ne
+
+
+    bgt         loop_residue_4
+
+    b           end_loops
+
+
+
+
+outer_loop_8:
+
+    add         x6,x1,x3,lsl #1             //pu1_dst + dst_strd
+    mov         x10,x5                      //2wd
+    add         x4,x12,x2                   //pu1_src + src_strd
+
+inner_loop_8:
+    //ld1 {v0.2s, v1.2s},[x12],x11                //vector load pu1_src
+    ld1         {v0.2s},[x12],x11           //vector load pu1_src
+    ld1         {v1.2s},[x12],x11           //vector load pu1_src
+    ld1         {v2.2s},[x12],x11           //vector load pu1_src
+    ld1         {v3.2s},[x12],x11           //vector load pu1_src
+
+
+    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
+    umull       v8.8h, v1.8b, v25.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
+    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
+    umlal       v8.8h, v2.8b, v26.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    umlsl       v8.8h, v3.8b, v27.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    //ld1 {v12.2s, v13.2s},[x4],x11                //vector load pu1_src + src_strd
+    ld1         {v4.2s},[x4],x11            //vector load pu1_src
+    ld1         {v5.2s},[x4],x11            //vector load pu1_src
+    ld1         {v6.2s},[x4],x11            //vector load pu1_src
+    ld1         {v7.2s},[x4],x11            //vector load pu1_src
+    //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
+    umull       v10.8h, v5.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    umlsl       v10.8h, v4.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
+    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
+    umlal       v10.8h, v6.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    umlsl       v10.8h, v7.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    st1         {v8.8h}, [x1],#16
+
+    subs        x10,x10,#8                  //decrement the wd loop
+    st1         {v10.8h},[x6],#16           //store the result pu1_dst
+    bgt         inner_loop_8
+
+    sub         x12,x12,x5
+    subs        x14,x14,#2                  //decrement the ht loop
+    sub         x1,x1,x5,lsl #1
+    add         x12,x12,x2,lsl #1
+    add         x1,x1,x3,lsl #2
+    bgt         outer_loop_8
+
+    cmp         x7,#0
+    mov         x10,x5
+    bgt         loop_residue_4
+
+    b           end_loops
+
+
+
+//handles the case where the height is a multiple of 4
+outer_loop_ht_4:
+
+    mov         x10,x5
+
+prologue_ht_4:
+    lsl         x8, x3, #1
+
+inner_loop_ht_4:
+
+    mov         x12,x9
+    mov         x4,x1
+
+    sub         x0, x2, #6                  // not sure if x0 needs to be preserved
+
+    ld1         {v0.2s},[x12],x11           //(1)vector load pu1_src
+    ld1         {v1.2s},[x12],x11           //(1)vector load pu1_src
+    ld1         {v2.2s},[x12],x11           //(1)vector load pu1_src
+    ld1         {v3.2s},[x12],x0            //(1)vector load pu1_src
+
+    ld1         {v4.2s},[x12],x11           //(2)vector load pu1_src
+    ld1         {v5.2s},[x12],x11           //(2)vector load pu1_src
+    ld1         {v6.2s},[x12],x11           //(2)vector load pu1_src
+    ld1         {v7.2s},[x12],x0            //(2)vector load pu1_src
+
+    ld1         {v14.2s},[x12],x11          //(3)vector load pu1_src
+    umull       v8.8h, v1.8b, v25.8b        //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         {v15.2s},[x12],x11          //(3)vector load pu1_src
+    umlsl       v8.8h, v0.8b, v24.8b        //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v16.2s},[x12],x11          //(3)vector load pu1_src
+    umlal       v8.8h, v2.8b, v26.8b        //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    ld1         {v17.2s},[x12],x0           //(3)vector load pu1_src
+    umlsl       v8.8h, v3.8b, v27.8b        //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    ld1         {v18.2s},[x12],x11          //(4)vector load pu1_src
+    umull       v10.8h, v5.8b, v25.8b       //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         {v19.2s},[x12],x11          //(4)vector load pu1_src
+    umlsl       v10.8h, v4.8b, v24.8b       //(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v20.2s},[x12],x11          //(4)vector load pu1_src
+    umlal       v10.8h, v6.8b, v26.8b       //(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    ld1         {v21.2s},[x12],x2           //(4)vector load pu1_src
+    umlsl       v10.8h, v7.8b, v27.8b       //(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    add         x9,x9,#8                    //(core loop)
+
+    subs        x10,x10,#8                  //(prologue)decrement the wd loop
+    beq         epilogue
+
+core_loop:
+    st1         {v8.8h},[x4],x8             //(1)store the result pu1_dst
+    mov         x12,x9
+
+    ld1         {v0.2s},[x12],x11           //(1_1)vector load pu1_src
+    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         {v1.2s},[x12],x11           //(1_1)vector load pu1_src
+    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v2.2s},[x12],x11           //(1_1)vector load pu1_src
+    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    ld1         {v3.2s},[x12],x0            //(1_1)vector load pu1_src
+    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    st1         {v10.8h},[x4],x8            //(2)store the result pu1_dst
+    add         x9,x9,#8                    //(core loop)
+
+    ld1         {v4.2s},[x12],x11           //(2_1)vector load pu1_src
+    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         {v5.2s},[x12],x11           //(2_1)vector load pu1_src
+    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v6.2s},[x12],x11           //(2_1)vector load pu1_src
+    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    ld1         {v7.2s},[x12],x0            //(2_1)vector load pu1_src
+    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    st1         {v12.8h},[x4],x8            //(3)store the result pu1_dst
+    add         x1,x1,#16                   //(core loop)
+
+    ld1         {v14.2s},[x12],x11          //(3_1)vector load pu1_src
+    umull       v8.8h, v1.8b, v25.8b        //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    ld1         {v15.2s},[x12],x11          //(3_1)vector load pu1_src
+    umlsl       v8.8h, v0.8b, v24.8b        //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v16.2s},[x12],x11          //(3_1)vector load pu1_src
+    umlal       v8.8h, v2.8b, v26.8b        //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    ld1         {v17.2s},[x12],x0           //(3_1)vector load pu1_src
+    umlsl       v8.8h, v3.8b, v27.8b        //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    st1         {v22.8h}, [x4], x8          //(4)store the result pu1_dst
+    subs        x10,x10,#8                  //(core loop)
+
+    umull       v10.8h, v5.8b, v25.8b       //(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         {v18.2s},[x12],x11          //(4_1)vector load pu1_src
+
+    ld1         {v19.2s},[x12],x11          //(4_1)vector load pu1_src
+    umlsl       v10.8h, v4.8b, v24.8b       //(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    ld1         {v20.2s},[x12],x11          //(4_1)vector load pu1_src
+    umlal       v10.8h, v6.8b, v26.8b       //(2_1)mul_res = vmlal_u8(src[0_0], coeffabs_0)//
+
+    mov         x4, x1                      //(core loop)
+
+    ld1         {v21.2s},[x12],x0           //(4_1)vector load pu1_src
+    umlsl       v10.8h, v7.8b, v27.8b       //(2_1)mul_res = vmlsl_u8(src[0_1], coeffabs_1)//
+
+
+
+    bgt         core_loop                   //loopback
+
+epilogue:
+    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlal_u8(src[0_0], coeffabs_0)//
+
+    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlsl_u8(src[0_1], coeffabs_1)//
+
+    st1         {v8.8h},[x4], x8            //(1)store the result pu1_dst
+
+    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlal_u8(src[0_0], coeffabs_0)//
+
+    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlsl_u8(src[0_1], coeffabs_1)//
+
+    st1         {v10.8h},[x4], x8           //(2)store the result pu1_dst
+
+    st1         {v12.8h},[x4], x8           //(3)store the result pu1_dst
+
+    add         x1,x1,#16                   //(epilogue)
+
+    st1         {v22.8h},[x4], x8           //(4)store the result pu1_dst
+
+    sub         x9,x9,x5
+    subs        x14,x14,#4                  //decrement the ht loop
+    sub         x1,x1,x5,lsl #1
+    add         x9,x9,x2,lsl #2
+    add         x1,x1,x3,lsl #3
+    bgt         outer_loop_ht_4
+
+    cmp         x7,#0
+    mov         x10,x5
+    csel        x12, x9, x12,gt
+    csel        x4, x1, x4,gt
+    bgt         loop_residue_4
+
+    b           end_loops
+
+outer_loop_4:
+    add         x6,x1,x3,lsl #1             //pu1_dst + dst_strd
+    mov         x10,x5
+    add         x4,x12,x2                   //pu1_src + src_strd
+
+inner_loop_4:
+    //ld1 {v0.2s, v1.2s},[x12]                    //vector load pu1_src
+    ld1         {v20.2s},[x12],x11          //vector load pu1_src
+    ld1         {v21.2s},[x12],x11          //vector load pu1_src
+    ld1         {v22.2s},[x12],x11          //vector load pu1_src
+    ld1         {v23.2s},[x12]              //vector load pu1_src
+
+//**** removal
+    //add        x12,x12,#4                        //increment the input pointer
+//**** removal ends
+//**** addn
+    sub         x12,x12,#2                  //decrement the input pointer
+//**** addn ends
+    ld1         {v16.2s},[x4],x11           //vector load pu1_src
+    ld1         {v17.2s},[x4],x11           //vector load pu1_src
+    ld1         {v18.2s},[x4],x11           //vector load pu1_src
+    ld1         {v19.2s},[x4]               //vector load pu1_src
+    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
+    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
+    //ld1 {v12.2s, v13.2s},[x4]                    //vector load pu1_src + src_strd
+    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
+
+    //add        x4,x4,#4                        //increment the input pointer
+    sub         x4,x4,#2
+    //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
+    //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
+    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
+
+//**** removal
+    //zip1 v0.2s, v0.2s, v12.2s
+    //zip2  v12.2s, v0.2s, v12.2s                             //vector zip the i iteration and ii iteration in single register
+    //zip1 v2.2s, v2.2s, v14.2s
+    //zip2  v14.2s, v2.2s, v14.2s
+    //zip1 v4.2s, v4.2s, v16.2s
+    //zip2  v16.2s, v4.2s, v16.2s
+    //zip1 v6.2s, v6.2s, v18.2s
+    //zip2  v18.2s, v6.2s, v18.2s
+//**** removal ends
+//**** addn
+    zip1        v0.2s, v20.2s, v16.2s
+    zip2        v4.2s, v20.2s, v16.2s       //vector zip the i iteration and ii iteration in single register
+    zip1        v1.2s, v21.2s, v17.2s
+    zip2        v5.2s, v21.2s, v17.2s
+    zip1        v2.2s, v22.2s, v18.2s
+    zip2        v6.2s, v22.2s, v18.2s
+    zip1        v3.2s, v23.2s, v19.2s
+    zip2        v7.2s, v23.2s, v19.2s
+//**** addn ends
+
+    umull       v8.8h, v1.8b, v25.8b        //arithmetic operations for the i and ii iterations at the same time
+    umlsl       v8.8h, v0.8b, v24.8b
+    umlal       v8.8h, v2.8b, v26.8b
+    umlsl       v8.8h, v3.8b, v27.8b
+
+    st1         {v8.d}[0],[x1],#8           //store the i iteration result from the lower half of the register
+    subs        x10,x10,#4                  //decrement the wd by 4
+
+    st1         {v8.d}[1],[x6],#8           //store the ii iteration result from the upper half of the register
+
+    bgt         inner_loop_4
+
+    sub         x12,x12,x5
+    subs        x14,x14,#2                  //decrement the ht by 2
+    sub         x1,x1,x5,lsl #1
+    add         x12,x12,x2,lsl #1
+    add         x1,x1,x3,lsl #2
+    bgt         outer_loop_4
+
+    cmp         x7,#0
+    mov         x10,x5
+    beq         end_loops
+
+loop_residue_4:
+
+    mov         x10,x5                      //2wd
+
+loop_residue:
+
+    //ld1 {v0.2s, v1.2s},[x12]                    //vector load pu1_src
+    ld1         {v20.2s},[x12],x11          //vector load pu1_src
+    ld1         {v21.2s},[x12],x11          //vector load pu1_src
+    ld1         {v22.2s},[x12],x11          //vector load pu1_src
+    ld1         {v23.2s},[x12]              //vector load pu1_src
+    //vext.u8        d2,d0,d1,#2                //vector extract of src[0_2]
+    //umull v8.8h, v2.8b, v25.8b                //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    //umlsl v8.8h, v0.8b, v24.8b                //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    //vext.u8        d4,d0,d1,#4                //vector extract of src[0_4]
+    //add            x12,x12,#4                //pu1_src + 4
+    sub         x12, x12, #2
+    //vext.u8        d6,d0,d1,#6                //vector extract of src[0_6]
+    //umlal v8.8h, v4.8b, v26.8b                //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    //umlsl v8.8h, v6.8b, v27.8b                //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    umull       v8.8h, v21.8b, v25.8b
+    umlsl       v8.8h, v20.8b, v24.8b
+    umlal       v8.8h, v22.8b, v26.8b
+    umlsl       v8.8h, v23.8b, v27.8b
+
+    st1         {v8.1d},[x1]                //store the result pu1_dst
+    subs        x10,x10,#4                  //decrement the wd loop
+    add         x1,x1,#8                    //pi2_dst + 8
+
+    bgt         loop_residue                //loop again
+
+    //inner loop ends
+    //add            x8,x3,lsl #1            //2*dst_strd
+    //sub             x8,x8,x5,lsl #1            //2*dst_strd - 2wd
+    //sub             x9,x2,x5                //src_strd - 2wd
+    //subs             x7,x7,#1                //decrement the ht loop
+    //add             x12,x12,x9                //pu1_src + src_strd
+    //add            x1,x1,x8                //pu1_dst + 2*dst_strd
+    //bgt              outer_loop_residue_4    //loop again
+    //b                 end_loops                //jumps to end
+
+end_loops:
+
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert.s b/common/arm64/ihevc_inter_pred_chroma_vert.s
new file mode 100644
index 0000000..2de789f
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_vert.s
@@ -0,0 +1,405 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//*  ihevc_inter_pred_chroma_vert.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//*  functions are coded in neon assembly and can be compiled using rvct
+//*
+//* //author
+//*  yogeswaran rs
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*   chroma interprediction filter for vertical input
+//*
+//* //par description:
+//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//*    the elements pointed to by 'pu1_src' and writes to the location pointed
+//*    to by 'pu1_dst'. the output is rounded, downshifted by 6 and clipped to
+//*    8 bits. assumptions: width is a multiple of 2, 4 or 8 and height is a
+//*    multiple of 2; widths of 4 and 8 are optimized further
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
+//                                   uword8 *pu1_dst,
+//                                   word32 src_strd,
+//                                   word32 dst_strd,
+//                                   word8 *pi1_coeff,
+//                                   word32 ht,
+//                                   word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pu1_src
+//x1 => *pu1_dst
+//x2 =>  src_strd
+//x3 =>  dst_strd
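+//
+//a scalar c sketch of the operation implemented below (an illustration
+//only, not part of the build): a 4-tap vertical filter on interleaved
+//chroma samples, rounded, downshifted by 6 and clamped to 8 bits.
+//clip_u8 is a hypothetical helper that clamps to [0, 255].
+//
+//    word32 row, col, i, sum;
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < 2 * wd; col++)      //interleaved cb/cr
+//        {
+//            sum = 0;
+//            for(i = 0; i < 4; i++)             //src starts one row above
+//                sum += pi1_coeff[i] * pu1_src[(row + i - 1) * src_strd + col];
+//            pu1_dst[row * dst_strd + col] = clip_u8((sum + 32) >> 6);
+//        }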
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_vert_av8
+
+.type ihevc_inter_pred_chroma_vert_av8, %function
+
+ihevc_inter_pred_chroma_vert_av8:
+
+    // stmfd sp!,{x4-x12,x14}        //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+    mov         x4,x16                      //loads ht
+    mov         x12,x15                     //loads pi1_coeff
+    cmp         x4,#0                       //checks ht == 0
+    mov         x6,x17                      //loads wd
+    sub         x0,x0,x2                    //pu1_src - src_strd
+    ld1         {v0.8b},[x12]               //loads pi1_coeff
+
+    ble         end_loops                   //jumps to end
+
+    tst         x6,#3                       //checks (wd & 3)
+    abs         v3.8b, v0.8b                //vabs_s8(coeff)
+    lsl         x10,x6,#1                   //2*wd
+    dup         v0.8b, v3.8b[0]             //coeffabs_0
+    dup         v1.8b, v3.8b[1]             //coeffabs_1
+    dup         v2.8b, v3.8b[2]             //coeffabs_2
+    dup         v3.8b, v3.8b[3]             //coeffabs_3
+
+    bgt         outer_loop_wd_2             //jumps to loop handling wd ==2
+
+    tst         x4,#7                       //checks ht for mul of 8
+    beq         core_loop_ht_8              //when height is multiple of 8
+
+    lsl         x7,x3,#1                    //2*dst_strd
+    sub         x9,x7,x10                   //2*dst_strd - 2wd
+    lsl         x12,x2,#1                   //2*src_strd
+    sub         x8,x12,x10                  //2*src_strd - 2wd
+    mov         x5,x10                      //2wd
+
+inner_loop_ht_2:                            //called when wd is multiple of 4 and ht is 4,2
+
+    add         x6,x0,x2                    //pu1_src +src_strd
+    ld1         {v9.8b},[x6],x2             //loads pu1_src
+    subs        x5,x5,#8                    //2wd - 8
+    ld1         {v5.8b},[x0],#8             //loads src
+    umull       v6.8h, v9.8b, v1.8b         //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+    ld1         {v4.8b},[x6],x2             //loads incremented src
+    umlsl       v6.8h, v5.8b, v0.8b         //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
+    ld1         {v8.8b},[x6],x2             //loads incremented src
+    umlal       v6.8h, v4.8b, v2.8b         //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
+    umull       v4.8h, v4.8b, v1.8b
+    umlsl       v6.8h, v8.8b, v3.8b
+    umlsl       v4.8h, v9.8b, v0.8b
+    ld1         {v10.8b},[x6]               //loads the incremented src
+    umlal       v4.8h, v8.8b, v2.8b
+    sqrshrun    v6.8b, v6.8h,#6             //shifts right
+    umlsl       v4.8h, v10.8b, v3.8b
+    add         x6,x1,x3                    //pu1_dst + dst_strd
+    sqrshrun    v4.8b, v4.8h,#6             //shifts right
+    st1         {v6.8b},[x1],#8             //stores the loaded value
+
+    st1         {v4.8b},[x6]                //stores the loaded value
+
+    bgt         inner_loop_ht_2             //inner loop again
+
+    subs        x4,x4,#2                    //ht - 2
+    add         x1,x1,x9                    //pu1_dst += (2*dst_strd - 2wd)
+    mov         x5,x10                      //2wd
+    add         x0,x0,x8                    //pu1_src += (2*src_strd - 2wd)
+
+    bgt         inner_loop_ht_2             //loop again
+
+    b           end_loops                   //jumps to end
+
+outer_loop_wd_2:                            //called when width is multiple of 2
+    lsl         x5,x3,#1                    //2*dst_strd
+    mov         x12,x10                     //2wd
+    sub         x9,x5,x10                   //2*dst_strd - 2wd
+    lsl         x7,x2,#1                    //2*src_strd
+    sub         x8,x7,x10                   //2*src_strd - 2wd
+
+inner_loop_wd_2:
+
+    add         x6,x0,x2                    //pu1_src + src_strd
+    ld1         {v6.s}[0],[x0]              //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)
+    subs        x12,x12,#4                  //2wd - 4
+    add         x0,x0,#4                    //pu1_src + 4
+    ld1         {v6.s}[1],[x6],x2           //loads pu1_src_tmp
+    dup         v7.2s, v6.2s[1]
+    ld1         {v7.s}[1],[x6],x2           //loads pu1_src_tmp
+    umull       v4.8h, v7.8b, v1.8b         //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+    dup         v7.2s, v7.2s[1]
+    ld1         {v7.s}[1],[x6],x2
+    umlsl       v4.8h, v6.8b, v0.8b
+    umlal       v4.8h, v7.8b, v2.8b
+    dup         v7.2s, v7.2s[1]
+    ld1         {v7.s}[1],[x6]
+    add         x6,x1,x3                    //pu1_dst + dst_strd
+    umlsl       v4.8h, v7.8b, v3.8b
+    sqrshrun    v4.8b, v4.8h,#6             //vrshrq_n_s16(vreinterpretq_s16_u16(mul_res1),6)
+    st1         {v4.s}[0],[x1]              //stores the loaded value
+    add         x1,x1,#4                    //pu1_dst += 4
+    st1         {v4.s}[1],[x6]              //stores the loaded value
+
+    bgt         inner_loop_wd_2             //inner loop again
+
+    //inner loop ends
+    subs        x4,x4,#2                    //ht - 2
+    add         x1,x1,x9                    //pu1_dst += 2*dst_strd - 2*wd
+    mov         x12,x10                     //2wd
+    add         x0,x0,x8                    //pu1_src += 2*src_strd - 2*wd
+
+    bgt         inner_loop_wd_2             //loop again
+
+    b           end_loops                   //jumps to end
+
+core_loop_ht_8:                             //when wd & ht is multiple of 8
+
+    lsl         x12,x3,#2                   //4*dst_strd
+    sub         x8,x12,x10                  //4*dst_strd - 2wd
+    lsl         x12,x2,#2                   //4*src_strd
+    sub         x9,x12,x10                  //4*src_strd - 2wd
+
+    bic         x5,x10,#7                   //x5 ->wd
+    lsr         x14, x10, #3                //divide by 8
+    mul         x12, x4 , x14               //ht * number of 8-sample blocks per row
+    sub         x12, x12,#4                 //subtract 4 for the epilog
+
+prolog:
+    add         x6,x0,x2                    //pu1_src + src_strd
+    ld1         {v5.8b},[x6],x2             //loads pu1_src
+    subs        x5,x5,#8                    //2wd - 8
+    ld1         {v4.8b},[x0],#8             //loads the source
+    ld1         {v6.8b},[x6],x2             //load and increment
+    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
+    ld1         {v7.8b},[x6],x2             //load and increment
+    umlsl       v30.8h, v4.8b, v0.8b
+    add         x7,x1,x3                    //pu1_dst
+    umlal       v30.8h, v6.8b, v2.8b
+    umlsl       v30.8h, v7.8b, v3.8b
+    ld1         {v8.8b},[x6],x2             //load and increment
+
+    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
+    add         x20,x0,x9                   //pu1_src += 4*src_strd - 2*wd
+    csel        x0, x20, x0,le
+    umlsl       v28.8h, v5.8b, v0.8b
+    bic         x20,x10,#7                  //x5 ->wd
+    csel        x5, x20, x5,le
+    umlal       v28.8h, v7.8b, v2.8b
+    ld1         {v9.8b},[x6],x2
+    umlsl       v28.8h, v8.8b, v3.8b
+    sqrshrun    v30.8b, v30.8h,#6
+
+    ld1         {v10.8b},[x6],x2
+    umull       v26.8h, v7.8b, v1.8b
+    add         x6,x0,x2                    //pu1_src + src_strd
+    umlsl       v26.8h, v6.8b, v0.8b
+    st1         {v30.8b},[x1],#8            //stores the loaded value
+    umlal       v26.8h, v8.8b, v2.8b
+    ld1         {v4.8b},[x0],#8             //loads the source
+    umlsl       v26.8h, v9.8b, v3.8b
+    sqrshrun    v28.8b, v28.8h,#6
+
+    add         x20,x1,x8                   //pu1_dst += 4*dst_strd - 2*wd
+    csel        x1, x20, x1,le
+    umull       v24.8h, v8.8b, v1.8b
+    ld1         {v5.8b},[x6],x2             //loads pu1_src
+    umlsl       v24.8h, v7.8b, v0.8b
+    subs        x12,x12,#4
+    ld1         {v6.8b},[x6],x2             //load and increment
+    umlal       v24.8h, v9.8b, v2.8b
+    ld1         {v7.8b},[x6],x2             //load and increment
+    umlsl       v24.8h, v10.8b, v3.8b
+
+    lsl         x11,x2,#2
+    st1         {v28.8b},[x7],x3            //stores the loaded value
+    sqrshrun    v26.8b, v26.8h,#6
+    sub         x20,x2,x2,lsl #3
+    neg         x11, x20
+    add         x14,x2,x2,lsl #1
+    add         x14,x14,x11
+    ble         epilog                      //jumps to epilog
+
+kernel_8:
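+    //software pipelined body: results computed in the previous pass are
+    //stored while the loads and accumulations for the next four output
+    //rows are issued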
+
+    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
+    subs        x5,x5,#8                    //2wd - 8
+    umlsl       v30.8h, v4.8b, v0.8b
+    add         x20,x0,x9                   //pu1_src += 4*src_strd - 2*wd
+    csel        x0, x20, x0,le
+    umlal       v30.8h, v6.8b, v2.8b
+    lsl         x20,x2,#3
+    sub         x20,x20,x2
+    csel        x11,x20,x11,le
+    //rsble        x11,x2,x2,lsl #3
+    umlsl       v30.8h, v7.8b, v3.8b
+    st1         {v26.8b},[x7],x3            //stores the loaded value
+    sqrshrun    v24.8b, v24.8h,#6
+
+    ld1         {v8.8b},[x6],x2             //load and increment
+
+    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
+    bic         x20,x10,#7                  //x5 ->wd
+    csel        x5, x20, x5,le
+    umlsl       v28.8h, v5.8b, v0.8b
+    st1         {v24.8b},[x7],x3            //stores the loaded value
+
+    umlal       v28.8h, v7.8b, v2.8b
+
+    ld1         {v9.8b},[x6],x2
+    sqrshrun    v30.8b, v30.8h,#6
+
+    umlsl       v28.8h, v8.8b, v3.8b
+    ld1         {v10.8b},[x6],x2
+    add         x7,x1,x3                    //pu1_dst
+    umull       v26.8h, v7.8b, v1.8b
+    add         x6,x0,x2                    //pu1_src + src_strd
+
+    add         x20,x0, x11
+    prfm        PLDL1KEEP,[x20]
+
+
+    umlsl       v26.8h, v6.8b, v0.8b
+    ld1         {v4.8b},[x0],#8             //loads the source
+
+    umlal       v26.8h, v8.8b, v2.8b
+    st1         {v30.8b},[x1],#8            //stores the loaded value
+
+    umlsl       v26.8h, v9.8b, v3.8b
+    ld1         {v5.8b},[x6],x2             //loads pu1_src
+
+    add         x11,x11,x2
+    sqrshrun    v28.8b, v28.8h,#6
+
+    umull       v24.8h, v8.8b, v1.8b
+    ld1         {v6.8b},[x6],x2             //load and increment
+    add         x20,x1,x8                   //pu1_dst += 4*dst_strd - 2*wd
+    csel        x1, x20, x1,le
+
+    cmp         x11,x14
+    lsl         x20,x2,#3
+    sub         x20,x20,x2
+    csel        x11,x20,x11,gt
+    //rsbgt        x11,x2,x2,lsl #3
+
+    umlsl       v24.8h, v7.8b, v0.8b
+    subs        x12,x12,#4
+
+    umlal       v24.8h, v9.8b, v2.8b
+    ld1         {v7.8b},[x6],x2             //load and increment
+
+    umlsl       v24.8h, v10.8b, v3.8b
+    st1         {v28.8b},[x7],x3            //stores the loaded value
+    sqrshrun    v26.8b, v26.8h,#6
+
+    bgt         kernel_8                    //jumps to kernel_8
+
+epilog:
+
+    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
+    umlsl       v30.8h, v4.8b, v0.8b
+    umlal       v30.8h, v6.8b, v2.8b
+    umlsl       v30.8h, v7.8b, v3.8b
+    st1         {v26.8b},[x7],x3            //stores the loaded value
+    sqrshrun    v24.8b, v24.8h,#6
+
+    ld1         {v8.8b},[x6],x2             //load and increment
+    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
+    umlsl       v28.8h, v5.8b, v0.8b
+    umlal       v28.8h, v7.8b, v2.8b
+    umlsl       v28.8h, v8.8b, v3.8b
+    st1         {v24.8b},[x7],x3            //stores the loaded value
+    sqrshrun    v30.8b, v30.8h,#6
+
+    ld1         {v9.8b},[x6],x2
+    umull       v26.8h, v7.8b, v1.8b
+    add         x7,x1,x3                    //pu1_dst
+    umlsl       v26.8h, v6.8b, v0.8b
+    st1         {v30.8b},[x1],#8            //stores the loaded value
+
+    sqrshrun    v28.8b, v28.8h,#6
+    umlal       v26.8h, v8.8b, v2.8b
+    ld1         {v10.8b},[x6],x2
+    umlsl       v26.8h, v9.8b, v3.8b
+
+    umull       v24.8h, v8.8b, v1.8b
+    sqrshrun    v26.8b, v26.8h,#6
+    st1         {v28.8b},[x7],x3            //stores the loaded value
+    umlsl       v24.8h, v7.8b, v0.8b
+    umlal       v24.8h, v9.8b, v2.8b
+    st1         {v26.8b},[x7],x3            //stores the loaded value
+    umlsl       v24.8h, v10.8b, v3.8b
+
+    sqrshrun    v24.8b, v24.8h,#6
+    st1         {v24.8b},[x7],x3            //stores the loaded value
+end_loops:
+    // ldmfd sp!,{x4-x12,x15}        //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
new file mode 100644
index 0000000..55e7f54
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
@@ -0,0 +1,356 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//*  ihevc_inter_pred_chroma_vert_w16inp.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//*  functions are coded in neon assembly and can be compiled using rvct
+//*
+//* //author
+//*  yogeswaran rs / parthiban
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*       chroma interprediction filter for 16bit vertical input.
+//*
+//* //par description:
+//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//*    the 16-bit elements pointed to by 'pi2_src' and writes to the location
+//*    pointed to by 'pu1_dst'. the filter output is downshifted by 12 and
+//*    clipped to lie between 0 and 255. assumptions: width and height are
+//*    multiples of 2.
+//*
+//* //param[in] pi2_src
+//*  word16 pointer to the source
+//*
+//* //param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
+//                                          uword8 *pu1_dst,
+//                                          word32 src_strd,
+//                                          word32 dst_strd,
+//                                          word8 *pi1_coeff,
+//                                          word32 ht,
+//                                          word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pi2_src
+//x1 => *pu1_dst
+//x2 =>  src_strd
+//x3 =>  dst_strd
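+//
+//a scalar c sketch of the operation implemented below (an illustration
+//only, not part of the build): a 4-tap vertical filter on 16-bit
+//intermediate samples, narrowed by 6 and then rounded and downshifted by
+//a further 6 (12 in total) before clamping to 8 bits. clip_u8 is a
+//hypothetical helper that clamps to [0, 255].
+//
+//    word32 row, col, i, sum;
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < 2 * wd; col++)      //interleaved cb/cr
+//        {
+//            sum = 0;
+//            for(i = 0; i < 4; i++)             //src starts one row above
+//                sum += pi1_coeff[i] * pi2_src[(row + i - 1) * src_strd + col];
+//            pu1_dst[row * dst_strd + col] = clip_u8(((sum >> 6) + 32) >> 6);
+//        }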
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_vert_w16inp_av8
+
+.type ihevc_inter_pred_chroma_vert_w16inp_av8, %function
+
+ihevc_inter_pred_chroma_vert_w16inp_av8:
+
+    // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+    mov         x4, x15                     //loads pi1_coeff
+    mov         x6, x17                     //wd
+    lsl         x2,x2,#1                    //src_strd = 2* src_strd
+    mov         x5,x16                      //loads ht
+    ld1         {v0.8b},[x4]                //loads pi1_coeff
+    sub         x4,x0,x2                    //pu1_src - src_strd
+    sxtl        v0.8h, v0.8b                //sign extend the coefficients to 16 bits
+
+    tst         x6,#3                       //checks (wd & 3)
+    dup         v12.4h, v0.4h[0]            //coeff_0
+    dup         v13.4h, v0.4h[1]            //coeff_1
+    dup         v14.4h, v0.4h[2]            //coeff_2
+    dup         v15.4h, v0.4h[3]            //coeff_3
+
+    bgt         core_loop_ht_2              //jumps to the loop handling wd == 2
+
+    tst         x5,#3                       //checks (ht & 3)
+    beq         core_loop_ht_4              //jumps to the loop handling ht a multiple of 4
+
+core_loop_ht_2:
+    lsl         x7,x2,#1                    //2*src_strd
+    lsl         x12,x3,#1                   //2*dst_strd
+    lsl         x9,x6,#2                    //4*wd
+    sub         x6,x12,x6,lsl #1            //2*dst_strd - 2*wd
+    sub         x8,x7,x9                    //2*src_strd - 4*wd
+    mov         x12,x9                      //4wd
+
+inner_loop_ht_2:
+    add         x0,x4,x2                    //increments pi2_src
+    ld1         {v0.4h},[x4],#8             //loads pu1_src
+    smull       v0.4s, v0.4h, v12.4h        //vmull_s16(src_tmp1, coeff_0)
+    subs        x12,x12,#8                  //4wd - 8
+    ld1         {v2.4h},[x0],x2             //loads pi2_src
+    smull       v8.4s, v2.4h, v12.4h        //vmull_s16(src_tmp2, coeff_0)
+    ld1         {v3.4h},[x0],x2             //loads pi2_src
+    smlal       v0.4s, v2.4h, v13.4h
+    ld1         {v6.4h},[x0],x2
+    smlal       v8.4s, v3.4h, v13.4h
+    ld1         {v2.4h},[x0]
+    add         x7,x1,x3                    //pu1_dst + dst_strd
+    smlal       v0.4s, v3.4h, v14.4h
+    smlal       v8.4s, v6.4h, v14.4h
+    smlal       v0.4s, v6.4h, v15.4h
+    smlal       v8.4s, v2.4h, v15.4h
+    sqshrn      v0.4h, v0.4s,#6             //right shift
+    sqshrn      v30.4h, v8.4s,#6            //right shift
+    sqrshrun    v0.8b, v0.8h,#6             //rounding shift
+    sqrshrun    v30.8b, v30.8h,#6           //rounding shift
+    st1         {v0.s}[0],[x1],#4           //stores the loaded value
+    st1         {v30.s}[0],[x7]             //stores the loaded value
+    bgt         inner_loop_ht_2             //inner loop -again
+
+    //inner loop ends
+    subs        x5,x5,#2                    //decrements ht by 2
+    add         x1,x1,x6                    //pu1_dst += 2*dst_strd - 2*wd
+    mov         x12,x9                      //4wd
+    add         x4,x4,x8                    //pi1_src_tmp1 += 2*src_strd - 4*wd
+    bgt         inner_loop_ht_2             //loop again
+
+    b           end_loops                   //jumps to end
+
+core_loop_ht_4:
+    lsl         x7,x2,#2                    //4*src_strd
+    lsl         x12,x3,#2                   //4*dst_strd
+    lsr         x11, x6, #1                 //divide by 2
+    sub         x14,x12,x6,lsl #1           //4*dst_strd - 2*wd
+    sub         x8,x7,x6,lsl #2             //4*src_strd - 4*wd
+
+    mul         x12, x5 , x11               //ht * number of 4-sample blocks per row
+    sub         x12, x12,#4                 //subtract 4 for the epilog
+    lsl         x11, x6, #1                 //2*wd
+
+prolog:
+    add         x0,x4,x2                    //increments pi2_src
+    ld1         {v0.4h},[x4],#8             //loads pu1_src
+    ld1         {v1.4h},[x0],x2             //loads pi2_src
+    subs        x11,x11,#4
+    ld1         {v2.4h},[x0],x2             //loads pi2_src
+    smull       v30.4s, v0.4h, v12.4h       //vmull_s16(src_tmp1, coeff_0)
+    ld1         {v3.4h},[x0],x2
+    smlal       v30.4s, v1.4h, v13.4h
+    smlal       v30.4s, v2.4h, v14.4h
+    add         x9,x1,x3                    //pu1_dst + dst_strd
+    smlal       v30.4s, v3.4h, v15.4h
+
+    ld1         {v4.4h},[x0],x2
+    smull       v28.4s, v1.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    add         x20,x4,x8
+    csel        x4, x20, x4,le
+    smlal       v28.4s, v2.4h, v13.4h
+    ld1         {v5.4h},[x0],x2
+    smlal       v28.4s, v3.4h, v14.4h
+    ld1         {v6.4h},[x0],x2
+    smlal       v28.4s, v4.4h, v15.4h
+    lsl         x20,x6,#1
+    csel        x11, x20, x11,le
+
+    sqshrn      v30.4h, v30.4s,#6           //right shift
+
+    smull       v26.4s, v2.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    add         x0,x4,x2
+    smlal       v26.4s, v3.4h, v13.4h
+    smlal       v26.4s, v4.4h, v14.4h
+    ld1         {v0.4h},[x4],#8             //loads pu1_src
+    smlal       v26.4s, v5.4h, v15.4h
+
+    sqrshrun    v30.8b, v30.8h,#6           //rounding shift
+    sqshrn      v28.4h, v28.4s,#6           //right shift
+
+    ld1         {v1.4h},[x0],x2             //loads pi2_src
+    smull       v24.4s, v3.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    st1         {v30.s}[0],[x1],#4          //stores the loaded value
+    smlal       v24.4s, v4.4h, v13.4h
+    ld1         {v2.4h},[x0],x2             //loads pi2_src
+    smlal       v24.4s, v5.4h, v14.4h
+    ld1         {v3.4h},[x0],x2
+    smlal       v24.4s, v6.4h, v15.4h
+    add         x20,x1,x14
+    csel        x1, x20, x1,le
+
+    sqshrn      v26.4h, v26.4s,#6           //right shift
+    subs        x12,x12,#4
+    sqrshrun    v28.8b, v28.8h,#6           //rounding shift
+
+    beq         epilog                      //jumps to epilog
+
+kernel_4:
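+    //software pipelined body: stores the rows computed in the previous
+    //pass while accumulating the next four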
+    smull       v30.4s, v0.4h, v12.4h       //vmull_s16(src_tmp1, coeff_0)
+    subs        x11,x11,#4
+    smlal       v30.4s, v1.4h, v13.4h
+    st1         {v28.s}[0],[x9],x3          //stores the loaded value
+    smlal       v30.4s, v2.4h, v14.4h
+    smlal       v30.4s, v3.4h, v15.4h
+
+    sqshrn      v24.4h, v24.4s,#6           //right shift
+    sqrshrun    v26.8b, v26.8h,#6           //rounding shift
+
+    ld1         {v4.4h},[x0],x2
+    smull       v28.4s, v1.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    smlal       v28.4s, v2.4h, v13.4h
+    smlal       v28.4s, v3.4h, v14.4h
+    smlal       v28.4s, v4.4h, v15.4h
+    st1         {v26.s}[0],[x9],x3          //stores the loaded value
+    add         x20,x4,x8
+    csel        x4, x20, x4,le
+    lsl         x20,x6,#1
+    csel        x11, x20, x11,le
+
+    sqshrn      v30.4h, v30.4s,#6           //right shift
+    sqrshrun    v24.8b, v24.8h,#6           //rounding shift
+
+    ld1         {v5.4h},[x0],x2
+    smull       v26.4s, v2.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    ld1         {v6.4h},[x0],x2
+    smlal       v26.4s, v3.4h, v13.4h
+    st1         {v24.s}[0],[x9]             //stores the loaded value
+    add         x0,x4,x2
+    smlal       v26.4s, v4.4h, v14.4h
+    ld1         {v0.4h},[x4],#8             //loads pu1_src
+    smlal       v26.4s, v5.4h, v15.4h
+
+    sqshrn      v28.4h, v28.4s,#6           //right shift
+    sqrshrun    v30.8b, v30.8h,#6           //rounding shift
+
+    ld1         {v1.4h},[x0],x2             //loads pi2_src
+    smull       v24.4s, v3.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    add         x9,x1,x3                    //pu1_dst + dst_strd
+    ld1         {v2.4h},[x0],x2             //loads pi2_src
+    smlal       v24.4s, v4.4h, v13.4h
+    ld1         {v3.4h},[x0],x2
+    smlal       v24.4s, v5.4h, v14.4h
+
+    st1         {v30.s}[0],[x1],#4          //stores the loaded value
+    smlal       v24.4s, v6.4h, v15.4h
+
+    sqshrn      v26.4h, v26.4s,#6           //right shift
+    sqrshrun    v28.8b, v28.8h,#6           //rounding shift
+    add         x20,x1,x14
+    csel        x1, x20, x1,le
+
+    subs        x12,x12,#4
+
+    bgt         kernel_4                    //jumps to kernel_4
+
+epilog:
+    smull       v30.4s, v0.4h, v12.4h       //vmull_s16(src_tmp1, coeff_0)
+    st1         {v28.s}[0],[x9],x3          //stores the loaded value
+    smlal       v30.4s, v1.4h, v13.4h
+    smlal       v30.4s, v2.4h, v14.4h
+    smlal       v30.4s, v3.4h, v15.4h
+
+    sqshrn      v24.4h, v24.4s,#6           //right shift
+    sqrshrun    v26.8b, v26.8h,#6           //rounding shift
+
+    smull       v28.4s, v1.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    ld1         {v4.4h},[x0],x2
+    smlal       v28.4s, v2.4h, v13.4h
+    st1         {v26.s}[0],[x9],x3          //stores the loaded value
+    smlal       v28.4s, v3.4h, v14.4h
+    smlal       v28.4s, v4.4h, v15.4h
+
+    sqshrn      v30.4h, v30.4s,#6           //right shift
+    sqrshrun    v24.8b, v24.8h,#6           //rounding shift
+
+    smull       v26.4s, v2.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    ld1         {v5.4h},[x0],x2
+    smlal       v26.4s, v3.4h, v13.4h
+    smlal       v26.4s, v4.4h, v14.4h
+    smlal       v26.4s, v5.4h, v15.4h
+
+    sqshrn      v28.4h, v28.4s,#6           //right shift
+    sqrshrun    v30.8b, v30.8h,#6           //rounding shift
+
+    st1         {v24.s}[0],[x9]             //stores the loaded value
+    smull       v24.4s, v3.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    smlal       v24.4s, v4.4h, v13.4h
+    add         x9,x1,x3                    //pu1_dst + dst_strd
+    ld1         {v6.4h},[x0],x2
+    smlal       v24.4s, v5.4h, v14.4h
+    smlal       v24.4s, v6.4h, v15.4h
+    st1         {v30.s}[0],[x1],#4          //stores the loaded value
+
+    sqrshrun    v28.8b, v28.8h,#6           //rounding shift
+    sqshrn      v26.4h, v26.4s,#6           //right shift
+
+    st1         {v28.s}[0],[x9],x3          //stores the loaded value
+    sqrshrun    v26.8b, v26.8h,#6           //rounding shift
+
+    sqshrn      v24.4h, v24.4s,#6           //right shift
+    st1         {v26.s}[0],[x9],x3          //stores the loaded value
+    sqrshrun    v24.8b, v24.8h,#6           //rounding shift
+
+    st1         {v24.s}[0],[x9]             //stores the loaded value
+
+end_loops:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
new file mode 100644
index 0000000..b6d0eb2
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
@@ -0,0 +1,343 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//*  ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//*  functions are coded in neon assembly and can be compiled using rvct
+//*
+//* //author
+//*  yogeswaran rs / parthiban
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*    chroma interprediction filter for 16bit vertical input and output.
+//*
+//* //par description:
+//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//*    the 16-bit elements pointed to by 'pi2_src' and writes to the location
+//*    pointed to by 'pi2_dst'. the filter output is downshifted by 6 and 8192
+//*    is subtracted to store it as a 16-bit number; the output is used as an
+//*    input to weighted prediction. assumptions: width and height are
+//*    multiples of 2.
+//*
+//* //param[in] pi2_src
+//*  word16 pointer to the source
+//*
+//* //param[out] pi2_dst
+//*  word16 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src,
+//                                                 word16 *pi2_dst,
+//                                                 word32 src_strd,
+//                                                 word32 dst_strd,
+//                                                 word8 *pi1_coeff,
+//                                                 word32 ht,
+//                                                 word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pi2_src
+//x1 => *pi2_dst
+//x2 =>  src_strd
+//x3 =>  dst_strd
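+//
+//a scalar c sketch of the narrowing done below (an illustration only,
+//not part of the build): the 32-bit filter sum is shifted right by 6
+//with saturation and stored as a 16-bit value; the offset handling
+//mentioned above is outside this sketch.
+//
+//    word32 row, col, i, sum;
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < 2 * wd; col++)      //interleaved cb/cr
+//        {
+//            sum = 0;
+//            for(i = 0; i < 4; i++)             //src starts one row above
+//                sum += pi1_coeff[i] * pi2_src[(row + i - 1) * src_strd + col];
+//            pi2_dst[row * dst_strd + col] = (word16)(sum >> 6);
+//        }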
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_vert_w16inp_w16out_av8
+
+.type ihevc_inter_pred_chroma_vert_w16inp_w16out_av8, %function
+
+ihevc_inter_pred_chroma_vert_w16inp_w16out_av8:
+
+    // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+    mov         x4, x15                     //loads pi1_coeff
+    mov         x6, x17                     //wd
+    lsl         x2,x2,#1                    //src_strd = 2* src_strd
+    mov         x5,x16                      //loads ht
+    ld1         {v0.8b},[x4]                //loads pi1_coeff
+    sub         x4,x0,x2                    //pu1_src - src_strd
+    sxtl        v0.8h, v0.8b                //sign extend the coefficients to 16 bits
+
+    tst         x6,#3                       //checks (wd & 3)
+    dup         v12.4h, v0.4h[0]            //coeff_0
+    dup         v13.4h, v0.4h[1]            //coeff_1
+    dup         v14.4h, v0.4h[2]            //coeff_2
+    dup         v15.4h, v0.4h[3]            //coeff_3
+
+    bgt         core_loop_ht_2              //jumps to the loop handling wd == 2
+
+    tst         x5,#3                       //checks (ht & 3)
+    beq         core_loop_ht_4              //jumps to the loop handling ht a multiple of 4
+
+core_loop_ht_2:
+    lsl         x7,x2,#1                    //2*src_strd
+    lsl         x3,x3,#1                    //2*dst_strd
+    lsl         x9,x6,#2                    //4*wd
+    sub         x6,x3,x6,lsl #1             //2*dst_strd - 2*wd
+    sub         x8,x7,x9                    //2*src_strd - 4*wd
+    mov         x12,x9                      //4wd
+
+inner_loop_ht_2:
+    add         x0,x4,x2                    //increments pi2_src
+    ld1         {v0.4h},[x4],#8             //loads pu1_src
+    smull       v0.4s, v0.4h, v12.4h        //vmull_s16(src_tmp1, coeff_0)
+    subs        x12,x12,#8                  //4wd - 8
+    ld1         {v2.4h},[x0],x2             //loads pi2_src
+    smull       v8.4s, v2.4h, v12.4h        //vmull_s16(src_tmp2, coeff_0)
+    ld1         {v3.4h},[x0],x2             //loads pi2_src
+    smlal       v0.4s, v2.4h, v13.4h
+    ld1         {v6.4h},[x0],x2
+    smlal       v8.4s, v3.4h, v13.4h
+    ld1         {v2.4h},[x0]
+    add         x7,x1,x3                    //pu1_dst + dst_strd
+    smlal       v0.4s, v3.4h, v14.4h
+    smlal       v8.4s, v6.4h, v14.4h
+    smlal       v0.4s, v6.4h, v15.4h
+    smlal       v8.4s, v2.4h, v15.4h
+    sqshrn      v0.4h, v0.4s,#6             //right shift
+    sqshrn      v30.4h, v8.4s,#6            //right shift
+    st1         {v0.2s},[x1],#8             //stores the loaded value
+    st1         {v30.2s},[x7]               //stores the loaded value
+    bgt         inner_loop_ht_2             //inner loop -again
+
+    //inner loop ends
+    subs        x5,x5,#2                    //decrements ht by 2
+    add         x1,x1,x6,lsl #1             //pu1_dst += 2*dst_strd - 2*wd
+    mov         x12,x9                      //4wd
+    add         x4,x4,x8                    //pi1_src_tmp1 += 2*src_strd - 4*wd
+    bgt         inner_loop_ht_2             //loop again
+
+    b           end_loops                   //jumps to end
+
+core_loop_ht_4:
+    lsl         x7,x2,#2                    //4*src_strd
+    lsl         x10,x3,#2                   //4*dst_strd
+    lsr         x11, x6, #1                 //divide by 2
+    sub         x14,x10,x6,lsl #1           //4*dst_strd - 2*wd
+    sub         x8,x7,x6,lsl #2             //4*src_strd - 4*wd
+
+    mul         x12, x5 , x11               //ht * number of 4-sample blocks per row
+    sub         x12, x12,#4                 //subtract 4 for the epilog
+    lsl         x11, x6, #1                 //2*wd
+    lsl         x3,x3,#1                    //2*dst_strd
+
+prolog:
+    add         x0,x4,x2                    //increments pi2_src
+    ld1         {v0.4h},[x4],#8             //loads pu1_src
+    ld1         {v1.4h},[x0],x2             //loads pi2_src
+    subs        x11,x11,#4
+    ld1         {v2.4h},[x0],x2             //loads pi2_src
+    smull       v30.4s, v0.4h, v12.4h       //vmull_s16(src_tmp1, coeff_0)
+    ld1         {v3.4h},[x0],x2
+    smlal       v30.4s, v1.4h, v13.4h
+    smlal       v30.4s, v2.4h, v14.4h
+    add         x9,x1,x3                    //pu1_dst + dst_strd
+    smlal       v30.4s, v3.4h, v15.4h
+
+    ld1         {v4.4h},[x0],x2
+    smull       v28.4s, v1.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    add         x20,x4,x8
+    csel        x4, x20, x4,le
+    lsl         x20,x6,#1
+    csel        x11, x20, x11,le
+    smlal       v28.4s, v2.4h, v13.4h
+    smlal       v28.4s, v3.4h, v14.4h
+    ld1         {v5.4h},[x0],x2
+    smlal       v28.4s, v4.4h, v15.4h
+
+    sqshrn      v30.4h, v30.4s,#6           //right shift
+
+    ld1         {v6.4h},[x0],x2
+    smull       v26.4s, v2.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    smlal       v26.4s, v3.4h, v13.4h
+    smlal       v26.4s, v4.4h, v14.4h
+    add         x0,x4,x2
+    ld1         {v0.4h},[x4],#8             //loads pu1_src
+    smlal       v26.4s, v5.4h, v15.4h
+
+    sqshrn      v28.4h, v28.4s,#6           //right shift
+
+    ld1         {v1.4h},[x0],x2             //loads pi2_src
+    smull       v24.4s, v3.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    st1         {v30.2s},[x1],#8            //stores the loaded value
+    smlal       v24.4s, v4.4h, v13.4h
+    ld1         {v2.4h},[x0],x2             //loads pi2_src
+    smlal       v24.4s, v5.4h, v14.4h
+    ld1         {v3.4h},[x0],x2
+    smlal       v24.4s, v6.4h, v15.4h
+    add         x20,x1,x14,lsl #1
+    csel        x1, x20, x1,le
+
+    sqshrn      v26.4h, v26.4s,#6           //right shift
+    subs        x12,x12,#4
+
+    beq         epilog                      //jumps to epilog
+
+kernel_4:
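+    //software pipelined body: stores the rows computed in the previous
+    //pass while accumulating the next four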
+    smull       v30.4s, v0.4h, v12.4h       //vmull_s16(src_tmp1, coeff_0)
+    subs        x11,x11,#4
+    smlal       v30.4s, v1.4h, v13.4h
+    st1         {v28.2s},[x9],x3            //stores the loaded value
+    smlal       v30.4s, v2.4h, v14.4h
+    smlal       v30.4s, v3.4h, v15.4h
+
+    sqshrn      v24.4h, v24.4s,#6           //right shift
+
+    ld1         {v4.4h},[x0],x2
+    smull       v28.4s, v1.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    smlal       v28.4s, v2.4h, v13.4h
+    smlal       v28.4s, v3.4h, v14.4h
+    smlal       v28.4s, v4.4h, v15.4h
+    st1         {v26.2s},[x9],x3            //stores the loaded value
+    add         x20,x4,x8
+    csel        x4, x20, x4,le
+    lsl         x20,x6,#1
+    csel        x11, x20, x11,le
+
+    sqshrn      v30.4h, v30.4s,#6           //right shift
+
+    ld1         {v5.4h},[x0],x2
+    smull       v26.4s, v2.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    ld1         {v6.4h},[x0],x2
+    smlal       v26.4s, v3.4h, v13.4h
+    st1         {v24.2s},[x9]               //stores the loaded value
+    add         x0,x4,x2
+    smlal       v26.4s, v4.4h, v14.4h
+    ld1         {v0.4h},[x4],#8             //loads pu1_src
+    smlal       v26.4s, v5.4h, v15.4h
+
+    sqshrn      v28.4h, v28.4s,#6           //right shift
+
+    ld1         {v1.4h},[x0],x2             //loads pi2_src
+    smull       v24.4s, v3.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    ld1         {v2.4h},[x0],x2             //loads pi2_src
+    smlal       v24.4s, v4.4h, v13.4h
+    add         x9,x1,x3                    //pu1_dst + dst_strd
+    ld1         {v3.4h},[x0],x2
+    smlal       v24.4s, v5.4h, v14.4h
+
+    st1         {v30.2s},[x1],#8            //stores the loaded value
+    smlal       v24.4s, v6.4h, v15.4h
+
+    sqshrn      v26.4h, v26.4s,#6           //right shift
+    add         x20,x1,x14,lsl #1
+    csel        x1, x20, x1,le
+
+    subs        x12,x12,#4
+
+    bgt         kernel_4                    //jumps to kernel_4
+
+epilog:
+    smull       v30.4s, v0.4h, v12.4h       //vmull_s16(src_tmp1, coeff_0)
+    st1         {v28.2s},[x9],x3            //stores the loaded value
+    smlal       v30.4s, v1.4h, v13.4h
+    smlal       v30.4s, v2.4h, v14.4h
+    smlal       v30.4s, v3.4h, v15.4h
+
+    sqshrn      v24.4h, v24.4s,#6           //right shift
+
+    smull       v28.4s, v1.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    ld1         {v4.4h},[x0],x2
+    smlal       v28.4s, v2.4h, v13.4h
+    st1         {v26.2s},[x9],x3            //stores the loaded value
+    smlal       v28.4s, v3.4h, v14.4h
+    smlal       v28.4s, v4.4h, v15.4h
+
+    sqshrn      v30.4h, v30.4s,#6           //right shift
+
+    smull       v26.4s, v2.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    ld1         {v5.4h},[x0],x2
+    smlal       v26.4s, v3.4h, v13.4h
+    smlal       v26.4s, v4.4h, v14.4h
+    smlal       v26.4s, v5.4h, v15.4h
+
+    sqshrn      v28.4h, v28.4s,#6           //right shift
+
+    st1         {v24.2s},[x9]               //stores the loaded value
+    smull       v24.4s, v3.4h, v12.4h       //vmull_s16(src_tmp2, coeff_0)
+    smlal       v24.4s, v4.4h, v13.4h
+    add         x9,x1,x3                    //pu1_dst + dst_strd
+    ld1         {v6.4h},[x0],x2
+    smlal       v24.4s, v5.4h, v14.4h
+    smlal       v24.4s, v6.4h, v15.4h
+    st1         {v30.2s},[x1],#8            //stores the loaded value
+
+    sqshrn      v26.4h, v26.4s,#6           //right shift
+
+    st1         {v28.2s},[x9],x3            //stores the loaded value
+
+    sqshrn      v24.4h, v24.4s,#6           //right shift
+    st1         {v26.2s},[x9],x3            //stores the loaded value
+
+    st1         {v24.2s},[x9]               //stores the loaded value
+
+end_loops:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
new file mode 100644
index 0000000..9f5687f
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
@@ -0,0 +1,392 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//*  ihevc_inter_pred_chroma_vert_w16out.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//*  functions are coded in neon assembly and can be compiled using rvct
+//*
+//* //author
+//*  yogeswaran rs / parthiban
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*   interprediction chroma filter to store vertical 16bit output
+//*
+//* //par description:
+//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//*    the elements pointed to by 'pu1_src' and writes to the location pointed
+//*    to by 'pi2_dst'. no downshifting or clipping is done and the output is
+//*    used as an input for weighted prediction. assumptions: width is a
+//*    multiple of 2, 4 or 8 and height is a multiple of 2; widths of 4 and 8
+//*    are optimized further
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[out] pi2_dst
+//*  word16 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*****************************************************************************
+//*/
+//void ihevc_inter_pred_chroma_vert_w16out(uword8 *pu1_src,
+//                                            word16 *pi2_dst,
+//                                            word32 src_strd,
+//                                            word32 dst_strd,
+//                                            word8 *pi1_coeff,
+//                                            word32 ht,
+//                                            word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pu1_src
+//x1 => *pi2_dst
+//x2 =>  src_strd
+//x3 =>  dst_strd
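+//
+//a scalar c sketch of the operation implemented below (an illustration
+//only, not part of the build): the 4-tap vertical filter runs on 8-bit
+//samples and the raw 16-bit sums are stored without any shift or
+//clipping, for later use in weighted prediction.
+//
+//    word32 row, col, i, sum;
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < 2 * wd; col++)      //interleaved cb/cr
+//        {
+//            sum = 0;
+//            for(i = 0; i < 4; i++)             //src starts one row above
+//                sum += pi1_coeff[i] * pu1_src[(row + i - 1) * src_strd + col];
+//            pi2_dst[row * dst_strd + col] = (word16)sum;
+//        }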
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_vert_w16out_av8
+
+.type ihevc_inter_pred_chroma_vert_w16out_av8, %function
+
+ihevc_inter_pred_chroma_vert_w16out_av8:
+
+    // stmfd sp!,{x4-x12,x14}        //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+
+    mov         x4,x16                      //loads ht
+    mov         x12,x15                     //loads pi1_coeff
+    cmp         x4,#0                       //checks ht == 0
+    mov         x6,x17                      //loads wd
+    sub         x0,x0,x2                    //pu1_src - src_strd
+    ld1         {v0.8b},[x12]               //loads pi1_coeff
+
+    ble         end_loops                   //jumps to end
+
+    tst         x6,#3                       //checks (wd & 3)
+    abs         v3.8b, v0.8b                //vabs_s8(coeff)
+    lsl         x10,x6,#1                   //2*wd
+    dup         v0.8b, v3.8b[0]             //coeffabs_0
+    dup         v1.8b, v3.8b[1]             //coeffabs_1
+    dup         v2.8b, v3.8b[2]             //coeffabs_2
+    dup         v3.8b, v3.8b[3]             //coeffabs_3
+
+    bgt         outer_loop_wd_2             //jumps to loop handling wd ==2
+
+    tst         x4,#7                       //checks ht for mul of 8
+    beq         core_loop_ht_8              //when height is multiple of 8
+
+    lsl         x7,x3,#2                    //2*dst_strd
+    sub         x9,x7,x10,lsl #1            //4*dst_strd - 4wd
+    lsl         x12,x2,#1                   //2*src_strd
+    sub         x8,x12,x10                  //2*src_strd - 2wd
+    lsl         x3, x3, #1
+    mov         x5,x10                      //2wd
+
+inner_loop_ht_2:                            //called when wd is multiple of 4 and ht is 4,2
+
+    add         x6,x0,x2                    //pu1_src +src_strd
+    ld1         {v9.8b},[x6],x2             //loads pu1_src
+    subs        x5,x5,#8                    //2wd - 8
+    ld1         {v5.8b},[x0],#8             //loads src
+    umull       v6.8h, v9.8b, v1.8b         //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+    ld1         {v4.8b},[x6],x2             //loads incremented src
+    umlsl       v6.8h, v5.8b, v0.8b         //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
+    ld1         {v8.8b},[x6],x2             //loads incremented src
+    umlal       v6.8h, v4.8b, v2.8b         //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
+    umull       v4.8h, v4.8b, v1.8b
+    ld1         {v10.8b},[x6]               //loads the incremented src
+    umlsl       v6.8h, v8.8b, v3.8b
+    umlsl       v4.8h, v9.8b, v0.8b
+    umlal       v4.8h, v8.8b, v2.8b
+    umlsl       v4.8h, v10.8b, v3.8b
+    add         x6,x1,x3                    //pu1_dst + dst_strd
+    st1         { v6.8h},[x1],#16           //store the row 1 result
+
+    st1         { v4.8h},[x6]               //store the row 2 result
+
+    bgt         inner_loop_ht_2             //inner loop again
+
+    subs        x4,x4,#2                    //ht - 2
+    add         x1,x1,x9                    //pu1_dst += (2*dst_strd - 2wd)
+    mov         x5,x10                      //2wd
+    add         x0,x0,x8                    //pu1_src += (2*src_strd - 2wd)
+
+    bgt         inner_loop_ht_2             //loop again
+
+    b           end_loops                   //jumps to end
+
+outer_loop_wd_2:                            //called when width is a multiple of 2 but not of 4
+    lsl         x5,x3,#2                    //2*dst_strd
+    mov         x12,x10                     //2wd
+    sub         x9,x5,x10,lsl #1            //4*dst_strd - 4wd
+    lsl         x7,x2,#1                    //2*src_strd
+    sub         x8,x7,x10                   //2*src_strd - 2wd
+
+inner_loop_wd_2:
+
+    add         x6,x0,x2                    //pu1_src + src_strd
+    ld1         {v6.s}[0],[x0]              //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0
+    subs        x12,x12,#4                  //2wd - 4
+    add         x0,x0,#4                    //pu1_src + 4
+    ld1         {v6.s}[1],[x6],x2           //loads pu1_src_tmp
+    dup         v7.2s, v6.2s[1]
+    ld1         {v7.s}[1],[x6],x2           //loads pu1_src_tmp
+    umull       v4.8h, v7.8b, v1.8b         //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+    dup         v7.2s, v7.2s[1]
+    ld1         {v7.s}[1],[x6],x2
+    umlsl       v4.8h, v6.8b, v0.8b
+    umlal       v4.8h, v7.8b, v2.8b
+    dup         v7.2s, v7.2s[1]
+    ld1         {v7.s}[1],[x6]
+    add         x6,x1,x3,lsl #1             //pu1_dst + dst_strd
+    umlsl       v4.8h, v7.8b, v3.8b
+    st1         {v4.d}[0],[x1]              //store the row 1 result
+    add         x1,x1,#8                    //pu1_dst += 4
+    st1         {v4.d}[1],[x6]              //store the row 2 result
+
+    bgt         inner_loop_wd_2             //inner loop again
+
+    //inner loop ends
+    subs        x4,x4,#2                    //ht - 2
+    add         x1,x1,x9                    //pu1_dst += 2*dst_strd - 2*wd
+    mov         x12,x10                     //2wd
+    add         x0,x0,x8                    //pu1_src += 2*src_strd - 2*wd
+
+    bgt         inner_loop_wd_2             //loop again
+
+    b           end_loops                   //jumps to end
+
+core_loop_ht_8:                             //when wd and ht are multiples of 8
+
+    lsl         x12,x3,#3                   //4*dst_strd
+    sub         x8,x12,x10,lsl #1           //4*dst_strd - 2wd
+    lsl         x12,x2,#2                   //4*src_strd
+    sub         x9,x12,x10                  //4*src_strd - 2wd
+
+    bic         x5,x10,#7                   //x5 = 2wd
+    lsr         x14, x10, #3                //divide by 8
+    mul         x12, x4 , x14               //multiply height by width
+    sub         x12, x12,#4                 //subtract one iteration for the epilog
+    lsl         x3, x3, #1
+
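+//note: this path is software-pipelined: 'prolog' primes four rows of
+//results, 'kernel_8' overlaps the loads and multiply-accumulates of the
+//next group with the stores of the previous one, and 'epilog' drains the
+//final group of results.
+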
+prolog:
+    add         x6,x0,x2                    //pu1_src + src_strd
+    ld1         {v5.8b},[x6],x2             //loads pu1_src
+    subs        x5,x5,#8                    //2wd - 8
+    ld1         {v4.8b},[x0],#8             //loads the source
+    ld1         {v6.8b},[x6],x2             //load and increment
+    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
+    ld1         {v7.8b},[x6],x2             //load and increment
+    umlsl       v30.8h, v4.8b, v0.8b
+    add         x7,x1,x3                    //pu1_dst
+    umlal       v30.8h, v6.8b, v2.8b
+    umlsl       v30.8h, v7.8b, v3.8b
+    ld1         {v8.8b},[x6],x2             //load and increment
+
+    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
+    add         x20,x0,x9                   //pu1_src += 4*src_strd - 2*wd
+    csel        x0, x20, x0,le
+    umlsl       v28.8h, v5.8b, v0.8b
+    bic         x20,x10,#7                  //x5 = 2wd
+    csel        x5, x20, x5,le
+    umlal       v28.8h, v7.8b, v2.8b
+    ld1         {v9.8b},[x6],x2
+    umlsl       v28.8h, v8.8b, v3.8b
+
+    ld1         {v10.8b},[x6],x2
+    umull       v26.8h, v7.8b, v1.8b
+    add         x6,x0,x2                    //pu1_src + src_strd
+    umlsl       v26.8h, v6.8b, v0.8b
+    st1         { v30.16b},[x1],#16         //store the result
+    umlal       v26.8h, v8.8b, v2.8b
+    ld1         {v4.8b},[x0],#8             //loads the source
+    umlsl       v26.8h, v9.8b, v3.8b
+
+    add         x20,x1,x8                   //pu1_dst += 4*dst_strd - 2*wd
+    csel        x1, x20, x1,le
+    umull       v24.8h, v8.8b, v1.8b
+    ld1         {v5.8b},[x6],x2             //loads pu1_src
+    umlsl       v24.8h, v7.8b, v0.8b
+    subs        x12,x12,#4
+    ld1         {v6.8b},[x6],x2             //load and increment
+    umlal       v24.8h, v9.8b, v2.8b
+    ld1         {v7.8b},[x6],x2             //load and increment
+    umlsl       v24.8h, v10.8b, v3.8b
+    sub         x20,x2,x2,lsl #3            //x20 = -7*src_strd
+    neg         x11, x20                    //x11 = 7*src_strd
+    add         x14,x2,x2,lsl #1
+    add         x14,x14,x11                 //x14 = 10*src_strd, prefetch limit
+    st1         { v28.16b},[x7],x3          //store the result
+
+    ble         epilog                      //jumps to epilog
+
+kernel_8:
+
+    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
+    subs        x5,x5,#8                    //2wd - 8
+    umlsl       v30.8h, v4.8b, v0.8b
+    add         x20,x0,x9                   //pu1_src += 4*src_strd - 2*wd
+    csel        x0, x20, x0,le
+    umlal       v30.8h, v6.8b, v2.8b
+
+    lsl         x20,x2,#3
+    sub         x20,x20,x2
+    csel        x11,x20,x11,le
+    //rsble        x11,x2,x2,lsl #3
+    umlsl       v30.8h, v7.8b, v3.8b
+    st1         { v26.16b},[x7],x3          //store the result
+
+    ld1         {v8.8b},[x6],x2             //load and increment
+
+    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
+    bic         x20,x10,#7                  //x5 = 2wd
+    csel        x5, x20, x5,le
+    umlsl       v28.8h, v5.8b, v0.8b
+    st1         { v24.16b},[x7],x3          //store the result
+
+    umlal       v28.8h, v7.8b, v2.8b
+    ld1         {v9.8b},[x6],x2
+
+    umlsl       v28.8h, v8.8b, v3.8b
+    ld1         {v10.8b},[x6],x2
+    add         x7,x1,x3                    //pu1_dst
+    umull       v26.8h, v7.8b, v1.8b
+    add         x6,x0,x2                    //pu1_src + src_strd
+    add         x20,x0, x11
+    prfm        PLDL1KEEP,[x20]
+
+    umlsl       v26.8h, v6.8b, v0.8b
+    ld1         {v4.8b},[x0],#8             //loads the source
+
+    add         x11,x11,x2
+    umlal       v26.8h, v8.8b, v2.8b
+    st1         { v30.16b},[x1],#16         //store the result
+
+    umlsl       v26.8h, v9.8b, v3.8b
+    ld1         {v5.8b},[x6],x2             //loads pu1_src
+
+    umull       v24.8h, v8.8b, v1.8b
+    ld1         {v6.8b},[x6],x2             //load and increment
+    add         x20,x1,x8                   //pu1_dst += 4*dst_strd - 2*wd
+    csel        x1, x20, x1,le
+
+    cmp         x11,x14
+
+    lsl         x20,x2,#3
+    sub         x20,x20,x2
+    csel        x11,x20,x11,gt
+    //rsbgt        x11,x2,x2,lsl #3
+
+    umlsl       v24.8h, v7.8b, v0.8b
+    subs        x12,x12,#4
+
+
+    umlal       v24.8h, v9.8b, v2.8b
+    ld1         {v7.8b},[x6],x2             //load and increment
+
+    umlsl       v24.8h, v10.8b, v3.8b
+    st1         { v28.16b},[x7],x3          //store the result
+
+    bgt         kernel_8                    //jumps to kernel_8
+
+epilog:
+
+    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
+    umlsl       v30.8h, v4.8b, v0.8b
+    umlal       v30.8h, v6.8b, v2.8b
+    umlsl       v30.8h, v7.8b, v3.8b
+    st1         { v26.16b},[x7],x3          //store the result
+
+    ld1         {v8.8b},[x6],x2             //load and increment
+    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
+    umlsl       v28.8h, v5.8b, v0.8b
+    umlal       v28.8h, v7.8b, v2.8b
+    umlsl       v28.8h, v8.8b, v3.8b
+    st1         { v24.16b},[x7],x3          //store the result
+
+    ld1         {v9.8b},[x6],x2
+    umull       v26.8h, v7.8b, v1.8b
+    add         x7,x1,x3                    //pu1_dst
+    umlsl       v26.8h, v6.8b, v0.8b
+    st1         { v30.16b},[x1],#16         //store the result
+    umlal       v26.8h, v8.8b, v2.8b
+    ld1         {v10.8b},[x6],x2
+    umlsl       v26.8h, v9.8b, v3.8b
+
+    umull       v24.8h, v8.8b, v1.8b
+    st1         { v28.16b},[x7],x3          //store the result
+    umlsl       v24.8h, v7.8b, v0.8b
+    umlal       v24.8h, v9.8b, v2.8b
+    st1         { v26.16b},[x7],x3          //store the result
+    umlsl       v24.8h, v10.8b, v3.8b
+
+    st1         { v24.16b},[x7],x3          //store the result
+
+end_loops:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_horz.s b/common/arm64/ihevc_inter_pred_filters_luma_horz.s
new file mode 100644
index 0000000..1e246da
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_filters_luma_horz.s
@@ -0,0 +1,605 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//******************************************************************************
+//* //file
+//*  ihevc_inter_pred_luma_horz.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* //author
+//*  parthiban v
+//*
+//* //par list of functions:
+//*
+//*  - ihevc_inter_pred_luma_horz()
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+///* include reconstruction */
+//
+
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*     interprediction luma filter for horizontal input
+//*
+//* //par description:
+//*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+//*    to the elements pointed by 'pu1_src' and writes to the location pointed
+//*    by 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
+//*    assumptions : the function is optimized considering the fact that width
+//*    is a multiple of 4 or 8, and height is a multiple of 2.
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_horz (
+//                            uword8 *pu1_src,
+//                            uword8 *pu1_dst,
+//                            word32 src_strd,
+//                            word32 dst_strd,
+//                            word8 *pi1_coeff,
+//                            word32 ht,
+//                            word32 wd   )
+
+//**************variables vs registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pu1_dst
+//    x2 =>  src_strd
+//    x3 =>  dst_strd
+//    x4 => *pi1_coeff
+//    x5 =>  ht
+//    x6 =>  wd
+
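+//**************a reference c model*********************************************
+//below is a minimal c sketch of the filtering performed here (illustrative
+//only, not part of the build). CLIP_U8 is a hypothetical clamp to [0,255];
+//the rounding (+32 before the shift) mirrors the sqrshrun instruction used
+//in the assembly:
+//
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < wd; col++)
+//        {
+//            word32 sum = 0;
+//            for(i = 0; i < 8; i++)             /* 8-tap luma filter */
+//                sum += pi1_coeff[i] * pu1_src[row * src_strd + col + i - 3];
+//            pu1_dst[row * dst_strd + col] = CLIP_U8((sum + 32) >> 6);
+//        }
+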
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_horz_av8
+
+.type ihevc_inter_pred_luma_horz_av8, %function
+
+ihevc_inter_pred_luma_horz_av8:
+
+    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    //str        x1,[sp,#-4]
+    // mov        x7,#8192
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+start_loop_count:
+    // ldr         x1,[sp,#-4]
+
+
+    mov         x4,x15                      //loads pi1_coeff
+    mov         x8,x16                      //loads ht
+    mov         x10,x17                     //loads wd
+
+    ld1         {v0.8b},[x4]                //coeff = vld1_s8(pi1_coeff)
+    mov         x11,#1
+    subs        x14,x8,#0                   //checks for ht == 0
+
+    abs         v2.8b, v0.8b                //vabs_s8(coeff)
+
+    //ble          end_loops
+
+
+    dup         v24.8b, v2.8b[0]            //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+    sub         x12,x0,#3                   //pu1_src - 3
+    dup         v25.8b, v2.8b[1]            //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+    add         x4,x12,x2                   //pu1_src_tmp2_8 = pu1_src + src_strd
+    dup         v26.8b, v2.8b[2]            //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+    sub         x20,x10,x2,lsl #1           //2*src_strd - wd
+    neg         x9, x20
+    dup         v27.8b, v2.8b[3]            //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+    sub         x20,x10,x3,lsl #1           //2*dst_strd - wd
+    neg         x8, x20
+    dup         v28.8b, v2.8b[4]            //coeffabs_4 = vdup_lane_u8(coeffabs, 4)
+
+    dup         v29.8b, v2.8b[5]            //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
+    // tst          x10,#7                            //checks wd for multiples
+    dup         v30.8b, v2.8b[6]            //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
+    dup         v31.8b, v2.8b[7]            //coeffabs_7 = vdup_lane_u8(coeffabs, 7)
+
+    mov         x7,x1
+
+    cmp         x10,#4
+    ble         outer_loop_4
+
+    cmp         x10,#24
+    mov         x20,#16
+    csel        x10, x20, x10,eq
+    add         x20, x8,#8
+    csel        x8, x20, x8,eq
+    add         x20, x9,#8
+    csel        x9, x20, x9,eq
+
+    cmp         x10,#16
+    bge         outer_loop_16
+
+    cmp         x10,#12
+    add         x20, x8,#4
+    csel        x8, x20, x8,eq
+    add         x20, x9,#4
+    csel        x9, x20, x9,eq
+    b           outer_loop_8
+
+
+outer_loop8_residual:
+    sub         x12,x0,#3                   //pu1_src - 3
+    mov         x1,x7
+    mov         x14,#32
+    add         x1, x1,#16
+    add         x12, x12,#16
+    mov         x10,#8
+    add         x8, x8,#8
+    add         x9, x9,#8
+
+outer_loop_8:
+
+    add         x6,x1,x3                    //pu1_dst + dst_strd
+    add         x4,x12,x2                   //pu1_src + src_strd
+    subs        x5,x10,#0                   //checks wd
+
+    ble         end_inner_loop_8
+
+inner_loop_8:
+    ld1         {v0.2s},[x12],x11           //vector load pu1_src
+    ld1         {v1.2s},[x12],x11
+    ld1         {v2.2s},[x12],x11
+    ld1         {v3.2s},[x12],x11
+
+
+
+
+
+    // vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
+    // vext.u8    d3,d0,d1,#3                        //vector extract of src[0_3]
+    // vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
+    // vext.u8    d5,d0,d1,#5                        //vector extract of src[0_5]
+    // vext.u8    d6,d0,d1,#6                        //vector extract of src [0_6]
+    // vext.u8    d7,d0,d1,#7                        //vector extract of src[0_7]
+    // vext.u8    d1,d0,d1,#1                        //vector extract of src[0_1]
+    // vext.u8    d14,d12,d13,#2
+
+    //vext.u8    d15,d12,d13,#3                    //vector extract of src[0_3]
+    // vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
+    // vext.u8    d17,d12,d13,#5                    //vector extract of src[0_5]
+    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
+    //vext.u8    d19,d12,d13,#7                    //vector extract of src[0_7]
+    //vext.u8    d13,d12,d13,#1                    //vector extract of src[0_1]
+    ld1         {v4.2s},[x12],x11
+    umull       v8.8h, v1.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    ld1         {v5.2s},[x12],x11
+    umlal       v8.8h, v3.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         {v6.2s},[x12],x11
+    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    ld1         {v7.2s},[x12],x11
+    umlsl       v8.8h, v2.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    ld1         {v12.2s},[x4],x11           //vector load pu1_src + src_strd
+    umlal       v8.8h, v4.8b, v28.8b        //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+    ld1         {v13.2s},[x4],x11
+    umlsl       v8.8h, v5.8b, v29.8b        //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+    ld1         {v14.2s},[x4],x11
+    umlal       v8.8h, v6.8b, v30.8b        //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+    ld1         {v15.2s},[x4],x11
+    umlsl       v8.8h, v7.8b, v31.8b        //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+    ld1         {v16.2s},[x4],x11           //vector load pu1_src + src_strd
+
+    umull       v10.8h, v15.8b, v27.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         {v17.2s},[x4],x11
+    umlsl       v10.8h, v14.8b, v26.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    ld1         {v18.2s},[x4],x11
+    umlal       v10.8h, v16.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+    ld1         {v19.2s},[x4],x11           //vector load pu1_src + src_strd
+    umlsl       v10.8h, v17.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+    sqrshrun    v20.8b, v8.8h,#6            //right shift and saturating narrow result 1
+    umlal       v10.8h, v18.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+    umlsl       v10.8h, v19.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+    st1         {v20.8b},[x1],#8            //store the result pu1_dst
+    umlsl       v10.8h, v12.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    umlal       v10.8h, v13.8b, v25.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+
+    sqrshrun    v8.8b, v10.8h,#6            //right shift and saturating narrow result 2
+    subs        x5,x5,#8                    //decrement the wd loop
+    st1         {v8.8b},[x6],#8             //store the result pu1_dst
+    cmp         x5,#4
+    bgt         inner_loop_8
+
+end_inner_loop_8:
+    subs        x14,x14,#2                  //decrement the ht loop
+    add         x12,x12,x9                  //increment the src pointer by 2*src_strd-wd
+    add         x1,x1,x8                    //increment the dst pointer by 2*dst_strd-wd
+    bgt         outer_loop_8
+
+
+
+
+
+    mov         x10,x17                     //loads wd
+    cmp         x10,#12
+
+    beq         outer_loop4_residual
+
+
+end_loops:
+
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+outer_loop_16:
+    mov         x15, #-7                    //post-index offset: +8 then -7 gives a net +1 per load pair
+    stp         x0,x7, [sp, #-16]!
+
+    add         x6,x1,x3                    //pu1_dst + dst_strd
+    add         x4,x12,x2                   //pu1_src + src_strd
+    and         x0, x12, #31
+    sub         x5,x10,#0                   //checks wd
+    //ble          end_loops1
+    add         x20,x12, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]
+    ld1         { v0.2s},[x12],#8           //vector load pu1_src
+    ld1         { v1.2s},[x12],x15          //vector load pu1_src
+    add         x20,x4, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]
+    ld1         { v2.2s},[x12],#8
+    ld1         { v3.2s},[x12],x15
+    ld1         { v4.2s},[x12],#8
+    ld1         { v5.2s},[x12],x15
+    ld1         { v6.2s},[x12],#8
+    ld1         { v7.2s},[x12],x15
+    ld1         { v12.2s},[x12],#8
+    ld1         { v13.2s},[x12],x15
+    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    ld1         { v14.2s},[x12],#8
+    ld1         { v15.2s},[x12],x15
+    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         { v16.2s},[x12],#8
+    ld1         { v17.2s},[x12],x15
+    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    ld1         { v18.2s},[x12],#8
+    ld1         { v19.2s},[x12],x15
+    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+
+
+inner_loop_16:
+
+
+    subs        x5,x5,#16
+    umull       v20.8h, v3.8b, v25.8b
+
+    add         x12, x12,#8
+    umlsl       v20.8h, v1.8b, v24.8b
+
+    sub         x20,x14,#2
+    csel        x14, x20, x14,eq
+    umlal       v20.8h, v7.8b, v27.8b
+
+    ld1         { v0.2s},[x4],#8            //vector load pu1_src
+    ld1         { v1.2s},[x4],x15           //vector load pu1_src
+
+    umlsl       v20.8h, v5.8b, v26.8b
+
+    ld1         { v2.2s},[x4],#8
+    ld1         { v3.2s},[x4],x15
+
+    umlal       v20.8h, v13.8b, v28.8b
+
+    ld1         { v4.2s},[x4],#8
+    ld1         { v5.2s},[x4],x15
+    umlal       v20.8h, v17.8b, v30.8b
+
+    ld1         { v6.2s},[x4],#8
+    ld1         { v7.2s},[x4],x15
+    umlsl       v20.8h, v15.8b, v29.8b
+
+    ld1         { v12.2s},[x4],#8
+    ld1         { v13.2s},[x4],x15
+    umlsl       v20.8h, v19.8b, v31.8b
+
+    ld1         { v14.2s},[x4],#8
+    ld1         { v15.2s},[x4],x15
+    sqrshrun    v8.8b, v8.8h,#6             //right shift and saturating narrow result 1
+
+    ld1         { v16.2s},[x4],#8
+    ld1         { v17.2s},[x4],x15
+    umull       v10.8h, v2.8b, v25.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    ld1         { v18.2s},[x4],#8
+    ld1         { v19.2s},[x4],x15
+    umlal       v10.8h, v6.8b, v27.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    add         x4, x4,#8
+    umlsl       v10.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+    add         x20,x12,x9                  //increment the src pointer by 2*src_strd-wd
+    csel        x12, x20, x12,eq
+    umlsl       v10.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    add         x20,x12,x2                  //pu1_src + src_strd
+    csel        x4, x20, x4,eq
+    sqrshrun    v9.8b, v20.8h,#6
+
+    umlal       v10.8h, v12.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+
+//    and            x7, x12, #31
+    umlsl       v10.8h, v14.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+
+    umlal       v10.8h, v16.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+
+    umlsl       v10.8h, v18.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+
+    umull       v22.8h, v3.8b, v25.8b
+
+    umlsl       v22.8h, v1.8b, v24.8b
+
+    st1         { v8.8b},[x1],#8            //store the result pu1_dst
+    st1         { v9.8b},[x1],#8            //store the result pu1_dst
+    umlal       v22.8h, v7.8b, v27.8b
+
+    add         x20,x1,x8
+    csel        x1, x20, x1,eq
+    sqrshrun    v10.8b, v10.8h,#6           //right shift and saturating narrow result 2
+
+//    cmp            x7, x0
+    umlsl       v22.8h, v5.8b, v26.8b
+
+    add         x20,x12, x2, lsl #2
+    prfm        PLDL1KEEP,[x20]
+    umlal       v22.8h, v13.8b, v28.8b
+
+    add         x20,x4, x2, lsl #2
+    prfm        PLDL1KEEP,[x20]
+    umlal       v22.8h, v17.8b, v30.8b
+
+//    mov            x0, x7
+    umlsl       v22.8h, v15.8b, v29.8b
+
+    cmp         x14,#0
+    umlsl       v22.8h, v19.8b, v31.8b
+
+    beq         epilog_16
+    ld1         { v0.2s},[x12],#8           //vector load pu1_src
+    ld1         { v1.2s},[x12],x15          //vector load pu1_src
+    ld1         { v2.2s},[x12],#8
+    ld1         { v3.2s},[x12],x15
+    ld1         { v4.2s},[x12],#8
+    ld1         { v5.2s},[x12],x15
+    ld1         { v6.2s},[x12],#8
+    ld1         { v7.2s},[x12],x15
+    ld1         { v12.2s},[x12],#8
+    ld1         { v13.2s},[x12],x15
+    sqrshrun    v11.8b, v22.8h,#6
+    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    ld1         { v14.2s},[x12],#8
+    ld1         { v15.2s},[x12],x15
+    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         { v16.2s},[x12],#8
+    ld1         { v17.2s},[x12],x15
+    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    ld1         { v18.2s},[x12],#8
+    ld1         { v19.2s},[x12],x15
+    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+    cmp         x5,#0
+    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+    csel        x5, x10, x5,eq
+    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+    st1         { v10.8b},[x6],#8           //store the result pu1_dst
+    st1         { v11.8b},[x6],#8           //store the result pu1_dst
+    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+    add         x20,x1,x3                   //pu1_dst + dst_strd
+    csel        x6, x20, x6,eq
+    b           inner_loop_16
+
+
+epilog_16:
+    sqrshrun    v11.8b, v22.8h,#6
+    st1         { v10.8b},[x6],#8           //store the result pu1_dst
+    st1         { v11.8b},[x6],#8           //store the result pu1_dst
+
+    ldp         x0,x7, [sp], #16
+    mov         x10,x17
+    cmp         x10,#24
+
+    beq         outer_loop8_residual
+
+
+
+end_loops1:
+
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
+
+outer_loop4_residual:
+    sub         x12,x0,#3                   //pu1_src - 3
+    mov         x1,x7
+    add         x1, x1,#8
+    mov         x10,#4
+    add         x12, x12,#8
+    mov         x14,#16
+    add         x8, x8,#4
+    add         x9, x9,#4
+
+outer_loop_4:
+    add         x6,x1,x3                    //pu1_dst + dst_strd
+    add         x4,x12,x2                   //pu1_src + src_strd
+
+    subs        x5,x10,#0                   //checks wd
+    ble         end_inner_loop_4
+
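+//two consecutive rows are processed together here: matching 4-byte groups
+//from row i and row ii are zipped into one 8-lane register, so a single set
+//of 8-bit multiply-accumulates produces both rows' outputs at once.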
+inner_loop_4:
+    ld1         {v20.2s},[x12],x11          //vector load pu1_src
+    ld1         {v21.2s},[x12],x11
+    ld1         {v22.2s},[x4],x11           //vector load pu1_src + src_strd
+    ld1         {v23.2s},[x4],x11
+
+    zip1        v0.2s, v20.2s, v22.2s
+    zip2        v12.2s, v20.2s, v22.2s      //zip the i and ii iteration rows into a single register
+    zip1        v1.2s, v21.2s, v23.2s
+    zip2        v13.2s, v21.2s, v23.2s
+
+    ld1         {v20.2s},[x12],x11          //vector load pu1_src
+    ld1         {v21.2s},[x12],x11
+    ld1         {v22.2s},[x4],x11           //vector load pu1_src + src_strd
+    ld1         {v23.2s},[x4],x11
+
+    zip1        v2.2s, v20.2s, v22.2s
+    zip2        v14.2s, v20.2s, v22.2s
+    zip1        v3.2s, v21.2s, v23.2s
+    zip2        v15.2s, v21.2s, v23.2s
+
+    ld1         {v20.2s},[x12],x11          //vector load pu1_src
+    ld1         {v21.2s},[x12],x11
+    ld1         {v22.2s},[x4],x11           //vector load pu1_src + src_strd
+    ld1         {v23.2s},[x4],x11
+
+    zip1        v4.2s, v20.2s, v22.2s
+    zip2        v16.2s, v20.2s, v22.2s
+    zip1        v5.2s, v21.2s, v23.2s
+    zip2        v17.2s, v21.2s, v23.2s
+
+    ld1         {v20.2s},[x12],x11          //vector load pu1_src
+    ld1         {v21.2s},[x12],x11
+    ld1         {v22.2s},[x4],x11           //vector load pu1_src + src_strd
+    ld1         {v23.2s},[x4],x11
+
+    zip1        v6.2s, v20.2s, v22.2s
+    zip2        v18.2s, v20.2s, v22.2s
+    zip1        v7.2s, v21.2s, v23.2s
+    zip2        v19.2s, v21.2s, v23.2s
+
+    //add        x12,x12,#4                        //increment the input pointer
+    sub         x12,x12,#4
+    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
+    //vext.u8    d3,d0,d1,#3                        //vector extract of src[0_3]
+    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
+
+    //vext.u8    d5,d0,d1,#5                        //vector extract of src[0_5]
+    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
+    //vext.u8    d7,d0,d1,#7                        //vector extract of src[0_7]
+    //vext.u8    d1,d0,d1,#1                        //vector extract of src[0_1]
+
+    sub         x4,x4,#4
+    // add        x4,x4,#4                        //increment the input pointer
+    // vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
+    // vext.u8    d15,d12,d13,#3                    //vector extract of src[0_3]
+    // vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
+    // vext.u8    d17,d12,d13,#5                    //vector extract of src[0_5]
+    // vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
+    // vext.u8    d19,d12,d13,#7                    //vector extract of src[0_7]
+    //vext.u8    d13,d12,d13,#1                    //vector extract of src[0_1]
+
+    umull       v8.8h, v1.8b, v25.8b        //arithmetic operations for the i and ii iterations at the same time
+    umlsl       v8.8h, v0.8b, v24.8b
+    umlsl       v8.8h, v2.8b, v26.8b
+    umlal       v8.8h, v3.8b, v27.8b
+    umlal       v8.8h, v4.8b, v28.8b
+    umlsl       v8.8h, v5.8b, v29.8b
+    umlal       v8.8h, v6.8b, v30.8b
+    umlsl       v8.8h, v7.8b, v31.8b
+
+    sqrshrun    v8.8b, v8.8h,#6             //rounding right shift, saturate and narrow the result
+    st1         {v8.s}[0],[x1],#4           //store the i iteration result from the lower part of the register
+    st1         {v8.s}[1],[x6],#4           //store the ii iteration result from the upper part of the register
+    subs        x5,x5,#4                    //decrement the wd by 4
+    bgt         inner_loop_4
+
+end_inner_loop_4:
+    subs        x14,x14,#2                  //decrement the ht by 2
+    add         x12,x12,x9                  //increment the input pointer 2*src_strd-wd
+    add         x1,x1,x8                    //increment the output pointer 2*dst_strd-wd
+    bgt         outer_loop_4
+    //subs     x7,x7,#1
+    // bgt     start_loop_count
+
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert.s b/common/arm64/ihevc_inter_pred_filters_luma_vert.s
new file mode 100644
index 0000000..48dc30f
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert.s
@@ -0,0 +1,522 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//******************************************************************************
+//* //file
+//*  ihevc_inter_pred_filters_luma_vert.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* //author
+//*  parthiban v
+//*
+//* //par list of functions:
+//*
+//*  - ihevc_inter_pred_luma_vert()
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+///* include reconstruction */
+
+
+
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*     interprediction luma filter for vertical input
+//*
+//* //par description:
+//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff'
+//*    to the elements pointed by 'pu1_src' and writes to the location pointed
+//*    by 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
+//*    assumptions : the function is optimized considering the fact that width
+//*    is a multiple of 4 or 8, and height is a multiple of 2.
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_vert (
+//                            uword8 *pu1_src,
+//                            uword8 *pu1_dst,
+//                            word32 src_strd,
+//                            word32 dst_strd,
+//                            word8 *pi1_coeff,
+//                            word32 ht,
+//                            word32 wd   )
+
+//**************variables vs registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pu1_dst
+//    x2 =>  src_strd
+//    x6 =>  dst_strd
+//    x12 => *pi1_coeff
+//    x5 =>  ht
+//    x3 =>  wd
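+
+//**************a reference c model*********************************************
+//below is a minimal c sketch of the filtering performed here (illustrative
+//only, not part of the build). CLIP_U8 is a hypothetical clamp to [0,255];
+//the taps span rows -3..+4 because the source pointer is moved back by
+//3*src_strd before the loops:
+//
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < wd; col++)
+//        {
+//            word32 sum = 0;
+//            for(i = 0; i < 8; i++)             /* 8-tap luma filter */
+//                sum += pi1_coeff[i] * pu1_src[(row + i - 3) * src_strd + col];
+//            pu1_dst[row * dst_strd + col] = CLIP_U8((sum + 32) >> 6);
+//        }
+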
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_vert_av8
+
+.type ihevc_inter_pred_luma_vert_av8, %function
+
+ihevc_inter_pred_luma_vert_av8:
+
+    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+    mov         x12,x15                     //load pi1_coeff
+    mov         x6,x3
+    mov         x5,x17                      //load wd
+    ld1         {v0.8b},[x12]               //coeff = vld1_s8(pi1_coeff)
+    sub         x12,x2,x2,lsl #2            //x12 = -3*src_strd
+    abs         v0.8b, v0.8b                //vabs_s8(coeff)
+    add         x0,x0,x12                   //pu1_src -= 3*src_strd
+    mov         x3,x16                      //load ht
+    subs        x7,x3,#0                    //x3->ht
+    //ble          end_loops            //end loop jump
+    dup         v22.8b, v0.8b[0]            //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
+    cmp         x5,#8
+    dup         v23.8b, v0.8b[1]            //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
+    dup         v24.8b, v0.8b[2]            //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
+    dup         v25.8b, v0.8b[3]            //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
+    dup         v26.8b, v0.8b[4]            //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
+    dup         v27.8b, v0.8b[5]            //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
+    dup         v28.8b, v0.8b[6]            //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
+    dup         v29.8b, v0.8b[7]            //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
+    blt         core_loop_wd_4              //core loop wd 4 jump
+    stp         x0,x1, [sp, #-16]!
+
+    bic         x4,x5,#7                    //x5 ->wd
+    sub         x20,x4,x6,lsl #2            //x6->dst_strd    x5    ->wd
+    neg         x9, x20
+    sub         x20,x4,x2,lsl #2            //x2->src_strd
+    neg         x8, x20
+    lsr         x3, x5, #3                  //divide by 8
+    mul         x7, x7, x3                  //multiply height by width
+    sub         x7, x7,#4                   //subtract one iteration for the epilog
+
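+//note: this path is software-pipelined: 'prolog' primes the first rows of
+//results, 'kernel_8' overlaps loads and multiply-accumulates of the next
+//group with stores of the previous one, and 'epilog'/'epilog_end' drain
+//the final groups.
+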
+prolog:
+
+    and         x10, x0, #31
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+    ld1         {v1.8b},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    ld1         {v0.8b},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    subs        x4,x4,#8
+    ld1         {v2.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umull       v8.8h, v1.8b, v23.8b        //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+    ld1         {v3.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    umlsl       v8.8h, v0.8b, v22.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+    ld1         {v4.8b},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlsl       v8.8h, v2.8b, v24.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+    ld1         {v5.8b},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlal       v8.8h, v3.8b, v25.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+    ld1         {v6.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlal       v8.8h, v4.8b, v26.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+    ld1         {v7.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    umlsl       v8.8h, v5.8b, v27.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+    ld1         {v16.8b},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlal       v8.8h, v6.8b, v28.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+    ld1         {v17.8b},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlsl       v8.8h, v7.8b, v29.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+
+
+    ld1         {v18.8b},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umull       v10.8h, v2.8b, v23.8b       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+
+    add         x20,x0,x8
+    csel        x0, x20, x0,le
+    umlsl       v10.8h, v1.8b, v22.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+
+    bic         x20,x5,#7                   //x5 ->wd
+    csel        x4, x20, x4,le
+    umlsl       v10.8h, v3.8b, v24.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+
+    prfm        PLDL1KEEP,[x3]
+    umlal       v10.8h, v4.8b, v25.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+    add         x20,x3, x2
+    prfm        PLDL1KEEP,[x20]
+    umlal       v10.8h, v5.8b, v26.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+    add         x20,x3, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]
+    umlsl       v10.8h, v6.8b, v27.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+
+    add         x3, x3, x2
+    umlal       v10.8h, v7.8b, v28.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+
+    add         x20,x3, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]
+    umlsl       v10.8h, v16.8b, v29.8b      //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+    sqrshrun    v8.8b, v8.8h,#6             //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    ld1         {v1.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umull       v12.8h, v3.8b, v23.8b
+    ld1         {v0.8b},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlsl       v12.8h, v2.8b, v22.8b
+    ld1         {v2.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlsl       v12.8h, v4.8b, v24.8b
+    umlal       v12.8h, v5.8b, v25.8b
+    umlal       v12.8h, v6.8b, v26.8b
+    umlsl       v12.8h, v7.8b, v27.8b
+    umlal       v12.8h, v16.8b, v28.8b
+    umlsl       v12.8h, v17.8b, v29.8b
+    add         x14,x1,x6
+    st1         {v8.8b},[x1],#8             //vst1_u8(pu1_dst,sto_res)//
+    sqrshrun    v10.8b, v10.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
+    add         x20,x1,x9
+    csel        x1, x20, x1,le
+
+    umull       v14.8h, v4.8b, v23.8b
+    subs        x7,x7,#4
+    umlsl       v14.8h, v3.8b, v22.8b
+    umlsl       v14.8h, v5.8b, v24.8b
+    umlal       v14.8h, v6.8b, v25.8b
+    ld1         {v3.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    umlal       v14.8h, v7.8b, v26.8b
+    ld1         {v4.8b},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlsl       v14.8h, v16.8b, v27.8b
+    ld1         {v5.8b},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlal       v14.8h, v17.8b, v28.8b
+    ld1         {v6.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlsl       v14.8h, v18.8b, v29.8b
+    ld1         {v7.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+
+    st1         {v10.8b},[x14],x6           //vst1_u8(pu1_dst_tmp,sto_res)//
+    sqrshrun    v12.8b, v12.8h,#6
+
+
+    blt         epilog_end                  //jumps to epilog_end
+    beq         epilog                      //jumps to epilog
+
+kernel_8:
+
+    subs        x4,x4,#8
+    umull       v8.8h, v1.8b, v23.8b        //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+
+    add         x20,x0,x8
+    csel        x0, x20, x0,le
+    umlsl       v8.8h, v0.8b, v22.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+
+    bic         x20,x5,#7                   //x5 ->wd
+    csel        x4, x20, x4,le
+    umlsl       v8.8h, v2.8b, v24.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+
+    ld1         {v16.8b},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlal       v8.8h, v3.8b, v25.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+
+    ld1         {v17.8b},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlal       v8.8h, v4.8b, v26.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+
+    ld1         {v18.8b},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlsl       v8.8h, v5.8b, v27.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+
+    umlal       v8.8h, v6.8b, v28.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+
+    umlsl       v8.8h, v7.8b, v29.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+    st1         {v12.8b},[x14],x6
+
+//    and            x11, x0, #31
+    sqrshrun    v14.8b, v14.8h,#6
+
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+    umull       v10.8h, v2.8b, v23.8b       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+
+    ld1         {v0.8b},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlsl       v10.8h, v1.8b, v22.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+
+    umlsl       v10.8h, v3.8b, v24.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+
+    ld1         {v1.8b},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlal       v10.8h, v4.8b, v25.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+
+    st1         {v14.8b},[x14],x6
+    umlal       v10.8h, v5.8b, v26.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+
+    add         x14,x1,#0
+    umlsl       v10.8h, v6.8b, v27.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+
+    add         x1, x1, #8
+    umlal       v10.8h, v7.8b, v28.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+
+    umlsl       v10.8h, v16.8b, v29.8b      //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+    add         x20,x1,x9
+    csel        x1, x20, x1,le
+    sqrshrun    v8.8b, v8.8h,#6             //sto_res = vqmovun_s16(sto_res_tmp)//
+
+//    cmp            x11, x10
+    umull       v12.8h, v3.8b, v23.8b
+
+    add         x10, x3, x2, lsl #3         // 10*strd - 8+2
+    umlsl       v12.8h, v2.8b, v22.8b
+
+    add         x10, x10, x2                // 11*strd
+    umlsl       v12.8h, v4.8b, v24.8b
+
+    ld1         {v2.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlal       v12.8h, v5.8b, v25.8b
+
+    umlal       v12.8h, v6.8b, v26.8b
+    st1         {v8.8b},[x14],x6            //vst1_u8(pu1_dst,sto_res)//
+
+    prfm        PLDL1KEEP,[x10]             //11+ 0
+    umlsl       v12.8h, v7.8b, v27.8b
+
+    add         x20,x10, x2
+    prfm        PLDL1KEEP,[x20]             //11+ 1*strd
+    umlal       v12.8h, v16.8b, v28.8b
+
+    add         x20,x10, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]             //11+ 2*strd
+    umlsl       v12.8h, v17.8b, v29.8b
+
+    add         x10, x10, x2                //12*strd
+    sqrshrun    v10.8b, v10.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    add         x20,x10, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]             //11+ 3*strd
+    umull       v14.8h, v4.8b, v23.8b
+
+//    mov            x10, x11
+    umlsl       v14.8h, v3.8b, v22.8b
+
+    subs        x7,x7,#4
+    umlsl       v14.8h, v5.8b, v24.8b
+
+    umlal       v14.8h, v6.8b, v25.8b
+    ld1         {v3.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    umlal       v14.8h, v7.8b, v26.8b
+    ld1         {v4.8b},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlsl       v14.8h, v16.8b, v27.8b
+    ld1         {v5.8b},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlal       v14.8h, v17.8b, v28.8b
+    ld1         {v6.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlsl       v14.8h, v18.8b, v29.8b
+    ld1         {v7.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+
+    sqrshrun    v12.8b, v12.8h,#6
+    st1         {v10.8b},[x14],x6           //vst1_u8(pu1_dst_tmp,sto_res)//
+
+
+
+    bgt         kernel_8                    //jumps to kernel_8
+
+epilog:
+
+    umull       v8.8h, v1.8b, v23.8b        //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+    umlsl       v8.8h, v0.8b, v22.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+    umlsl       v8.8h, v2.8b, v24.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+    umlal       v8.8h, v3.8b, v25.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+    umlal       v8.8h, v4.8b, v26.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+    umlsl       v8.8h, v5.8b, v27.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+    umlal       v8.8h, v6.8b, v28.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+    umlsl       v8.8h, v7.8b, v29.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+    st1         {v12.8b},[x14],x6
+
+    sqrshrun    v14.8b, v14.8h,#6
+
+    ld1         {v16.8b},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umull       v10.8h, v2.8b, v23.8b       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+    umlsl       v10.8h, v1.8b, v22.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+    umlsl       v10.8h, v3.8b, v24.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+    umlal       v10.8h, v4.8b, v25.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+    umlal       v10.8h, v5.8b, v26.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+    umlsl       v10.8h, v6.8b, v27.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+    umlal       v10.8h, v7.8b, v28.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+    umlsl       v10.8h, v16.8b, v29.8b      //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+    st1         {v14.8b},[x14],x6
+
+    sqrshrun    v8.8b, v8.8h,#6             //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    ld1         {v17.8b},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umull       v12.8h, v3.8b, v23.8b
+    umlsl       v12.8h, v2.8b, v22.8b
+    umlsl       v12.8h, v4.8b, v24.8b
+    umlal       v12.8h, v5.8b, v25.8b
+    umlal       v12.8h, v6.8b, v26.8b
+    umlsl       v12.8h, v7.8b, v27.8b
+    umlal       v12.8h, v16.8b, v28.8b
+    umlsl       v12.8h, v17.8b, v29.8b
+    add         x14,x1,x6
+    st1         {v8.8b},[x1],#8             //vst1_u8(pu1_dst,sto_res)//
+    sqrshrun    v10.8b, v10.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    ld1         {v18.8b},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umull       v14.8h, v4.8b, v23.8b
+    umlsl       v14.8h, v3.8b, v22.8b
+    umlsl       v14.8h, v5.8b, v24.8b
+    umlal       v14.8h, v6.8b, v25.8b
+    umlal       v14.8h, v7.8b, v26.8b
+    umlsl       v14.8h, v16.8b, v27.8b
+    umlal       v14.8h, v17.8b, v28.8b
+    umlsl       v14.8h, v18.8b, v29.8b
+
+    st1         {v10.8b},[x14],x6           //vst1_u8(pu1_dst_tmp,sto_res)//
+    sqrshrun    v12.8b, v12.8h,#6
+
+epilog_end:
+    st1         {v12.8b},[x14],x6
+    sqrshrun    v14.8b, v14.8h,#6
+
+    st1         {v14.8b},[x14],x6
+
+
+end_loops:
+    tst         x5,#7
+    ldp         x0,x1, [sp],#16
+
+    // ldmeqfd sp!,{x4-x12,x15}    //reload the registers from sp
+    bne         lbl409
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+lbl409:
+    mov         x5, #4
+    add         x0, x0, #8
+    add         x1, x1, #8
+    mov         x7, #16
+    //
+
+core_loop_wd_4:
+    sub         x20,x5,x6,lsl #2            //x6->dst_strd    x5    ->wd
+    neg         x9, x20
+    sub         x20,x5,x2,lsl #2            //x2->src_strd
+    neg         x8, x20
+    movi        v4.8b, #0
+
+outer_loop_wd_4:
+    subs        x12,x5,#0
+    ble         end_inner_loop_wd_4         //outer loop jump
+
+inner_loop_wd_4:
+    add         x3,x0,x2
+    ld1         {v4.s}[1],[x3],x2           //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
+    subs        x12,x12,#4
+    dup         v5.2s, v4.2s[1]             //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+    ld1         {v5.s}[1],[x3],x2           //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
+    ld1         {v4.s}[0],[x0]              //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)//
+    umull       v0.8h, v5.8b, v23.8b        //mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)//
+
+    dup         v6.2s, v5.2s[1]             //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+    add         x0,x0,#4
+    ld1         {v6.s}[1],[x3],x2           //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
+    umlsl       v0.8h, v4.8b, v22.8b        //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)//
+
+    dup         v7.2s, v6.2s[1]             //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+    ld1         {v7.s}[1],[x3],x2           //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
+    umlsl       v0.8h, v6.8b, v24.8b        //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)//
+
+    umull       v8.8h, v7.8b, v23.8b
+    dup         v4.2s, v7.2s[1]             //src_tmp1 = vdup_lane_u32(src_tmp4, 1)//
+    umull       v2.8h, v7.8b, v25.8b        //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)//
+    ld1         {v4.s}[1],[x3],x2           //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
+    umlsl       v8.8h, v6.8b, v22.8b
+    umlal       v0.8h, v4.8b, v26.8b        //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)//
+
+    dup         v5.2s, v4.2s[1]             //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+    umlsl       v8.8h, v4.8b, v24.8b
+    ld1         {v5.s}[1],[x3],x2           //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
+    umlsl       v2.8h, v5.8b, v27.8b        //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)//
+
+    dup         v6.2s, v5.2s[1]             //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+    umlal       v8.8h, v5.8b, v25.8b
+    ld1         {v6.s}[1],[x3],x2           //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
+    umlal       v0.8h, v6.8b, v28.8b        //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)//
+
+    dup         v7.2s, v6.2s[1]             //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+    umlal       v8.8h, v6.8b, v26.8b
+    ld1         {v7.s}[1],[x3],x2           //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
+    umlsl       v2.8h, v7.8b, v29.8b        //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)//
+
+    dup         v4.2s, v7.2s[1]
+    add         v0.8h,  v0.8h ,  v2.8h      //mul_res1 = vaddq_u16(mul_res1, mul_res2)//
+
+    umlsl       v8.8h, v7.8b, v27.8b
+    ld1         {v4.s}[1],[x3],x2
+    umlal       v8.8h, v4.8b, v28.8b
+    dup         v5.2s, v4.2s[1]
+    sqrshrun    v0.8b, v0.8h,#6             //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    ld1         {v5.s}[1],[x3]
+    add         x3,x1,x6
+    st1         {v0.s}[0],[x1]              //vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)//
+
+    umlsl       v8.8h, v5.8b, v29.8b
+    st1         {v0.s}[1],[x3],x6           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)//
+    sqrshrun    v8.8b, v8.8h,#6
+
+    st1         {v8.s}[0],[x3],x6
+    add         x1,x1,#4
+    st1         {v8.s}[1],[x3]
+    bgt         inner_loop_wd_4
+
+end_inner_loop_wd_4:
+    subs        x7,x7,#4
+    add         x1,x1,x9
+    add         x0,x0,x8
+    bgt         outer_loop_wd_4
+
+    // ldmfd sp!, {x4-x12, x15}    //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
new file mode 100644
index 0000000..64a00b2
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
@@ -0,0 +1,407 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//******************************************************************************
+//* //file
+//*  ihevc_inter_pred_filters_luma_vert_w16inp.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//*  functions are coded using neon intrinsics and can be compiled
+//*  using rvct.
+//*
+//* //author
+//*  yogeswaran rs
+//*
+//* //par list of functions:
+//*
+//*  - ihevc_inter_pred_luma_vert_w16inp()
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+///* include reconstruction */
+//
+
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*    luma vertical filter for 16bit input.
+//*
+//* //par description:
+//*     applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//*     the elements pointed by 'pi2_src' and writes to the location pointed by
+//*     'pu1_dst'. input is 16 bits. the filter output is downshifted by 12 and
+//*     clipped to lie between 0 and 255. assumptions: the function is
+//*     optimized assuming width is a multiple of 4 and height a multiple of 2.
+//*
+//* //param[in] pi2_src
+//*  word16 pointer to the source
+//*
+//* //param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_vert_w16inp(word16 *pi2_src,
+//                                    uword8 *pu1_dst,
+//                                    word32 src_strd,
+//                                    word32 dst_strd,
+//                                    word8 *pi1_coeff,
+//                                    word32 ht,
+//                                    word32 wd   )
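+
+// a minimal c sketch of what this routine computes, assuming the
+// standard 8-tap vertical window with three rows of look-back; the
+// word16/word32/uword8 typedefs are the codec's own, and the loop
+// locals (row/col/tap/sum) are illustrative:
+//
+// word32 row, col, tap, sum;
+// for(row = 0; row < ht; row++)
+//     for(col = 0; col < wd; col++)
+//     {
+//         sum = 0;
+//         for(tap = 0; tap < 8; tap++)
+//             sum += pi2_src[(row + tap - 3) * src_strd + col] * pi1_coeff[tap];
+//         /* the asm shifts in two saturating stages (sqshrn #6, then a
+//            rounding sqrshrun #6); the net effect is a rounded >>12 */
+//         sum = (sum + (1 << 11)) >> 12;
+//         pu1_dst[row * dst_strd + col] = (uword8)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+//     }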
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_vert_w16inp_av8
+
+.type ihevc_inter_pred_luma_vert_w16inp_av8, %function
+
+ihevc_inter_pred_luma_vert_w16inp_av8:
+
+    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+    mov         x12,x15                     //load pi1_coeff
+    mov         x6,x3
+    mov         x5,x17                      //load wd
+    ld1         {v0.8b},[x12]               //coeff = vld1_s8(pi1_coeff)
+    lsl         x2, x2, #1                  //src_strd in bytes (16-bit input)
+    sub         x12,x2,x2,lsl #2            //x12 = -3 * src_strd
+    //abs  v0.8b, v0.8b                //vabs_s8(coeff)
+    add         x0,x0,x12                   //x0 = pu1_src - 3 * src_strd (3 rows above)
+    mov         x3,x16                      //load ht
+    subs        x7,x3,#0                    //x3->ht
+    //ble          end_loops            //end loop jump
+    sxtl        v0.8h, v0.8b
+    dup         v22.4h, v0.4h[0]            //coeffabs_0 = vdup_lane_s16(coeffabs, 0)//
+    dup         v23.4h, v0.4h[1]            //coeffabs_1 = vdup_lane_s16(coeffabs, 1)//
+    dup         v24.4h, v0.4h[2]            //coeffabs_2 = vdup_lane_s16(coeffabs, 2)//
+    dup         v25.4h, v0.4h[3]            //coeffabs_3 = vdup_lane_s16(coeffabs, 3)//
+    dup         v26.4h, v0.4h[4]            //coeffabs_4 = vdup_lane_s16(coeffabs, 4)//
+    dup         v27.4h, v0.4h[5]            //coeffabs_5 = vdup_lane_s16(coeffabs, 5)//
+    dup         v28.4h, v0.4h[6]            //coeffabs_6 = vdup_lane_s16(coeffabs, 6)//
+    dup         v29.4h, v0.4h[7]            //coeffabs_7 = vdup_lane_s16(coeffabs, 7)//
+
+    sub         x20,x5,x6,lsl #2            //x6->dst_strd    x5->wd
+    neg         x9, x20                     //x9 = 4*dst_strd - wd (dst rewind)
+    sub         x20,x5,x2,lsl #2            //x2->src_strd (in bytes)
+    neg         x8, x20
+    sub         x8,x8,x5                    //x8 = 4*src_strd - 2*wd (src rewind, 2 bytes/sample)
+    lsr         x3, x5, #2                  //wd / 4
+    mul         x7, x7, x3                  //x7 = ht * (wd / 4): total 4-pixel groups
+    sub         x7, x7,#4                   //subtract 4 (one pipeline stage) for epilog
+    mov         x4,x5                       //x5 ->wd
+    //lsl x2, x2, #1
+
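+// note: the trailing comments in this function are inherited from the
+// 8-bit variant and name the u8 intrinsics (vld1_u8/vmull_u8/...); the
+// data here is 16 bit, so the s16 forms (vld1_s16/vmull_s16/...) are
+// the actual equivalents.
+// the loop is software pipelined: 'prolog' primes the accumulators,
+// 'kernel_8' retires four 4-pixel result groups per pass while loading
+// ahead, and 'epilog'/'epilog_end' drain the pipeline. the add+csel
+// pairs emulate armv7 conditional execution (addle/movle): when the
+// width counter in x4 goes non-positive they wrap the pointers to the
+// next block of rows.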
+prolog:
+
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+    ld1         {v1.4h},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    ld1         {v0.4h},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    subs        x4,x4,#4
+    ld1         {v2.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    smull       v8.4s, v1.4h, v23.4h        //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+    ld1         {v3.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    smlal       v8.4s, v0.4h, v22.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+    ld1         {v4.4h},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    smlal       v8.4s, v2.4h, v24.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+    ld1         {v5.4h},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    smlal       v8.4s, v3.4h, v25.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+    ld1         {v6.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    smlal       v8.4s, v4.4h, v26.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+    ld1         {v7.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    smlal       v8.4s, v5.4h, v27.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+    smlal       v8.4s, v6.4h, v28.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+    smlal       v8.4s, v7.4h, v29.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+
+    ld1         {v16.4h},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
+
+    smull       v10.4s, v2.4h, v23.4h       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+    add         x20,x0,x8,lsl #0
+    csel        x0, x20, x0,le
+    smlal       v10.4s, v1.4h, v22.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+    csel        x4, x5, x4,le               //x5 ->wd
+    smlal       v10.4s, v3.4h, v24.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+    ld1         {v17.4h},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    smlal       v10.4s, v4.4h, v25.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+    ld1         {v18.4h},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    smlal       v10.4s, v5.4h, v26.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+    smlal       v10.4s, v6.4h, v27.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+    smlal       v10.4s, v7.4h, v28.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+    smlal       v10.4s, v16.4h, v29.4h      //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+    sqshrn      v8.4h, v8.4s,#6
+
+    ld1         {v1.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    smull       v12.4s, v3.4h, v23.4h
+    ld1         {v0.4h},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    smlal       v12.4s, v2.4h, v22.4h
+    ld1         {v2.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    smlal       v12.4s, v4.4h, v24.4h
+    smlal       v12.4s, v5.4h, v25.4h
+    smlal       v12.4s, v6.4h, v26.4h
+    smlal       v12.4s, v7.4h, v27.4h
+    smlal       v12.4s, v16.4h, v28.4h
+    smlal       v12.4s, v17.4h, v29.4h
+    add         x14,x1,x6
+    sqshrn      v10.4h, v10.4s,#6
+    sqrshrun    v8.8b, v8.8h,#6             //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    smull       v14.4s, v4.4h, v23.4h
+    smlal       v14.4s, v3.4h, v22.4h
+    smlal       v14.4s, v5.4h, v24.4h
+    smlal       v14.4s, v6.4h, v25.4h
+    ld1         {v3.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    smlal       v14.4s, v7.4h, v26.4h
+    ld1         {v4.4h},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    smlal       v14.4s, v16.4h, v27.4h
+    ld1         {v5.4h},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    smlal       v14.4s, v17.4h, v28.4h
+    ld1         {v6.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    smlal       v14.4s, v18.4h, v29.4h
+    ld1         {v7.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+
+    st1         {v8.s}[0],[x1],#4           //vst1_u8(pu1_dst,sto_res)//
+    sqshrn      v12.4h, v12.4s,#6
+    sqrshrun    v10.8b, v10.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
+    add         x20,x1,x9
+    csel        x1, x20, x1,le
+
+    subs        x7,x7,#4
+
+    blt         epilog_end                  //jumps to epilog_end
+    beq         epilog                      //jumps to epilog
+
+kernel_8:
+
+    smull       v8.4s, v1.4h, v23.4h        //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+    subs        x4,x4,#4
+    smlal       v8.4s, v0.4h, v22.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+    add         x20,x0,x8,lsl #0
+    csel        x0, x20, x0,le
+    smlal       v8.4s, v2.4h, v24.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+    smlal       v8.4s, v3.4h, v25.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+    smlal       v8.4s, v4.4h, v26.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+    smlal       v8.4s, v5.4h, v27.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+    smlal       v8.4s, v6.4h, v28.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+    smlal       v8.4s, v7.4h, v29.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+    st1         {v10.s}[0],[x14],x6         //vst1_u8(pu1_dst_tmp,sto_res)//
+
+    sqshrn      v14.4h, v14.4s,#6
+    sqrshrun    v12.8b, v12.8h,#6
+    ld1         {v16.4h},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
+
+    smull       v10.4s, v2.4h, v23.4h       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+    smlal       v10.4s, v1.4h, v22.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+    smlal       v10.4s, v3.4h, v24.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+    smlal       v10.4s, v4.4h, v25.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+    smlal       v10.4s, v5.4h, v26.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+    smlal       v10.4s, v6.4h, v27.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+    st1         {v12.s}[0],[x14],x6
+
+    smlal       v10.4s, v7.4h, v28.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+    ld1         {v17.4h},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
+
+    smlal       v10.4s, v16.4h, v29.4h      //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+    sqshrn      v8.4h, v8.4s,#6
+    sqrshrun    v14.8b, v14.8h,#6
+
+    smull       v12.4s, v3.4h, v23.4h
+    csel        x4, x5, x4,le               //x5 ->wd
+
+    smlal       v12.4s, v2.4h, v22.4h
+    ld1         {v18.4h},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
+
+    smlal       v12.4s, v4.4h, v24.4h
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+
+    smlal       v12.4s, v5.4h, v25.4h
+
+    smlal       v12.4s, v6.4h, v26.4h
+    st1         {v14.s}[0],[x14],x6
+
+    smlal       v12.4s, v7.4h, v27.4h
+    ld1         {v1.4h},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+
+    smlal       v12.4s, v16.4h, v28.4h
+    add         x14,x1,x6
+
+    smlal       v12.4s, v17.4h, v29.4h
+    ld1         {v0.4h},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+
+    sqshrn      v10.4h, v10.4s,#6
+    sqrshrun    v8.8b, v8.8h,#6             //sto_res = vqmovun_s16(sto_res_tmp)//
+    ld1         {v2.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+
+    smull       v14.4s, v4.4h, v23.4h
+    smlal       v14.4s, v3.4h, v22.4h
+    smlal       v14.4s, v5.4h, v24.4h
+    ld1         {v3.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+
+    smlal       v14.4s, v6.4h, v25.4h
+    ld1         {v4.4h},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    smlal       v14.4s, v7.4h, v26.4h
+    ld1         {v5.4h},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    smlal       v14.4s, v16.4h, v27.4h
+    ld1         {v6.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    smlal       v14.4s, v17.4h, v28.4h
+    ld1         {v7.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    smlal       v14.4s, v18.4h, v29.4h
+    st1         {v8.s}[0],[x1],#4           //vst1_u8(pu1_dst,sto_res)//
+
+    sqshrn      v12.4h, v12.4s,#6
+    add         x20,x1,x9
+    csel        x1, x20, x1,le
+
+    sqrshrun    v10.8b, v10.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
+    subs        x7,x7,#4
+
+    bgt         kernel_8                    //jumps to kernel_8
+
+epilog:
+
+    smull       v8.4s, v1.4h, v23.4h        //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+    smlal       v8.4s, v0.4h, v22.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+    smlal       v8.4s, v2.4h, v24.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+    smlal       v8.4s, v3.4h, v25.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+    smlal       v8.4s, v4.4h, v26.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+    smlal       v8.4s, v5.4h, v27.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+    smlal       v8.4s, v6.4h, v28.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+    smlal       v8.4s, v7.4h, v29.4h        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+    st1         {v10.s}[0],[x14],x6
+
+    sqshrn      v14.4h, v14.4s,#6
+    sqrshrun    v12.8b, v12.8h,#6
+
+    ld1         {v16.4h},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    smull       v10.4s, v2.4h, v23.4h       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+    smlal       v10.4s, v1.4h, v22.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+    smlal       v10.4s, v3.4h, v24.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+    smlal       v10.4s, v4.4h, v25.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+    smlal       v10.4s, v5.4h, v26.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+    smlal       v10.4s, v6.4h, v27.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+    smlal       v10.4s, v7.4h, v28.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+    smlal       v10.4s, v16.4h, v29.4h      //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+    st1         {v12.s}[0],[x14],x6
+
+    sqshrn      v8.4h, v8.4s,#6
+    sqrshrun    v14.8b, v14.8h,#6
+
+    ld1         {v17.4h},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    smull       v12.4s, v3.4h, v23.4h
+    smlal       v12.4s, v2.4h, v22.4h
+    smlal       v12.4s, v4.4h, v24.4h
+    smlal       v12.4s, v5.4h, v25.4h
+    smlal       v12.4s, v6.4h, v26.4h
+    smlal       v12.4s, v7.4h, v27.4h
+    smlal       v12.4s, v16.4h, v28.4h
+    smlal       v12.4s, v17.4h, v29.4h
+    st1         {v14.s}[0],[x14],x6
+    sqshrn      v10.4h, v10.4s,#6
+    sqrshrun    v8.8b, v8.8h,#6             //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    ld1         {v18.4h},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    smull       v14.4s, v4.4h, v23.4h
+    smlal       v14.4s, v3.4h, v22.4h
+    smlal       v14.4s, v5.4h, v24.4h
+    smlal       v14.4s, v6.4h, v25.4h
+    smlal       v14.4s, v7.4h, v26.4h
+    smlal       v14.4s, v16.4h, v27.4h
+    smlal       v14.4s, v17.4h, v28.4h
+    smlal       v14.4s, v18.4h, v29.4h
+    sqshrn      v12.4h, v12.4s,#6
+    sqrshrun    v10.8b, v10.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    add         x14,x1,x6
+    st1         {v8.s}[0],[x1],#4           //vst1_u8(pu1_dst,sto_res)//
+
+epilog_end:
+    st1         {v10.s}[0],[x14],x6         //vst1_u8(pu1_dst_tmp,sto_res)//
+    sqrshrun    v12.8b, v12.8h,#6
+
+    st1         {v12.s}[0],[x14],x6
+    sqshrn      v14.4h, v14.4s,#6
+    sqrshrun    v14.8b, v14.8h,#6
+
+    st1         {v14.s}[0],[x14],x6
+
+
+end_loops:
+
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
new file mode 100644
index 0000000..da316ae
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
@@ -0,0 +1,483 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//******************************************************************************
+//* //file
+//*  ihevc_inter_pred_filters_luma_vert_w16out.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//*  functions are coded using neon intrinsics and can be compiled
+//*  using rvct.
+//*
+//* //author
+//*  parthiban v
+//*
+//* //par list of functions:
+//*
+//*  - ihevc_inter_pred_luma_vert_w16out()
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+///* include reconstruction */
+
+
+//void ihevc_inter_pred_luma_vert_w16out(uword8 *pu1_src,
+//                                    word16 *pi2_dst,
+//                                    word32 src_strd,
+//                                    word32 dst_strd,
+//                                    word8 *pi1_coeff,
+//                                    word32 ht,
+//                                    word32 wd   )
+
+//**************variables vs registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pi2_dst
+//    x2 =>  src_strd
+//    x6 =>  dst_strd
+//    x12 => *pi1_coeff
+//    x5 =>  wd
+//    x3 =>  ht
+
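+// a minimal c sketch of the core computation, assuming the same 8-tap
+// window; the asm bakes the per-tap signs of the hevc luma filter into
+// umlal/umlsl on |coeff|, which is equivalent to the plain signed
+// multiply-accumulate below. the 16-bit intermediate is stored without
+// rounding, shifting or clipping (loop locals are illustrative):
+//
+// word32 row, col, tap, sum;
+// for(row = 0; row < ht; row++)
+//     for(col = 0; col < wd; col++)
+//     {
+//         sum = 0;
+//         for(tap = 0; tap < 8; tap++)
+//             sum += pu1_src[(row + tap - 3) * src_strd + col] * pi1_coeff[tap];
+//         pi2_dst[row * dst_strd + col] = (word16)sum;
+//     }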
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_vert_w16out_av8
+
+.type ihevc_inter_pred_luma_vert_w16out_av8, %function
+
+ihevc_inter_pred_luma_vert_w16out_av8:
+
+    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+    mov         x12,x15                     //load pi1_coeff
+    mov         x6,x3
+    mov         x5,x17                      //load wd
+    ld1         {v0.8b},[x12]               //coeff = vld1_s8(pi1_coeff)
+    sub         x12,x2,x2,lsl #2            //x12 = -3 * src_strd
+    abs         v0.8b, v0.8b                //coeffabs = vabs_s8(coeff)
+    add         x0,x0,x12                   //x0 = pu1_src - 3 * src_strd (3 rows above)
+    mov         x3,x16                      //load ht
+    subs        x7,x3,#0                    //x3->ht
+    //ble          end_loops_16out            //end loop jump
+    dup         v22.8b, v0.8b[0]            //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
+    cmp         x5,#8
+    dup         v23.8b, v0.8b[1]            //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
+    dup         v24.8b, v0.8b[2]            //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
+    dup         v25.8b, v0.8b[3]            //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
+    dup         v26.8b, v0.8b[4]            //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
+    dup         v27.8b, v0.8b[5]            //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
+    dup         v28.8b, v0.8b[6]            //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
+    dup         v29.8b, v0.8b[7]            //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
+    blt         core_loop_wd_4_16out        //core loop wd 4 jump
+    stp         x0,x1, [sp, #-16]!          //save pu1_src/pi2_dst for the wd & 7 tail
+
+    bic         x4,x5,#7                    //x5 ->wd
+    sub         x20,x4,x6,lsl #2            //x6->dst_strd    x5    ->wd
+    neg         x9, x20
+    sub         x20,x4,x2,lsl #2            //x2->src_strd
+    neg         x8, x20
+    lsl         x6, x6, #1                  //dst_strd in bytes (16-bit output)
+    lsr         x3, x5, #3                  //wd / 8
+    mul         x7, x7, x3                  //x7 = ht * (wd / 8): total 8-wide blocks
+    sub         x7, x7,#4                   //subtract 4 (one pipeline stage) for epilog
+
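+// same software-pipeline shape as the other vertical filters:
+// 'prolog_16out' primes four result rows, 'kernel_8_16out' retires four
+// per pass while loading ahead, and the epilog drains. stores in this
+// function are 16-bit intermediates (st1 {v.16b} of eight s16 lanes),
+// which is why the vqrshrun narrowing steps from the 8-bit variant are
+// commented out; trailing comments still name the u8 intrinsics of the
+// source c.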
+prolog_16out:
+
+    and         x10, x0, #31
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+
+    ld1         {v1.8b},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    ld1         {v0.8b},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    subs        x4,x4,#8
+    ld1         {v2.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umull       v8.8h, v1.8b, v23.8b        //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+    ld1         {v3.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    umlsl       v8.8h, v0.8b, v22.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+    ld1         {v4.8b},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlsl       v8.8h, v2.8b, v24.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+    ld1         {v5.8b},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlal       v8.8h, v3.8b, v25.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+    ld1         {v6.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlal       v8.8h, v4.8b, v26.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+    ld1         {v7.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    umlsl       v8.8h, v5.8b, v27.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+    ld1         {v16.8b},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlal       v8.8h, v6.8b, v28.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+    ld1         {v17.8b},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlsl       v8.8h, v7.8b, v29.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+
+
+    add         x20,x0,x8
+    csel        x0, x20, x0,le
+    umull       v10.8h, v2.8b, v23.8b       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+
+    bic         x20,x5,#7                   //x5 ->wd
+    csel        x4, x20, x4,le
+    umlsl       v10.8h, v1.8b, v22.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+
+    ld1         {v18.8b},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlsl       v10.8h, v3.8b, v24.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+
+    add         x20,x20,x3
+    prfm        PLDL1KEEP,[x20]
+    umlal       v10.8h, v4.8b, v25.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+    add         x20,x3, x2
+    prfm        PLDL1KEEP,[x20]
+    umlal       v10.8h, v5.8b, v26.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+    add         x20,x3, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]
+    umlsl       v10.8h, v6.8b, v27.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+    add         x3, x3, x2
+    umlal       v10.8h, v7.8b, v28.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+    add         x20,x3, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]
+    umlsl       v10.8h, v16.8b, v29.8b      //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+    umull       v12.8h, v3.8b, v23.8b
+    ld1         {v1.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlsl       v12.8h, v2.8b, v22.8b
+    ld1         {v0.8b},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlsl       v12.8h, v4.8b, v24.8b
+    ld1         {v2.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlal       v12.8h, v5.8b, v25.8b
+    umlal       v12.8h, v6.8b, v26.8b
+    umlsl       v12.8h, v7.8b, v27.8b
+    umlal       v12.8h, v16.8b, v28.8b
+    umlsl       v12.8h, v17.8b, v29.8b
+    add         x14,x1,x6
+    st1         {v8.16b},[x1],#16           //vst1_u8(pu1_dst,sto_res)//
+    //vqrshrun.s16 d10,q5,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
+    add         x20,x1,x9,lsl #1
+    csel        x1, x20, x1,le
+
+    umull       v14.8h, v4.8b, v23.8b
+    subs        x7,x7,#4
+    umlsl       v14.8h, v3.8b, v22.8b
+    umlsl       v14.8h, v5.8b, v24.8b
+    umlal       v14.8h, v6.8b, v25.8b
+    ld1         {v3.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    umlal       v14.8h, v7.8b, v26.8b
+    ld1         {v4.8b},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlsl       v14.8h, v16.8b, v27.8b
+    ld1         {v5.8b},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlal       v14.8h, v17.8b, v28.8b
+    ld1         {v6.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlsl       v14.8h, v18.8b, v29.8b
+    ld1         {v7.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+
+    st1         {v10.16b},[x14],x6          //vst1_u8(pu1_dst_tmp,sto_res)//
+    //vqrshrun.s16 d12,q6,#6
+
+
+    blt         epilog_end_16out
+    beq         epilog_16out                //jumps to epilog
+
+kernel_8_16out:
+
+    subs        x4,x4,#8
+    umull       v8.8h, v1.8b, v23.8b        //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+
+    add         x20,x0,x8
+    csel        x0, x20, x0,le
+    umlsl       v8.8h, v0.8b, v22.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+
+    ld1         {v16.8b},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlsl       v8.8h, v2.8b, v24.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+
+    ld1         {v17.8b},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlal       v8.8h, v3.8b, v25.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+
+    bic         x20,x5,#7                   //x5 ->wd
+    csel        x4, x20, x4,le
+    umlal       v8.8h, v4.8b, v26.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+
+    ld1         {v18.8b},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlsl       v8.8h, v5.8b, v27.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+
+    st1         {v12.16b},[x14],x6
+    umlal       v8.8h, v6.8b, v28.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+    umlsl       v8.8h, v7.8b, v29.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+
+
+//    and            x11, x0, #31
+    umull       v10.8h, v2.8b, v23.8b       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+
+    st1         {v14.16b},[x14],x6
+    umlsl       v10.8h, v1.8b, v22.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+
+    add         x14,x1,x6
+    umlsl       v10.8h, v3.8b, v24.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+
+    ld1         {v0.8b},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlal       v10.8h, v4.8b, v25.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+
+    ld1         {v1.8b},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlal       v10.8h, v5.8b, v26.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+
+    st1         {v8.16b},[x1],#16           //vst1_u8(pu1_dst,sto_res)//
+    umlsl       v10.8h, v6.8b, v27.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+
+    add         x20,x1,x9,lsl #1
+    csel        x1, x20, x1,le
+    umlal       v10.8h, v7.8b, v28.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+
+//    cmp            x11, x10
+    umlsl       v10.8h, v16.8b, v29.8b      //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+    add         x10, x3, x2, lsl #3         // 10*strd - 8+2
+    umull       v12.8h, v3.8b, v23.8b
+
+    add         x10, x10, x2                // 11*strd
+    umlsl       v12.8h, v2.8b, v22.8b
+
+    add         x20,x20,x10
+    prfm        PLDL1KEEP,[x20]             //11+ 0
+    umlsl       v12.8h, v4.8b, v24.8b
+
+    add         x20,x10, x2
+    prfm        PLDL1KEEP,[x20]             //11+ 1*strd
+    umlal       v12.8h, v5.8b, v25.8b
+
+    add         x20,x10, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]             //11+ 2*strd
+    umlal       v12.8h, v6.8b, v26.8b
+
+    add         x10, x10, x2                //12*strd
+    umlsl       v12.8h, v7.8b, v27.8b
+
+    add         x20,x10, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]             //11+ 3*strd
+    umlal       v12.8h, v16.8b, v28.8b
+
+//    mov            x10, x11
+    umlsl       v12.8h, v17.8b, v29.8b
+
+    ld1         {v2.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umull       v14.8h, v4.8b, v23.8b
+
+    subs        x7,x7,#4
+    umlsl       v14.8h, v3.8b, v22.8b
+
+    st1         {v10.16b},[x14],x6          //vst1_u8(pu1_dst_tmp,sto_res)//
+    umlsl       v14.8h, v5.8b, v24.8b
+
+    ld1         {v3.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    umlal       v14.8h, v6.8b, v25.8b
+
+    ld1         {v4.8b},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umlal       v14.8h, v7.8b, v26.8b
+
+    ld1         {v5.8b},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umlsl       v14.8h, v16.8b, v27.8b
+
+    ld1         {v6.8b},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umlal       v14.8h, v17.8b, v28.8b
+
+    ld1         {v7.8b},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
+    umlsl       v14.8h, v18.8b, v29.8b
+
+
+    bgt         kernel_8_16out              //jumps to kernel_8
+
+epilog_16out:
+
+    umull       v8.8h, v1.8b, v23.8b        //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+    umlsl       v8.8h, v0.8b, v22.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+    umlsl       v8.8h, v2.8b, v24.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+    umlal       v8.8h, v3.8b, v25.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+    umlal       v8.8h, v4.8b, v26.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+    umlsl       v8.8h, v5.8b, v27.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+    umlal       v8.8h, v6.8b, v28.8b        //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+    umlsl       v8.8h, v7.8b, v29.8b        //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+    st1         {v12.16b},[x14],x6
+
+    //vqrshrun.s16 d14,q7,#6
+
+    ld1         {v16.8b},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
+    umull       v10.8h, v2.8b, v23.8b       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+    umlsl       v10.8h, v1.8b, v22.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+    umlsl       v10.8h, v3.8b, v24.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+    umlal       v10.8h, v4.8b, v25.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+    umlal       v10.8h, v5.8b, v26.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+    umlsl       v10.8h, v6.8b, v27.8b       //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+    umlal       v10.8h, v7.8b, v28.8b       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+    umlsl       v10.8h, v16.8b, v29.8b      //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+    st1         {v14.16b},[x14],x6
+
+    //vqrshrun.s16 d8,q4,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    ld1         {v17.8b},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
+    umull       v12.8h, v3.8b, v23.8b
+    umlsl       v12.8h, v2.8b, v22.8b
+    umlsl       v12.8h, v4.8b, v24.8b
+    umlal       v12.8h, v5.8b, v25.8b
+    umlal       v12.8h, v6.8b, v26.8b
+    umlsl       v12.8h, v7.8b, v27.8b
+    umlal       v12.8h, v16.8b, v28.8b
+    umlsl       v12.8h, v17.8b, v29.8b
+    add         x14,x1,x6
+    st1         {v8.16b},[x1],#16           //vst1_u8(pu1_dst,sto_res)//
+    //vqrshrun.s16 d10,q5,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    ld1         {v18.8b},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
+    umull       v14.8h, v4.8b, v23.8b
+    umlsl       v14.8h, v3.8b, v22.8b
+    umlsl       v14.8h, v5.8b, v24.8b
+    umlal       v14.8h, v6.8b, v25.8b
+    umlal       v14.8h, v7.8b, v26.8b
+    umlsl       v14.8h, v16.8b, v27.8b
+    umlal       v14.8h, v17.8b, v28.8b
+    umlsl       v14.8h, v18.8b, v29.8b
+
+    st1         {v10.16b},[x14],x6          //vst1_u8(pu1_dst_tmp,sto_res)//
+    //vqrshrun.s16 d12,q6,#6
+
+epilog_end_16out:
+    st1         {v12.16b},[x14],x6
+    //vqrshrun.s16 d14,q7,#6
+
+    st1         {v14.16b},[x14],x6
+
+
+end_loops_16out:
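+    // the loop above handled wd in multiples of 8; if wd has a 4-column
+    // remainder (wd & 7), restore src/dst, step past the first eight
+    // columns and fall through to the 4-wide path below.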
+    tst         x5,#7
+    ldp         x0,x1, [sp], #16
+
+    // ldmeqfd sp!,{x4-x12,x15}    //reload the registers from sp
+    bne         lbl355
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+lbl355:
+    mov         x5, #4
+    add         x0, x0, #8
+    add         x1, x1, #16
+    mov         x7, #16
+    lsr         x6, x6, #1
+
+    //
+
+core_loop_wd_4_16out:
+    sub         x20,x5,x6,lsl #2            //x6->dst_strd    x5    ->wd
+    neg         x9, x20
+    sub         x20,x5,x2,lsl #2            //x2->src_strd
+    neg         x8, x20
+    movi        v4.8b, #0
+    lsl         x6, x6, #1
+
+outer_loop_wd_4_16out:
+    subs        x12,x5,#0
+    ble         end_inner_loop_wd_4_16out   //outer loop jump
+
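+// each pass builds a sliding window of rows in 32-bit lanes: dup copies
+// the newer row from lane 1 of the previous register, and the ld1 into
+// lane 1 appends the next row, so every d-register holds two adjacent
+// 4-pixel rows and one multiply filters two rows at once.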
+inner_loop_wd_4_16out:
+    add         x3,x0,x2
+    ld1         {v4.s}[1],[x3],x2           //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
+    subs        x12,x12,#4
+    dup         v5.2s, v4.2s[1]             //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+    ld1         {v5.s}[1],[x3],x2           //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
+    ld1         {v4.s}[0],[x0]              //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)//
+    umull       v0.8h, v5.8b, v23.8b        //mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)//
+
+    dup         v6.2s, v5.2s[1]             //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+    add         x0,x0,#4
+    ld1         {v6.s}[1],[x3],x2           //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
+    umlsl       v0.8h, v4.8b, v22.8b        //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)//
+
+    dup         v7.2s, v6.2s[1]             //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+    ld1         {v7.s}[1],[x3],x2           //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
+    umlsl       v0.8h, v6.8b, v24.8b        //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)//
+
+    umull       v8.8h, v7.8b, v23.8b
+    dup         v4.2s, v7.2s[1]             //src_tmp1 = vdup_lane_u32(src_tmp4, 1)//
+    umull       v2.8h, v7.8b, v25.8b        //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)//
+    ld1         {v4.s}[1],[x3],x2           //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
+    umlsl       v8.8h, v6.8b, v22.8b
+    umlal       v0.8h, v4.8b, v26.8b        //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)//
+
+    dup         v5.2s, v4.2s[1]             //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+    umlsl       v8.8h, v4.8b, v24.8b
+    ld1         {v5.s}[1],[x3],x2           //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
+    umlsl       v2.8h, v5.8b, v27.8b        //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)//
+
+    dup         v6.2s, v5.2s[1]             //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+    umlal       v8.8h, v5.8b, v25.8b
+    ld1         {v6.s}[1],[x3],x2           //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
+    umlal       v0.8h, v6.8b, v28.8b        //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)//
+
+    dup         v7.2s, v6.2s[1]             //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+    umlal       v8.8h, v6.8b, v26.8b
+    ld1         {v7.s}[1],[x3],x2           //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
+    umlsl       v2.8h, v7.8b, v29.8b        //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)//
+
+    dup         v4.2s, v7.2s[1]
+    add         v0.8h, v0.8h , v2.8h        //mul_res1 = vaddq_u16(mul_res1, mul_res2)//
+
+    umlsl       v8.8h, v7.8b, v27.8b
+    ld1         {v4.s}[1],[x3],x2
+    umlal       v8.8h, v4.8b, v28.8b
+    dup         v5.2s, v4.2s[1]
+    //vqrshrun.s16 d0,q0,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    ld1         {v5.s}[1],[x3]
+    add         x3,x1,x6
+    st1         {v0.d}[0],[x1],#8           //vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)//
+
+    umlsl       v8.8h, v5.8b, v29.8b
+    st1         {v0.d}[1],[x3],x6           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)//
+    //vqrshrun.s16 d8,q4,#6
+
+    st1         {v8.d}[0],[x3],x6
+    //add          x1,x1,#4
+    st1         {v8.d}[1],[x3]
+    bgt         inner_loop_wd_4_16out
+
+end_inner_loop_wd_4_16out:
+    subs        x7,x7,#4
+    add         x1,x1,x9,lsl #1
+    add         x0,x0,x8
+    bgt         outer_loop_wd_4_16out
+
+    // ldmfd sp!, {x4-x12, x15}    //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
diff --git a/common/arm64/ihevc_inter_pred_luma_copy.s b/common/arm64/ihevc_inter_pred_luma_copy.s
new file mode 100644
index 0000000..dccbb2b
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_luma_copy.s
@@ -0,0 +1,199 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*     interprediction luma function for copy
+//*
+//* //par description:
+//*   copies the array of width 'wd' and height 'ht' from the  location pointed
+//*   by 'src' to the location pointed by 'dst'
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_inter_pred_luma_copy (
+//                            uword8 *pu1_src,
+//                            uword8 *pu1_dst,
+//                            word32 src_strd,
+//                            word32 dst_strd,
+//                            word8 *pi1_coeff,
+//                            word32 ht,
+//                            word32 wd   )
+
+//**************variables vs registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pu1_dst
+//    x2 =>  src_strd
+//    x3 =>  dst_strd
+//    x11 =>  ht
+//    x16 => wd
+
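+// a minimal c sketch of the copy described above (typedefs are the
+// codec's own; loop locals are illustrative):
+//
+// word32 row, col;
+// for(row = 0; row < ht; row++)
+//     for(col = 0; col < wd; col++)
+//         pu1_dst[row * dst_strd + col] = pu1_src[row * src_strd + col];
+//
+// the assembly below unrolls this four rows at a time and picks a
+// 4-, 8- or 16-wide column path based on wd.
+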
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_copy_av8
+
+.type ihevc_inter_pred_luma_copy_av8, %function
+
+ihevc_inter_pred_luma_copy_av8:
+    // stmfd sp!, {x8-x16, lr}                //stack stores the values of the arguments
+    stp         x19,x20,[sp, #-16]!
+    mov         x16,x6                      //loads wd
+    mov         x11,x5                      //loads ht
+    cmp         x11,#0                      //checks ht == 0
+    ble         end_loops
+    tst         x16,#15                     //checks if wd is a multiple of 16
+    beq         core_loop_wd_16
+    tst         x16,#7                      //checks if wd is a multiple of 8
+    beq         core_loop_wd_8
+    sub         x15,x16,#4
+
+outer_loop_wd_4:
+    subs        x8,x16,#0                   //checks wd == 0
+    ble         end_inner_loop_wd_4
+
+inner_loop_wd_4:
+    ld1         {v0.s}[0],[x0]              //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add         x9,x0,x2                    //pu1_src_tmp += src_strd
+    add         x10,x1,x3                   //pu1_dst_tmp += dst_strd
+    st1         {v0.s}[0],[x1]              //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    ld1         {v0.s}[0],[x9],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add         x0,x0,#4                    //pu1_src += 4
+    st1         {v0.s}[0],[x10],x3          //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    ld1         {v0.s}[0],[x9],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    subs        x8,x8,#4                    //(wd -4)
+    st1         {v0.s}[0],[x10],x3          //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    ld1         {v0.s}[0],[x9],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add         x1,x1,#4                    //pu1_dst += 4
+    st1         {v0.s}[0],[x10],x3          //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+
+    bgt         inner_loop_wd_4
+
+end_inner_loop_wd_4:
+    subs        x11,x11,#4                  //ht - 4
+    sub         x0,x9,x15                   //pu1_src = pu1_src_tmp
+    sub         x1,x10,x15                  //pu1_dst = pu1_dst_tmp
+    bgt         outer_loop_wd_4
+
+end_loops:
+    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
+//  MRS x20,PMCCFILTR_EL0
+    sub         x0,x20,x19                  //leftover timing code; the function returns void, so x0 is unused
+    ldp         x19,x20,[sp],#16
+    ret
+
+
+core_loop_wd_8:
+    sub         x15,x16,#8
+
+outer_loop_wd_8:
+    subs        x8,x16,#0                   //checks wd
+    ble         end_inner_loop_wd_8
+
+inner_loop_wd_8:
+    add         x9,x0,x2                    //pu1_src_tmp += src_strd
+    ld1         {v0.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
+    add         x10,x1,x3                   //pu1_dst_tmp += dst_strd
+    st1         {v0.8b},[x1],#8             //vst1_u8(pu1_dst_tmp, tmp_src)
+    ld1         {v1.8b},[x9],x2             //vld1_u8(pu1_src_tmp)
+    st1         {v1.8b},[x10],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
+    subs        x8,x8,#8                    //wd - 8(loop condition)
+    ld1         {v2.8b},[x9],x2             //vld1_u8(pu1_src_tmp)
+    st1         {v2.8b},[x10],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
+    ld1         {v3.8b},[x9],x2             //vld1_u8(pu1_src_tmp)
+    st1         {v3.8b},[x10],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
+    bgt         inner_loop_wd_8
+
+end_inner_loop_wd_8:
+    subs        x11,x11,#4                  //ht -= 4
+    sub         x0,x9,x15                   //pu1_src = pu1_src_tmp
+    sub         x1,x10,x15                  //pu1_dst = pu1_dst_tmp
+    bgt         outer_loop_wd_8
+
+    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
+//  MRS x20,PMCCFILTR_EL0
+    sub         x0,x20,x19                  //leftover timing code; x0 is unused
+    ldp         x19,x20,[sp],#16
+    ret
+
+core_loop_wd_16:
+    sub         x15,x16,#16
+
+outer_loop_wd_16:
+    subs        x8,x16,#0                   //checks wd
+    ble         end_inner_loop_wd_16
+
+inner_loop_wd_16:
+    add         x9,x0,x2                    //pu1_src_tmp += src_strd
+    ld1         {v0.16b},[x0],#16           //vld1_u8(pu1_src_tmp)
+    add         x10,x1,x3                   //pu1_dst_tmp += dst_strd
+    st1         {v0.16b},[x1],#16           //vst1_u8(pu1_dst_tmp, tmp_src)
+    ld1         {v1.16b},[x9],x2            //vld1_u8(pu1_src_tmp)
+    st1         {v1.16b},[x10],x3           //vst1_u8(pu1_dst_tmp, tmp_src)
+    subs        x8,x8,#16                   //wd - 16 (loop condition)
+    ld1         {v2.16b},[x9],x2            //vld1_u8(pu1_src_tmp)
+    st1         {v2.16b},[x10],x3           //vst1_u8(pu1_dst_tmp, tmp_src)
+    ld1         {v3.16b},[x9],x2            //vld1_u8(pu1_src_tmp)
+    st1         {v3.16b},[x10],x3           //vst1_u8(pu1_dst_tmp, tmp_src)
+    bgt         inner_loop_wd_16
+
+end_inner_loop_wd_16:
+    subs        x11,x11,#4                  //ht -= 4
+    sub         x0,x9,x15                   //pu1_src = pu1_src_tmp
+    sub         x1,x10,x15                  //pu1_dst = pu1_dst_tmp
+    bgt         outer_loop_wd_16
+
+    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
+//  MRS x20,PMCCFILTR_EL0
+    sub         x0,x20,x19                  //leftover timing code; x0 is unused
+    ldp         x19,x20,[sp],#16
+    ret
+
diff --git a/common/arm64/ihevc_inter_pred_luma_copy_w16out.s b/common/arm64/ihevc_inter_pred_luma_copy_w16out.s
new file mode 100644
index 0000000..86ffdba
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_luma_copy_w16out.s
@@ -0,0 +1,272 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*     interprediction luma function for copy
+//*
+//* //par description:
+//*   copies the array of width 'wd' and height 'ht' from the  location pointed
+//*   by 'src' to the location pointed by 'dst'
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_copy_w16out (
+//                                uword8 *pu1_src,
+//                                word16 *pi2_dst,
+//                                word32 src_strd,
+//                                word32 dst_strd,
+//                                word8 *pi1_coeff,
+//                                word32 ht,
+//                                word32 wd   )
+
+//**************variables vs registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pi2_dst
+//    x2 =>  src_strd
+//    x3 =>  dst_strd
+//    x7 =>  ht
+//    x12 => wd
+
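+// a minimal c sketch: each 8-bit sample is widened and shifted left by
+// 6 (the shl #6 below) so the plain copy lands on the same 14-bit
+// intermediate scale as the w16out filter outputs (loop locals are
+// illustrative):
+//
+// word32 row, col;
+// for(row = 0; row < ht; row++)
+//     for(col = 0; col < wd; col++)
+//         pi2_dst[row * dst_strd + col] = (word16)(pu1_src[row * src_strd + col] << 6);
+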
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_copy_w16out_av8
+
+.type ihevc_inter_pred_luma_copy_w16out_av8, %function
+
+ihevc_inter_pred_luma_copy_w16out_av8:
+
+    // stmfd sp!, {x4-x12, x14}        //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+    mov         x12,x17                     //loads wd
+    mov         x7,x16                      //loads ht
+    cmp         x7,#0                       //ht condition(ht == 0)
+    ble         end_loops                   //loop
+    tst         x12,#7                      //conditional check for wd (multiples)
+    beq         core_loop_wd_8
+    sub         x11,x12,#4
+    lsl         x6, x3,#1
+    adds        x6, x6,#0
+
+outer_loop_wd_4:
+    subs        x4,x12,#0                   //wd conditional subtract
+    ble         end_inner_loop_wd_4
+
+inner_loop_wd_4:
+    ld1         {v0.8b},[x0]                //vld1_u8(pu1_src_tmp)
+    add         x5,x0,x2                    //pu1_src +src_strd
+    uxtl        v0.8h, v0.8b                //vmovl_u8(vld1_u8(pu1_src_tmp)
+    add         x10,x1,x6
+    subs        x4,x4,#4                    //wd - 4
+    shl         v0.2d, v0.2d,#6             //vshlq_n_s64(temp, 6)
+    ld1         {v22.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
+    add         x0,x0,#4                    //pu1_src += 4
+    st1         {v0.d}[0],[x1]              //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    add         x1,x1,#8
+    uxtl        v22.8h, v22.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    ld1         {v24.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
+    shl         v22.2d, v22.2d,#6           //vshlq_n_s64(temp, 6)
+    uxtl        v24.8h, v24.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    st1         {v22.d}[0],[x10],x6         //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    shl         v24.2d, v24.2d,#6           //vshlq_n_s64(temp, 6)
+    ld1         {v26.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
+    st1         {v24.d}[0],[x10],x6         //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    uxtl        v26.8h, v26.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    shl         v26.2d, v26.2d,#6           //vshlq_n_s64(temp, 6)
+    st1         {v26.d}[0],[x10],x6         //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+    bgt         inner_loop_wd_4
+
+end_inner_loop_wd_4:
+    subs        x7,x7,#4                    //ht - 4
+    sub         x0,x5,x11
+    sub         x1,x10,x11,lsl #1
+    bgt         outer_loop_wd_4
+
+end_loops:
+    // ldmfd sp!,{x4-x12,x15}        //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+
+    ret
+
+
+core_loop_wd_8:
+    //sub            x11,x12,#8
+    lsl         x5, x3,#1
+    adds        x5, x5,#0
+    sub         x20,x12,x3, lsl #2          // x11 = (dst_strd * 4) - width
+    neg         x11, x20
+    sub         x20,x12,x2,lsl #2           //x2->src_strd
+    neg         x8, x20
+    lsr         x4, x12, #3                 // divide by 8
+    mul         x7, x7, x4
+    sub         x4,x12,#0                   //wd conditional check
+    sub         x7,x7,#4                    //subtract 4 (one pipeline stage) for epilog
+
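+// wd >= 8 path, software pipelined: 'prolog' loads and widens four rows,
+// 'outer_loop_wd_8' stores the previous four while fetching the next,
+// and 'epilog'/'epilog_end' flush the tail.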
+prolog:
+    add         x6,x0,x2                    //pu1_src_tmp += src_strd
+    add         x10,x1,x5
+    ld1         {v8.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
+    ld1         {v10.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    ld1         {v12.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    ld1         {v14.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    uxtl        v16.8h, v8.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
+    uxtl        v18.8h, v10.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    uxtl        v20.8h, v12.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    uxtl        v22.8h, v14.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+    subs        x4,x4,#8                    //wd decrements by 8
+    shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
+    shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
+    shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)
+    shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)
+    add         x20,x0,x8
+    csel        x0, x20, x0,le
+    add         x6,x0,x2                    //pu1_src_tmp += src_strd
+    ld1         {v8.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
+    ld1         {v10.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    ld1         {v12.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    ld1         {v14.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+
+    st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
+    add         x20,x1,x11,lsl #1
+    csel        x1, x20, x1,le
+    sub         x20,x12,#0                  //wd conditional check
+    csel        x4, x20, x4,le
+
+    subs        x7,x7,#4                    //ht - 4
+
+    blt         epilog_end                  //jumps to epilog_end
+    beq         epilog                      //jumps to epilog
+
+
+
+outer_loop_wd_8:
+
+    st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v16.8h, v8.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v18.8h, v10.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v20.8h, v12.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    uxtl        v22.8h, v14.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    subs        x4,x4,#8                    //wd decrements by 8
+    add         x20,x0,x8
+    csel        x0, x20, x0,le
+
+    add         x6,x0,x2                    //pu1_src_tmp += src_strd
+
+    ld1         {v8.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
+    shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
+
+    ld1         {v10.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
+
+    ld1         {v12.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)
+
+    ld1         {v14.8b},[x6],x2            //vld1_u8(pu1_src_tmp)
+    add         x10,x1,x5
+
+    shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)
+
+    st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
+
+    add         x20,x1,x11,lsl #1
+    csel        x1, x20, x1,le
+    sub         x20,x12,#0                  //wd conditional check
+    csel        x4, x20, x4,le
+
+    subs        x7,x7,#4                    //ht - 4
+    bgt         outer_loop_wd_8
+
+epilog:
+    st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v16.8h, v8.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
+
+    st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v18.8h, v10.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    uxtl        v20.8h, v12.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+    uxtl        v22.8h, v14.8b              //vmovl_u8(vld1_u8(pu1_src_tmp))
+    //add          x6,x0,x2                //pu1_src_tmp += src_strd
+
+    shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
+    shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
+    shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)
+    add         x10,x1,x5
+    shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)
+
+    st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
+epilog_end:
+    st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+    st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
+
+
+    // ldmfd sp!,{x4-x12,x15}        //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_luma_horz_w16out.s b/common/arm64/ihevc_inter_pred_luma_horz_w16out.s
new file mode 100644
index 0000000..f7b6644
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_luma_horz_w16out.s
@@ -0,0 +1,678 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//******************************************************************************
+//* //file
+//*  ihevc_inter_pred_luma_horz_w16out.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//*  functions are coded using neon intrinsics and can be compiled using rvct.
+//*
+//* //author
+//*  parthiban v
+//*
+//* //par list of functions:
+//*
+//*  - ihevc_inter_pred_luma_horz_w16out()
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*   interprediction luma filter for horizontal 16bit output
+//*
+//* //par description:
+//*     applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+//*     to the elements pointed to by 'pu1_src' and writes to the location
+//*     pointed to by 'pi2_dst'. no downshifting or clipping is done, and the
+//*     output is used as an input for vertical filtering or weighted
+//*     prediction. assumptions: the function is optimized assuming the width
+//*     is a multiple of 4 or 8; if the width is a multiple of 4, the height
+//*     should be a multiple of 2. width 8 is optimized further.
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[out] pi2_dst
+//*  word16 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
+//                                word16 *pi2_dst,
+//                                word32 src_strd,
+//                                word32 dst_strd,
+//                                word8 *pi1_coeff,
+//                                word32 ht,
+//                                word32 wd)
+
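+//a rough scalar model of what the code below computes (a sketch for
+//reference only, not part of the build; the word16/word32 types and the
+//-3 source offset are taken from the asm and the comments above):
+//
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < wd; col++)
+//        {
+//            word32 sum = 0;
+//            for(i = 0; i < 8; i++)
+//                sum += pu1_src[row * src_strd + col + i - 3] * pi1_coeff[i];
+//            pi2_dst[row * dst_strd + col] = (word16)sum;  //no shift, no clip
+//        }
+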
+
+//x0 - free
+//x1 - dst_ptr
+//x2 - src_strd
+//x3 - dst_strd
+//x8 - src_ptx2
+//x9 - inner loop counter
+//x10 - dst_ptx2
+//x11 - free
+//x12 - dst_strd2
+//x13 - src_strd1
+//x14 - wd
+//x15 - #1
+//x16 - src_ptx1
+//x19 - loop_counter
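+//
+//control-flow note (inferred from the branches below): wd <= 4 is handled by
+//outer_loop_4, wd >= 16 by outer_loop_16, and other multiples of 8 by
+//outer_loop_8; wd == 24 is split as 16 + 8 and wd == 12 as 8 + 4 via the
+//*_residual blocks. an odd ht runs its last row through height_residue_4.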
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_horz_w16out_av8
+
+.type ihevc_inter_pred_luma_horz_w16out_av8, %function
+
+ihevc_inter_pred_luma_horz_w16out_av8:
+
+    // stmfd sp!, {x8-x16, x19}                //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    mov         x20,#1
+    bic         x19, x19, x20               // clearing bit[0] of x19 (leftover from the aarch32 version; x19 is rewritten below)
+    mov         x8,x4                       //loads pi1_coeff
+    mov         x11,x5                      //loads ht
+
+
+    ld1         {v0.8b},[x8]                //coeff = vld1_s8(pi1_coeff)
+    sub         x19,x11,#0                  //x19 = ht (the ht == 0 early exit below is disabled)
+    abs         v2.8b, v0.8b                //vabs_s8(coeff)
+    mov         x15,#1
+    //ble          end_loops
+    mov         x14,x6                      //loads wd
+    dup         v24.8b, v2.8b[0]            //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+    sub         x16,x0,#3                   //pu1_src - 3
+    dup         v25.8b, v2.8b[1]            //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+    add         x8,x16,x2                   //pu1_src_tmp2_8 = pu1_src + src_strd
+    dup         v26.8b, v2.8b[2]            //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+    sub         x20,x14,x2,lsl #1           //2*src_strd - wd
+    neg         x13, x20
+    dup         v27.8b, v2.8b[3]            //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+    sub         x20,x14,x3                  //dst_strd - wd
+    neg         x12, x20
+    dup         v28.8b, v2.8b[4]            //coeffabs_4 = vdup_lane_u8(coeffabs, 4)
+
+    dup         v29.8b, v2.8b[5]            //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
+    and         x11,x19,#1                  //calculating ht_residue ht_residue = (ht & 1)
+    dup         v30.8b, v2.8b[6]            //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
+    sub         x19,x19,x11                 //decrement height by ht_residue(residue value is calculated outside)
+    dup         v31.8b, v2.8b[7]            //coeffabs_7 = vdup_lane_u8(coeffabs, 7)
+
+    cmp         x11,#1
+    beq         odd_height_decision
+
+even_height_decision:
+    mov         x11,x1
+    cmp         x14,#4
+    ble         outer_loop_4
+
+    cmp         x14,#24
+    mov         x20,#16
+    csel        x14, x20, x14,eq
+    add         x20, x12,#8
+    csel        x12, x20, x12,eq
+    add         x20, x13,#8
+    csel        x13, x20, x13,eq
+
+    cmp         x14,#16
+    bge         outer_loop_16_branch
+
+    cmp         x14,#12
+    add         x20, x12,#4
+    csel        x12, x20, x12,eq
+    add         x20, x13,#4
+    csel        x13, x20, x13,eq
+outer_loop_8_branch:
+    b           outer_loop_8
+
+outer_loop_16_branch:
+    b           outer_loop_16
+
+
+odd_height_decision:
+    cmp         x14,#24
+    beq         outer_loop_8_branch
+    cmp         x14,#12
+    beq         outer_loop_4
+    b           even_height_decision
+
+outer_loop4_residual:
+    sub         x16,x0,#3                   //pu1_src - 3
+    mov         x1,x11
+    add         x1, x1,#16
+    mov         x14,#4
+    add         x16, x16,#8
+    mov         x19,#16
+    add         x12, x12,#4
+    add         x13, x13,#4
+
+outer_loop_4:
+    add         x10,x1,x3,lsl #1            //pu1_dst + dst_strd
+    add         x8,x16,x2                   //pu1_src + src_strd
+
+    subs        x9,x14,#0                   //checks wd
+    ble         end_inner_loop_4
+
+inner_loop_4:
+    mov         x15,#1
+    ld1         {v20.2s},[x16],x15          //vector load pu1_src
+    ld1         {v21.2s},[x16],x15
+    ld1         {v22.2s},[x8],x15           //vector load pu1_src + src_strd
+    ld1         {v23.2s},[x8],x15
+
+    zip1        v0.2s, v20.2s, v22.2s
+    zip2        v12.2s, v20.2s, v22.2s      //vector zip the i iteration and ii iteration into a single register
+    zip1        v1.2s, v21.2s, v23.2s
+    zip2        v13.2s, v21.2s, v23.2s
+
+    ld1         {v20.2s},[x16],x15
+    ld1         {v21.2s},[x16],x15
+    ld1         {v22.2s},[x8],x15
+    ld1         {v23.2s},[x8],x15
+
+    zip1        v2.2s, v20.2s, v22.2s
+    zip2        v14.2s, v20.2s, v22.2s
+    zip1        v3.2s, v21.2s, v23.2s
+    zip2        v15.2s, v21.2s, v23.2s
+
+    ld1         {v20.2s},[x16],x15
+    ld1         {v21.2s},[x16],x15
+    ld1         {v22.2s},[x8],x15
+    ld1         {v23.2s},[x8],x15
+
+    zip1        v4.2s, v20.2s, v22.2s
+    zip2        v16.2s, v20.2s, v22.2s
+    zip1        v5.2s, v21.2s, v23.2s
+    zip2        v17.2s, v21.2s, v23.2s
+
+    ld1         {v20.2s},[x16],x15
+    ld1         {v21.2s},[x16],x15
+    ld1         {v22.2s},[x8],x15
+    ld1         {v23.2s},[x8],x15
+
+    //add        x16,x16,#4                        //increment the input pointer
+    sub         x16,x16,#4
+    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
+    //vext.u8    d3,d0,d1,#3                        //vector extract of src[0_3]
+    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
+
+    //vext.u8    d5,d0,d1,#5                        //vector extract of src[0_5]
+    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
+    //vext.u8    d7,d0,d1,#7                        //vector extract of src[0_7]
+    //vext.u8    d1,d0,d1,#1                        //vector extract of src[0_1]
+    sub         x8,x8,#4
+    // add        x8,x8,#4                        //increment the input pointer
+    // vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
+    // vext.u8    d15,d12,d13,#3                    //vector extract of src[0_3]
+    // vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
+    // vext.u8    d17,d12,d13,#5                    //vector extract of src[0_5]
+    // vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
+    // vext.u8    d19,d12,d13,#7                    //vector extract of src[0_7]
+    //vext.u8    d13,d12,d13,#1                    //vector extract of src[0_1]
+
+
+
+
+
+
+    zip1        v6.2s, v20.2s, v22.2s
+    zip2        v18.2s, v20.2s, v22.2s
+    zip1        v7.2s, v21.2s, v23.2s
+    zip2        v19.2s, v21.2s, v23.2s
+
+    umull       v8.8h, v1.8b, v25.8b        //arithmetic operations for the i and ii iterations at the same time
+    umlsl       v8.8h, v0.8b, v24.8b
+    umlsl       v8.8h, v2.8b, v26.8b
+    umlal       v8.8h, v3.8b, v27.8b
+    umlal       v8.8h, v4.8b, v28.8b
+    umlsl       v8.8h, v5.8b, v29.8b
+    umlal       v8.8h, v6.8b, v30.8b
+    umlsl       v8.8h, v7.8b, v31.8b
+
+    // vqrshrun.s16 d8,q4,#6                        //narrow right shift and saturating the result
+    st1         {v8.d}[0],[x1],#8           //store the i iteration result from the lower half of the register
+    st1         {v8.d}[1],[x10],#8          //store the ii iteration result from the upper half of the register
+    subs        x9,x9,#4                    //decrement the wd by 4
+    bgt         inner_loop_4
+
+end_inner_loop_4:
+    subs        x19,x19,#2                  //decrement the ht by 2
+    add         x16,x16,x13                 //increment the input pointer 2*src_strd-wd
+    add         x1,x10,x12,lsl #1           //increment the output pointer 2*dst_strd-wd
+    bgt         outer_loop_4
+
+
+height_residue_4:
+
+    mov         x11,x5                      //loads ht
+    and         x11,x11,#1                  //calculating ht_residue ht_residue = (ht & 1)
+    cmp         x11,#0
+    //beq        end_loops
+    // ldmeqfd sp!,{x8-x16,pc}                  //reload the registers from sp
+    bne         lbl280
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+lbl280:
+
+outer_loop_height_residue_4:
+
+
+    subs        x9,x14,#0                   //checks wd
+    ble         end_inner_loop_height_residue_4
+
+inner_loop_height_residue_4:
+    mov         x15, #1
+    ld1         {v0.2s},[x16],x15           //vector load pu1_src
+    ld1         {v1.2s},[x16],x15
+
+
+
+
+
+
+    // vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
+    // vext.u8    d3,d0,d1,#3                        //vector extract of src[0_3]
+    // vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
+
+
+
+    //add        x16,x16,#4                        //increment the input pointer
+    // vext.u8    d5,d0,d1,#5                        //vector extract of src[0_5]
+    // vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
+    // vext.u8    d7,d0,d1,#7                        //vector extract of src[0_7]
+    // vext.u8    d1,d0,d1,#1                        //vector extract of src[0_1]
+    ld1         {v2.2s},[x16],x15
+    umull       v8.8h, v1.8b, v25.8b        //arithmetic operations for the i and ii iterations at the same time
+    ld1         {v3.2s},[x16],x15
+    umlsl       v8.8h, v0.8b, v24.8b
+    ld1         {v4.2s},[x16],x15
+    umlsl       v8.8h, v2.8b, v26.8b
+    ld1         {v5.2s},[x16],x15
+    umlal       v8.8h, v3.8b, v27.8b
+    ld1         {v6.2s},[x16],x15
+    umlal       v8.8h, v4.8b, v28.8b
+    ld1         {v7.2s},[x16],x15
+    umlsl       v8.8h, v5.8b, v29.8b
+    sub         x16,x16,#4
+    umlal       v8.8h, v6.8b, v30.8b
+    umlsl       v8.8h, v7.8b, v31.8b
+    subs        x9,x9,#4                    //decrement the wd by 4
+    st1         {v8.d}[0],[x1],#8
+    bgt         inner_loop_height_residue_4
+
+end_inner_loop_height_residue_4:
+    subs        x11,x11,#1                  //decrement the ht by 1
+    sub         x20,x14,x2
+    neg         x13, x20
+    add         x16,x16,x13                 //increment the input pointer src_strd-wd
+    add         x1,x1,x12                   //increment the output pointer dst_strd-wd
+    bgt         outer_loop_height_residue_4
+
+    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+outer_loop8_residual:
+    sub         x16,x0,#3                   //pu1_src - 3
+    mov         x1,x11
+    mov         x19,#32
+    add         x1, x1,#32
+    add         x16, x16,#16
+    mov         x14,#8
+    add         x12, x12,#8
+    add         x13, x13,#8
+
+outer_loop_8:
+
+    add         x10,x1,x3,lsl #1            //pu1_dst + dst_strd
+    add         x8,x16,x2                   //pu1_src + src_strd
+    subs        x9,x14,#0                   //checks wd
+
+    ble         end_inner_loop_8
+
+inner_loop_8:
+    mov         x15, #1
+    ld1         {v0.2s},[x16],x15           //vector load pu1_src
+    ld1         {v1.2s},[x16],x15
+    ld1         {v2.2s},[x16],x15
+    ld1         {v3.2s},[x16],x15
+
+
+
+
+
+    // vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
+    // vext.u8    d3,d0,d1,#3                        //vector extract of src[0_3]
+    // vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
+    // vext.u8    d5,d0,d1,#5                        //vector extract of src[0_5]
+    // vext.u8    d6,d0,d1,#6                        //vector extract of src [0_6]
+    // vext.u8    d7,d0,d1,#7                        //vector extract of src[0_7]
+    // vext.u8    d1,d0,d1,#1                        //vector extract of src[0_1]
+    // vext.u8    d14,d12,d13,#2
+
+    //vext.u8    d15,d12,d13,#3                    //vector extract of src[0_3]
+    // vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
+    // vext.u8    d17,d12,d13,#5                    //vector extract of src[0_5]
+    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
+    //vext.u8    d19,d12,d13,#7                    //vector extract of src[0_7]
+    //vext.u8    d13,d12,d13,#1                    //vector extract of src[0_1]
+    ld1         {v4.2s},[x16],x15
+    umull       v8.8h, v1.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    ld1         {v5.2s},[x16],x15
+    umlal       v8.8h, v3.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         {v6.2s},[x16],x15
+    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    ld1         {v7.2s},[x16],x15
+    umlsl       v8.8h, v2.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    ld1         {v12.2s},[x8],x15           //vector load pu1_src + src_strd
+    umlal       v8.8h, v4.8b, v28.8b        //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+    ld1         {v13.2s},[x8],x15
+    umlsl       v8.8h, v5.8b, v29.8b        //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+    ld1         {v14.2s},[x8],x15
+    umlal       v8.8h, v6.8b, v30.8b        //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+    ld1         {v15.2s},[x8],x15
+    umlsl       v8.8h, v7.8b, v31.8b        //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+    ld1         {v16.2s},[x8],x15           //vector load pu1_src + src_strd
+
+    umull       v10.8h, v15.8b, v27.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         {v17.2s},[x8],x15
+    umlsl       v10.8h, v14.8b, v26.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    ld1         {v18.2s},[x8],x15
+    umlal       v10.8h, v16.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+    ld1         {v19.2s},[x8],x15           //vector load pu1_src + src_strd
+    umlsl       v10.8h, v17.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+    // vqrshrun.s16     d20,q4,#6                        //right shift and saturating narrow result 1
+    umlal       v10.8h, v18.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+    umlsl       v10.8h, v19.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+    st1         {v8.8h},[x1],#16            //store the result pu1_dst
+    umlsl       v10.8h, v12.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    umlal       v10.8h, v13.8b, v25.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+
+    // vqrshrun.s16 d8,q5,#6                        //right shift and saturating narrow result 2
+    subs        x9,x9,#8                    //decrement the wd loop
+    st1         {v10.8h},[x10],#16          //store the result pu1_dst
+    cmp         x9,#4
+    bgt         inner_loop_8
+
+end_inner_loop_8:
+    subs        x19,x19,#2                  //decrement the ht loop
+    add         x16,x16,x13                 //increment the src pointer by 2*src_strd-wd
+    add         x1,x10,x12,lsl #1           //increment the dst pointer by 2*dst_strd-wd
+    bgt         outer_loop_8
+
+
+
+
+
+    mov         x14,x6                      //loads wd
+    cmp         x14,#12
+
+    beq         outer_loop4_residual
+
+    mov         x11,x5                      //loads ht
+    and         x11,x11,#1
+    cmp         x11,#1
+    beq         height_residue_4
+
+//end_loops
+
+    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+outer_loop_16:
+    mov         x15, #-7
+    stp         x0,x11,[sp,#-16]!
+    add         x10,x1,x3,lsl #1            //pu1_dst + dst_strd
+    add         x8,x16,x2                   //pu1_src + src_strd
+    and         x0, x16, #31
+    sub         x9,x14,#0                   //x9 = wd
+    //ble          end_loops1
+    add         x20,x16, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]
+    ld1         {v0.2s},[x16],#8            //vector load pu1_src
+    ld1         {v1.2s},[x16],x15           //vector load pu1_src
+    add         x20,x8, x2, lsl #1
+    prfm        PLDL1KEEP,[x20]
+    ld1         {v2.2s},[x16],#8
+    ld1         {v3.2s},[x16],x15
+    ld1         {v4.2s},[x16],#8
+    ld1         {v5.2s},[x16],x15
+    ld1         {v6.2s},[x16],#8
+    ld1         {v7.2s},[x16],x15
+    ld1         {v12.2s},[x16],#8
+    ld1         {v13.2s},[x16],x15
+    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    ld1         {v14.2s},[x16],#8
+    ld1         {v15.2s},[x16],x15
+    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         {v16.2s},[x16],#8
+    ld1         {v17.2s},[x16],x15
+    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    ld1         {v18.2s},[x16],#8
+    ld1         {v19.2s},[x16],x15
+    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+
+
+inner_loop_16:
+
+
+    subs        x9,x9,#16
+    umull       v20.8h, v3.8b, v25.8b
+
+    add         x16, x16,#8
+    umlsl       v20.8h, v1.8b, v24.8b
+
+    ld1         {v0.2s},[x8],#8             //vector load pu1_src
+    ld1         {v1.2s},[x8],x15            //vector load pu1_src
+    umlal       v20.8h, v7.8b, v27.8b
+
+    ld1         {v2.2s},[x8],#8
+    ld1         {v3.2s},[x8],x15
+    umlsl       v20.8h, v5.8b, v26.8b
+
+    ld1         {v4.2s},[x8],#8
+    ld1         {v5.2s},[x8],x15
+    umlal       v20.8h, v13.8b, v28.8b
+
+    ld1         {v6.2s},[x8],#8
+    ld1         {v7.2s},[x8],x15
+    umlal       v20.8h, v17.8b, v30.8b
+
+    ld1         {v12.2s},[x8],#8
+    ld1         {v13.2s},[x8],x15
+    umlsl       v20.8h, v15.8b, v29.8b
+
+    ld1         {v14.2s},[x8],#8
+    ld1         {v15.2s},[x8],x15
+    umlsl       v20.8h, v19.8b, v31.8b
+
+    ld1         {v16.2s},[x8],#8
+    ld1         {v17.2s},[x8],x15
+    umull       v10.8h, v2.8b, v25.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+    ld1         {v18.2s},[x8],#8
+    ld1         {v19.2s},[x8],x15
+    umlal       v10.8h, v6.8b, v27.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+    add         x8, x8,#8
+    umlsl       v10.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    add         x20,x16, x2, lsl #2
+    prfm        PLDL1KEEP,[x20]
+    add         x20,x8, x2, lsl #2
+    prfm        PLDL1KEEP,[x20]
+    st1         {v8.16b},[x1],#16           //store the result pu1_dst
+    umlsl       v10.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+    add         x20,x16,x13                 //increment the src pointer by 2*src_strd-wd
+    csel        x16, x20, x16,eq
+    umlal       v10.8h, v12.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+
+    add         x20,x16,x2                  //pu1_src + src_strd
+    csel        x8, x20, x8,eq
+    umlsl       v10.8h, v14.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+
+//    and            x11, x16, #31
+    umlal       v10.8h, v16.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+
+    sub         x20,x19,#2
+    csel        x19, x20, x19,eq
+    umlsl       v10.8h, v18.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+
+    //cmp            x11, x0
+    umull       v22.8h, v3.8b, v25.8b
+
+//    add x20,x16, x2, lsl #2
+    prfm        PLDL1KEEP,[x20]
+    umlsl       v22.8h, v1.8b, v24.8b
+
+    st1         {v20.8h},[x1],#16
+    umlal       v22.8h, v7.8b, v27.8b
+
+//    add x20,x8, x2, lsl #2
+    prfm        PLDL1KEEP,[x20]
+    umlsl       v22.8h, v5.8b, v26.8b
+
+//    mov            x0, x11
+    umlal       v22.8h, v13.8b, v28.8b
+
+    cmp         x19,#0
+    umlal       v22.8h, v17.8b, v30.8b
+
+    st1         {v10.8h},[x10],#16
+    umlsl       v22.8h, v15.8b, v29.8b
+
+    umlsl       v22.8h, v19.8b, v31.8b
+
+    beq         epilog_16
+
+    ld1         {v0.2s},[x16],#8            //vector load pu1_src
+    ld1         {v1.2s},[x16],x15           //vector load pu1_src
+    ld1         {v2.2s},[x16],#8
+    ld1         {v3.2s},[x16],x15
+    ld1         {v4.2s},[x16],#8
+    ld1         {v5.2s},[x16],x15
+    ld1         {v6.2s},[x16],#8
+    ld1         {v7.2s},[x16],x15
+    ld1         {v12.2s},[x16],#8
+    ld1         {v13.2s},[x16],x15
+    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+    ld1         {v14.2s},[x16],#8
+    ld1         {v15.2s},[x16],x15
+    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+    ld1         {v16.2s},[x16],#8
+    ld1         {v17.2s},[x16],x15
+    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+    ld1         {v18.2s},[x16],#8
+    ld1         {v19.2s},[x16],x15
+    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+    cmp         x9,#0
+    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+    mov         x20,x14
+    csel        x9, x20, x9,eq
+    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+    st1         {v22.16b},[x10],#16         //store the result pu1_dst
+    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+    add         x20,x10,x12,lsl #1
+    csel        x1, x20, x1,eq
+    add         x20,x1,x3,lsl #1            //pu1_dst + dst_strd
+    csel        x10, x20, x10,eq
+    b           inner_loop_16
+
+
+epilog_16:
+//    vqrshrun.s16 d11,q11,#6
+    st1         {v22.16b},[x10],#16         //store the result pu1_dst
+
+    ldp         x0,x11,[sp],#16
+    mov         x14,x6
+    cmp         x14,#24
+    beq         outer_loop8_residual
+    add         x1,x10,x12,lsl #1
+    mov         x11,x5                      //loads ht
+    and         x11,x11,#1
+    cmp         x11,#1
+    beq         height_residue_4
+
+end_loops1:
+
+    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
new file mode 100644
index 0000000..b94ec3c
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
@@ -0,0 +1,418 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//******************************************************************************
+//* //file
+//*  ihevc_inter_pred_luma_vert_w16inp_w16out.s
+//*
+//* //brief
+//*  contains function definitions for inter prediction interpolation.
+//*  functions are coded using neon intrinsics and can be compiled using rvct.
+//*
+//* //author
+//*  yogeswaran rs
+//*
+//* //par list of functions:
+//*
+//*  - ihevc_inter_pred_luma_vert_w16inp_w16out()
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+///* include reconstruction */
+//
+
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*    luma vertical filter for 16bit input and 16bit output.
+//*
+//* //par description:
+//*     applies a vertical filter with coefficients pointed to by 'pi1_coeff'
+//*     to the elements pointed to by 'pi2_src' and writes to the location
+//*     pointed to by 'pi2_dst'. input is 16 bits. the filter output is
+//*     offset-corrected and downshifted by 6, and the 16-bit result is
+//*     stored without clipping. assumptions: the function is optimized
+//*     assuming the width is a multiple of 4 and the height a multiple of 2.
+//*
+//* //param[in] pi2_src
+//*  word16 pointer to the source
+//*
+//* //param[out] pi2_dst
+//*  word16 pointer to the destination
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] dst_strd
+//*  integer destination stride
+//*
+//* //param[in] pi1_coeff
+//*  word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_vert_w16inp_w16out(word16 *pi2_src,
+//                                    word16 *pi2_dst,
+//                                    word32 src_strd,
+//                                    word32 dst_strd,
+//                                    word8 *pi1_coeff,
+//                                    word32 ht,
+//                                    word32 wd)
+//**************variables vs registers*****************************************
+//  x0 => *pi2_src
+//  x1 => *pi2_dst
+//  x2 =>  src_strd
+//  x3 =>  dst_strd
+//  x4 => *pi1_coeff
+//  x5 =>  ht
+//  x6 =>  wd
+
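+//a rough scalar model of what the code below computes (a sketch for
+//reference only, not part of the build; the (8 << 16) offset and the
+//shift by 6 are taken from the asm below):
+//
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < wd; col++)
+//        {
+//            word32 sum = 0;
+//            for(i = 0; i < 8; i++)
+//                sum += pi2_src[(row + i - 3) * src_strd + col] * pi1_coeff[i];
+//            pi2_dst[row * dst_strd + col] = (word16)((sum - (8 << 16)) >> 6);
+//        }
+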
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_vert_w16inp_w16out_av8
+
+.type ihevc_inter_pred_luma_vert_w16inp_w16out_av8, %function
+
+ihevc_inter_pred_luma_vert_w16inp_w16out_av8:
+
+    //stmfd     sp!, {r4-r12, r14}  //stack stores the values of the arguments
+    push_v_regs
+    stp         x19,x20,[sp, #-16]!
+
+    mov         x15,x4 // pi1_coeff
+    mov         x16,x5 // ht
+    mov         x17,x6 // wd
+
+
+    mov         x12,x15                     //load pi1_coeff
+    lsl         x6,x3,#1
+    mov         x5,x17                      //load wd
+    ld1         {v0.8b},[x12]               //coeff = ld1_s8(pi1_coeff)
+    lsl         x2, x2,#1
+    sub         x12,x2,x2,lsl #2            //x12 = -3*src_strd
+    //vabs.s8   d0,d0               //vabs_s8(coeff)
+    add         x0,x0,x12                   //pi2_src -= 3*src_strd, to point at the first filter tap row
+    mov         x3,x16                      //load ht
+    subs        x7,x3,#0                    //r3->ht
+    //ble       end_loops           //end loop jump
+    sxtl        v0.8h,v0.8b
+    dup         v22.4h,v0.h[0]              //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
+    dup         v23.4h,v0.h[1]              //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
+    dup         v24.4h,v0.h[2]              //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
+    dup         v25.4h,v0.h[3]              //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
+    dup         v26.4h,v0.h[4]              //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
+    dup         v27.4h,v0.h[5]              //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
+    dup         v28.4h,v0.h[6]              //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
+    dup         v29.4h,v0.h[7]              //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
+    movi        v30.4s,#8, lsl #16          //offset constant: 8 << 16
+
+    sub         x9,x5,x6,lsl #2             //x6 = 2*dst_strd, x5 = wd
+    neg         x9,x9
+    sub         x8,x5,x2,lsl #2             //x2 = 2*src_strd
+    neg         x8,x8
+    sub         x8,x8,x5
+    sub         x9,x9,x5
+    lsr         x3, x5, #2                  //wd / 4
+    mul         x7, x7, x3                  //loop count = ht * (wd / 4)
+    sub         x7, x7, #4                  //subtract one iteration for the epilog
+    mov         x4,x5                       //x4 = wd
+    //mov           r2, r2, lsl #1
+
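+//the loop below is software pipelined: 'prolog' primes the first sets of
+//accumulators, 'kernel_8' overlaps the loads of the next iteration with the
+//multiply-accumulates and stores of the current one, and 'epilog' /
+//'epilog_end' drain the accumulators still in flight (the loop count was
+//reduced by one iteration above to leave work for the epilog).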
+prolog:
+
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+    ld1         {v1.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
+    ld1         {v0.4h},[x0], #8            //src_tmp1 = ld1_u8(pu1_src_tmp)//
+    subs        x4,x4,#4
+    ld1         {v2.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
+    smull       v8.4s,v1.4h,v23.4h          //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+    ld1         {v3.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
+    smlal       v8.4s,v0.4h,v22.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+    ld1         {v4.4h},[x3],x2             //src_tmp1 = ld1_u8(pu1_src_tmp)//
+    smlal       v8.4s,v2.4h,v24.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+    ld1         {v5.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
+    smlal       v8.4s,v3.4h,v25.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+    ld1         {v6.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
+    smlal       v8.4s,v4.4h,v26.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+    ld1         {v7.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
+    smlal       v8.4s,v5.4h,v27.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+    smlal       v8.4s,v6.4h,v28.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+    smlal       v8.4s,v7.4h,v29.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+
+    ld1         {v16.4h},[x3],x2            //src_tmp1 = ld1_u8(pu1_src_tmp)//
+
+    smull       v10.4s,v2.4h,v23.4h         //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+    add         x20,x0,x8,lsl #0
+    csel        x0,x20,x0,le
+    smlal       v10.4s,v1.4h,v22.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+    csel        x4,x5,x4,le
+    smlal       v10.4s,v3.4h,v24.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+    ld1         {v17.4h},[x3],x2            //src_tmp2 = ld1_u8(pu1_src_tmp)//
+    smlal       v10.4s,v4.4h,v25.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+    ld1         {v18.4h},[x3],x2            //src_tmp3 = ld1_u8(pu1_src_tmp)//
+    smlal       v10.4s,v5.4h,v26.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+    smlal       v10.4s,v6.4h,v27.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+    smlal       v10.4s,v7.4h,v28.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+    smlal       v10.4s,v16.4h,v29.4h        //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+    sub         v8.4s, v8.4s, v30.4s
+
+    ld1         {v1.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
+    smull       v12.4s,v3.4h,v23.4h
+    ld1         {v0.4h},[x0],#8             //src_tmp1 = ld1_u8(pu1_src_tmp)//
+    smlal       v12.4s,v2.4h,v22.4h
+    ld1         {v2.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
+    smlal       v12.4s,v4.4h,v24.4h
+    smlal       v12.4s,v5.4h,v25.4h
+    smlal       v12.4s,v6.4h,v26.4h
+    smlal       v12.4s,v7.4h,v27.4h
+    smlal       v12.4s,v16.4h,v28.4h
+    smlal       v12.4s,v17.4h,v29.4h
+    add         x14,x1,x6
+    sub         v10.4s, v10.4s, v30.4s
+    shrn        v8.4h, v8.4s, #6
+    //vqrshrun d8,q4,#6         //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    smull       v14.4s,v4.4h,v23.4h
+    smlal       v14.4s,v3.4h,v22.4h
+    smlal       v14.4s,v5.4h,v24.4h
+    smlal       v14.4s,v6.4h,v25.4h
+    ld1         {v3.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
+    smlal       v14.4s,v7.4h,v26.4h
+    ld1         {v4.4h},[x3],x2             //src_tmp1 = ld1_u8(pu1_src_tmp)//
+    smlal       v14.4s,v16.4h,v27.4h
+    ld1         {v5.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
+    smlal       v14.4s,v17.4h,v28.4h
+    ld1         {v6.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
+    smlal       v14.4s,v18.4h,v29.4h
+    ld1         {v7.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
+
+    st1         {v8.2s},[x1],#8             //st1_u8(pu1_dst,sto_res)//
+    sub         v12.4s, v12.4s, v30.4s
+    shrn        v10.4h, v10.4s, #6
+    //vqrshrun d10,q5,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
+    add         x20, x1, x9
+    csel        x1, x20, x1, le
+
+    subs        x7,x7,#4
+
+
+    blt         epilog_end                  //jumps to epilog_end
+    beq         epilog                      //jumps to epilog
+
+kernel_8:
+
+    smull       v8.4s,v1.4h,v23.4h          //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+    subs        x4,x4,#4
+    smlal       v8.4s,v0.4h,v22.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+    add         x20,x0,x8,lsl #0
+    csel        x0,x20,x0,le
+    smlal       v8.4s,v2.4h,v24.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+    smlal       v8.4s,v3.4h,v25.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+    smlal       v8.4s,v4.4h,v26.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+    smlal       v8.4s,v5.4h,v27.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+    smlal       v8.4s,v6.4h,v28.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+    smlal       v8.4s,v7.4h,v29.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+    st1         {v10.2s},[x14],x6           //st1_u8(pu1_dst_tmp,sto_res)//
+
+    sub         v14.4s, v14.4s, v30.4s
+    shrn        v12.4h, v12.4s, #6
+    //vqrshrun d12,q6,#6
+    ld1         {v16.4h},[x3],x2            //src_tmp1 = ld1_u8(pu1_src_tmp)//
+
+    smull       v10.4s,v2.4h,v23.4h         //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+    smlal       v10.4s,v1.4h,v22.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+    smlal       v10.4s,v3.4h,v24.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+    smlal       v10.4s,v4.4h,v25.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+    smlal       v10.4s,v5.4h,v26.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+    smlal       v10.4s,v6.4h,v27.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+    st1         {v12.2s},[x14],x6
+
+    smlal       v10.4s,v7.4h,v28.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+    ld1         {v17.4h},[x3],x2            //src_tmp2 = ld1_u8(pu1_src_tmp)//
+
+    smlal       v10.4s,v16.4h,v29.4h        //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+    sub         v8.4s, v8.4s, v30.4s
+    shrn        v14.4h, v14.4s, #6
+    //vqrshrun d14,q7,#6
+
+    smull       v12.4s,v3.4h,v23.4h
+    csel        x4,x5,x4,le
+
+    smlal       v12.4s,v2.4h,v22.4h
+    ld1         {v18.4h},[x3],x2            //src_tmp3 = ld1_u8(pu1_src_tmp)//
+
+    smlal       v12.4s,v4.4h,v24.4h
+    add         x3,x0,x2                    //pu1_src_tmp += src_strd//
+
+    smlal       v12.4s,v5.4h,v25.4h
+
+    smlal       v12.4s,v6.4h,v26.4h
+    st1         {v14.2s},[x14],x6
+
+    smlal       v12.4s,v7.4h,v27.4h
+    ld1         {v1.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
+
+    smlal       v12.4s,v16.4h,v28.4h
+    add         x14,x1,x6
+
+    smlal       v12.4s,v17.4h,v29.4h
+    ld1         {v0.4h},[x0],#8             //src_tmp1 = ld1_u8(pu1_src_tmp)//
+
+    sub         v10.4s, v10.4s, v30.4s
+    shrn        v8.4h, v8.4s, #6
+    //vqrshrun d8,q4,#6         //sto_res = vqmovun_s16(sto_res_tmp)//
+    ld1         {v2.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
+
+    smull       v14.4s,v4.4h,v23.4h
+    smlal       v14.4s,v3.4h,v22.4h
+    smlal       v14.4s,v5.4h,v24.4h
+    ld1         {v3.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
+
+    smlal       v14.4s,v6.4h,v25.4h
+    ld1         {v4.4h},[x3],x2             //src_tmp1 = ld1_u8(pu1_src_tmp)//
+    smlal       v14.4s,v7.4h,v26.4h
+    ld1         {v5.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
+    smlal       v14.4s,v16.4h,v27.4h
+    ld1         {v6.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
+    smlal       v14.4s,v17.4h,v28.4h
+    ld1         {v7.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
+    smlal       v14.4s,v18.4h,v29.4h
+    st1         {v8.2s},[x1],#8             //st1_u8(pu1_dst,sto_res)//
+
+    sub         v12.4s, v12.4s, v30.4s
+    shrn        v10.4h, v10.4s, #6
+    add         x20, x1, x9
+    csel        x1, x20, x1, le
+
+    //vqrshrun d10,q5,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
+    subs        x7,x7,#4
+
+    bgt         kernel_8                    //jumps to kernel_8
+
+epilog:
+
+    smull       v8.4s,v1.4h,v23.4h          //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+    smlal       v8.4s,v0.4h,v22.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+    smlal       v8.4s,v2.4h,v24.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+    smlal       v8.4s,v3.4h,v25.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+    smlal       v8.4s,v4.4h,v26.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+    smlal       v8.4s,v5.4h,v27.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+    smlal       v8.4s,v6.4h,v28.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+    smlal       v8.4s,v7.4h,v29.4h          //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+    st1         {v10.2s},[x14],x6
+
+    sub         v14.4s, v14.4s, v30.4s
+    shrn        v12.4h, v12.4s, #6
+    //vqrshrun d12,q6,#6
+
+    ld1         {v16.4h},[x3],x2            //src_tmp1 = ld1_u8(pu1_src_tmp)//
+    smull       v10.4s,v2.4h,v23.4h         //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+    smlal       v10.4s,v1.4h,v22.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+    smlal       v10.4s,v3.4h,v24.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+    smlal       v10.4s,v4.4h,v25.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+    smlal       v10.4s,v5.4h,v26.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+    smlal       v10.4s,v6.4h,v27.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+    smlal       v10.4s,v7.4h,v28.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+    smlal       v10.4s,v16.4h,v29.4h        //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+    st1         {v12.2s},[x14],x6
+
+    sub         v8.4s, v8.4s, v30.4s
+    shrn        v14.4h, v14.4s, #6
+    //vqrshrun d14,q7,#6
+
+    ld1         {v17.4h},[x3],x2            //src_tmp2 = ld1_u8(pu1_src_tmp)//
+    smull       v12.4s,v3.4h,v23.4h
+    smlal       v12.4s,v2.4h,v22.4h
+    smlal       v12.4s,v4.4h,v24.4h
+    smlal       v12.4s,v5.4h,v25.4h
+    smlal       v12.4s,v6.4h,v26.4h
+    smlal       v12.4s,v7.4h,v27.4h
+    smlal       v12.4s,v16.4h,v28.4h
+    smlal       v12.4s,v17.4h,v29.4h
+    st1         {v14.2s},[x14],x6
+    sub         v10.4s, v10.4s, v30.4s
+    shrn        v8.4h, v8.4s, #6
+    //vqrshrun d8,q4,#6         //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    ld1         {v18.4h},[x3],x2            //src_tmp3 = ld1_u8(pu1_src_tmp)//
+    smull       v14.4s,v4.4h,v23.4h
+    smlal       v14.4s,v3.4h,v22.4h
+    smlal       v14.4s,v5.4h,v24.4h
+    smlal       v14.4s,v6.4h,v25.4h
+    smlal       v14.4s,v7.4h,v26.4h
+    smlal       v14.4s,v16.4h,v27.4h
+    smlal       v14.4s,v17.4h,v28.4h
+    smlal       v14.4s,v18.4h,v29.4h
+    sub         v12.4s, v12.4s, v30.4s
+    shrn        v10.4h, v10.4s, #6
+    //vqrshrun d10,q5,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
+
+    add         x14,x1,x6
+    st1         {v8.2s},[x1],#8             //st1_u8(pu1_dst,sto_res)//
+
+epilog_end:
+    st1         {v10.2s},[x14],x6           //st1_u8(pu1_dst_tmp,sto_res)//
+    shrn        v12.4h, v12.4s, #6
+    //vqrshrun d12,q6,#6
+
+    st1         {v12.2s},[x14],x6
+    sub         v14.4s, v14.4s, v30.4s
+    shrn        v14.4h, v14.4s, #6
+    //vqrshrun d14,q7,#6
+
+    st1         {v14.2s},[x14],x6
+
+
+end_loops:
+
+    //ldmfd     sp!,{r4-r12,r15}            //reload the registers from sp
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_dc.s b/common/arm64/ihevc_intra_pred_chroma_dc.s
new file mode 100644
index 0000000..2fdee98
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_dc.s
@@ -0,0 +1,300 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_chroma_dc.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction dc filtering.
+//* functions are coded using neon intrinsics and can be compiled using rvct.
+//*
+//* @author
+//*  yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    chroma intraprediction filter for dc input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
+//                                word32 src_strd,
+//                                uword8 *pu1_dst,
+//                                word32 dst_strd,
+//                                word32 nt,
+//                                word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//x4 => nt
+//x5 => mode
+
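+//a rough scalar model of the dc value computed below (a sketch for
+//reference only, not part of the build; pu1_ref holds interleaved cb/cr
+//pairs, and the corner pair at pair index 2*nt is skipped as in the asm):
+//
+//    word32 sum_cb = 0, sum_cr = 0;
+//    for(i = 0; i < nt; i++)    //nt left pairs, then nt top pairs
+//    {
+//        sum_cb += pu1_ref[2 * (nt + i)]     + pu1_ref[2 * (2 * nt + 1 + i)];
+//        sum_cr += pu1_ref[2 * (nt + i) + 1] + pu1_ref[2 * (2 * nt + 1 + i) + 1];
+//    }
+//    dc_cb = (sum_cb + nt) >> (log2(nt) + 1);
+//    dc_cr = (sum_cr + nt) >> (log2(nt) + 1);
+//    //every pixel of the nt x nt block is then set to the (dc_cb, dc_cr) pair
+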
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_chroma_dc_av8
+
+.type ihevc_intra_pred_chroma_dc_av8, %function
+
+ihevc_intra_pred_chroma_dc_av8:
+
+    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x9, #0
+    mov         v17.s[0], w9
+    mov         v17.s[1], w9
+
+    clz         w5,w4                       //counts leading zeros
+
+    add         x6, x0, x4,lsl #1           //&src[2nt]
+    mov         v18.s[0], w9
+    mov         v18.s[1], w9
+    sub         x20, x5, #32                //x20 = clz(nt) - 32
+    neg         x5, x20                     //x5 = 32 - clz(nt) = log2(2*nt)
+    add         x7, x0, x4, lsl #2          //&src[4nt]
+    mov         x12,x5
+    add         x8, x7, #2                  //&src[4nt+2]
+
+    cmp         x4, #4
+    beq         dc_4                        //nt=4 loop
+
+
+add_loop:
+    ld2         {v30.8b, v31.8b}, [x6], #16 //load from src[nt]
+    lsl         x10,x4,#1                   //2nt
+
+    uaddlp      v2.4h,  v30.8b
+    subs        x10, x10,#0x10
+
+    ld2         {v26.8b, v27.8b}, [x8],#16  //load from src[2nt+1]
+
+    uaddlp      v3.4h,  v31.8b
+    uaddlp      v2.2s,  v2.4h
+    uaddlp      v3.2s,  v3.4h
+
+    uadalp      v17.1d,  v2.2s
+
+    uadalp      v18.1d,  v3.2s
+
+    uaddlp      v2.4h,  v26.8b
+    uaddlp      v3.4h,  v27.8b
+
+    uaddlp      v2.2s,  v2.4h
+    uaddlp      v3.2s,  v3.4h
+
+    uadalp      v17.1d,  v2.2s
+    uadalp      v18.1d,  v3.2s
+
+    beq         epil_add_loop
+
+core_loop_add:
+    ld2         {v30.8b, v31.8b}, [x6],#16  //load from src[nt]
+    uaddlp      v28.4h,  v30.8b
+    uaddlp      v3.4h,  v31.8b
+
+    ld2         {v26.8b, v27.8b}, [x8],#16  //load from src[2nt+1]
+
+    uaddlp      v3.2s,  v3.4h
+    uaddlp      v29.2s,  v28.4h
+
+    uadalp      v18.1d,  v3.2s
+    uadalp      v17.1d,  v29.2s
+
+    uaddlp      v3.4h,  v27.8b
+    uaddlp      v28.4h,  v26.8b
+
+    uaddlp      v3.2s,  v3.4h
+    uaddlp      v29.2s,  v28.4h
+
+    uadalp      v18.1d,  v3.2s
+    uadalp      v17.1d,  v29.2s
+
+
+epil_add_loop:
+
+    smov        x1, v18.2s[0]
+    smov        x11, v17.2s[0]
+
+    add         x1,x1,x4
+    add         x11,x11,x4
+
+    lsr         x1,x1,x12
+    lsr         x11,x11,x12
+
+    dup         v17.8b,w1
+    dup         v16.8b,w11
+
+prologue_cpy_32:
+
+    add         x5, x2, x3
+    subs        x9, x4, #8
+    lsl         x6, x3, #2
+    csel        x11, x6, x11,eq
+    add         x8, x5, x3
+    add         x10, x8, x3
+
+    beq         epilogue_copy
+
+    st2         {v16.8b, v17.8b}, [x2],#16
+    add         x6, x6, #-16
+
+    st2         {v16.8b, v17.8b}, [x5],#16
+    st2         {v16.8b, v17.8b}, [x8],#16
+    mov         x20,#16
+    csel        x11, x20, x11,ne
+    st2         {v16.8b, v17.8b}, [x10],#16
+
+
+    st2         {v16.8b, v17.8b}, [x2], x6
+    st2         {v16.8b, v17.8b}, [x5], x6
+    st2         {v16.8b, v17.8b}, [x8], x6
+    st2         {v16.8b, v17.8b}, [x10], x6
+
+kernel_copy:
+    st2         {v16.8b, v17.8b}, [x2],#16
+    st2         {v16.8b, v17.8b}, [x5],#16
+    st2         {v16.8b, v17.8b}, [x8],#16
+    st2         {v16.8b, v17.8b}, [x10],#16
+
+    st2         {v16.8b, v17.8b}, [x2], x6
+    st2         {v16.8b, v17.8b}, [x5], x6
+    st2         {v16.8b, v17.8b}, [x8], x6
+    st2         {v16.8b, v17.8b}, [x10], x6
+
+    st2         {v16.8b, v17.8b}, [x2],#16
+    st2         {v16.8b, v17.8b}, [x5],#16
+    st2         {v16.8b, v17.8b}, [x8],#16
+    st2         {v16.8b, v17.8b}, [x10],#16
+
+    st2         {v16.8b, v17.8b}, [x2], x6
+    st2         {v16.8b, v17.8b}, [x5], x6
+    st2         {v16.8b, v17.8b}, [x8], x6
+    st2         {v16.8b, v17.8b}, [x10], x6
+
+epilogue_copy:
+    st2         {v16.8b, v17.8b}, [x2],x11
+    st2         {v16.8b, v17.8b}, [x5],x11
+    st2         {v16.8b, v17.8b}, [x8],x11
+    st2         {v16.8b, v17.8b}, [x10],x11
+
+    st2         {v16.8b, v17.8b}, [x2]
+    st2         {v16.8b, v17.8b}, [x5]
+    st2         {v16.8b, v17.8b}, [x8]
+    st2         {v16.8b, v17.8b}, [x10]
+    b           end_func
+
+dc_4:
+    ld2         {v30.8b, v31.8b},[x6]       //load from src[nt]
+    shl         d3, d30,#32
+
+    ld2         {v26.8b, v27.8b},[x8]       //load from src[2nt+1]
+    shl         d2, d31,#32
+
+    uaddlp      v3.4h,  v3.8b
+    uaddlp      v2.4h,  v2.8b
+    uaddlp      v3.2s,  v3.4h
+    uaddlp      v2.2s,  v2.4h
+    uadalp      v17.1d,  v3.2s
+    uadalp      v18.1d,  v2.2s
+
+    shl         d3, d26,#32
+    shl         d2, d27,#32
+    uaddlp      v3.4h,  v3.8b
+    uaddlp      v2.4h,  v2.8b
+    uaddlp      v3.2s,  v3.4h
+    uaddlp      v2.2s,  v2.4h
+    uadalp      v17.1d,  v3.2s
+    uadalp      v18.1d,  v2.2s
+
+    smov        x10, v17.2s[0]
+    smov        x11, v18.2s[0]
+
+    add         x10,x10,x4
+    add         x11,x11,x4
+    lsr         x10,x10,x12
+    lsr         x11,x11,x12
+    orr         x10,x10,x11,lsl #8
+    dup         v0.4h,w10
+
+    st1         {v0.8b},[x2],x3
+    st1         {v0.8b},[x2],x3
+    st1         {v0.8b},[x2],x3
+    st1         {v0.8b},[x2]
+
+end_func:
+    // ldmfd sp!,{x4-x12,x15}     //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_horz.s b/common/arm64/ihevc_intra_pred_chroma_horz.s
new file mode 100644
index 0000000..da41e59
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_horz.s
@@ -0,0 +1,361 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_chroma_horz.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction interpolation filters.
+//*
+//*
+//* @author
+//*  parthiban v
+//*
+//* @par list of functions:
+//*  - ihevc_intra_pred_chroma_horz()
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*     intra prediction filter for horizontal chroma prediction.
+//*
+//* @par description:
+//*      horizontal intra prediction (mode 10) from the reference samples
+//*      pointed to by 'pu1_ref' to the tu block location pointed to by
+//*      'pu1_dst'. refer to section 8.4.4.2.6 in the standard (special case).
+//*
+//* @param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  integer transform block size
+//*
+//* @param[in] mode
+//*  integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
+//                                  word32 src_strd,
+//                                  uword8 *pu1_dst,
+//                                  word32 dst_strd,
+//                                  word32 nt,
+//                                  word32 mode)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 =>  src_strd
+//x2 => *pu1_dst
+//x3 =>  dst_strd
+
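+//a rough scalar model of the prediction below (a sketch for reference only,
+//not part of the build): mode 10 copies the left neighbour pair of each row
+//across that row of the interleaved cb/cr block.
+//
+//    for(row = 0; row < nt; row++)
+//        for(col = 0; col < nt; col++)
+//        {
+//            pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[2 * (2 * nt - 1 - row)];
+//            pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[2 * (2 * nt - 1 - row) + 1];
+//        }
+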
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_chroma_horz_av8
+
+.type ihevc_intra_pred_chroma_horz_av8, %function
+
+ihevc_intra_pred_chroma_horz_av8:
+
+    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    lsl         x6,x4,#2                    //four_nt
+
+    add         x12,x0,x6                   //*pu1_ref[four_nt]
+    cmp         x4,#4                       //if nt == 4
+    beq         core_loop_4
+
+    cmp         x4,#8                       //if nt == 8
+    beq         core_loop_8
+
+    //cmp            x4,#16                            @if nt == 16
+    //beq            core_loop_16
+
+    sub         x12,x12,#16                 //move to 16th value pointer
+    add         x9,x2,#16
+
+core_loop_16:
+    ld1         { v0.8h},[x12]              //load 16 bytes. v0.h[7] holds the 1st (nearest) cb/cr pair.
+    sub         x12,x12,#16
+    ld1         { v10.8h},[x12]             //load the next 16 bytes of the left reference column.
+
+    dup         v2.8h, v0.4h[7]             //duplicate the i value.
+
+    dup         v4.8h, v0.4h[6]             //duplicate the ii value.
+    dup         v6.8h, v0.4h[5]             //duplicate the iii value.
+    st1         { v2.8h},[x2],x3            //store in 1st row 0-16 columns
+    st1         { v2.8h},[x9],x3            //store in 1st row 16-32 columns
+
+    dup         v8.8h, v0.4h[4]
+    st1         { v4.8h},[x2],x3
+    st1         { v4.8h},[x9],x3
+
+    dup         v2.8h, v0.4h[3]
+    st1         { v6.8h},[x2],x3
+    st1         { v6.8h},[x9],x3
+
+    dup         v4.8h, v0.4h[2]
+    st1         { v8.8h},[x2],x3
+    st1         { v8.8h},[x9],x3
+
+    dup         v6.8h, v0.4h[1]
+    st1         { v2.8h},[x2],x3
+    st1         { v2.8h},[x9],x3
+
+    dup         v8.8h, v0.4h[0]
+    st1         { v4.8h},[x2],x3
+    st1         { v4.8h},[x9],x3
+
+    dup         v2.8h, v10.4h[7]
+    st1         { v6.8h},[x2],x3
+    st1         { v6.8h},[x9],x3
+
+    dup         v4.8h, v10.4h[6]
+    st1         { v8.8h},[x2],x3
+    st1         { v8.8h},[x9],x3
+
+    dup         v6.8h, v10.4h[5]
+    st1         { v2.8h},[x2],x3
+    st1         { v2.8h},[x9],x3
+
+    dup         v8.8h, v10.4h[4]
+    st1         { v4.8h},[x2],x3
+    st1         { v4.8h},[x9],x3
+
+    dup         v2.8h, v10.4h[3]
+    st1         { v6.8h},[x2],x3
+    st1         { v6.8h},[x9],x3
+
+    dup         v4.8h, v10.4h[2]
+    st1         { v8.8h},[x2],x3
+    st1         { v8.8h},[x9],x3
+
+    dup         v6.8h, v10.4h[1]
+    st1         { v2.8h},[x2],x3
+    st1         { v2.8h},[x9],x3
+    sub         x12,x12,#16                 //move to 16th value pointer
+
+    dup         v8.8h, v10.4h[0]
+    st1         { v4.8h},[x2],x3
+    st1         { v4.8h},[x9],x3
+
+    subs        x4,x4,#16                   //decrement the loop count by 16
+    st1         { v6.8h},[x2],x3
+    st1         { v6.8h},[x9],x3
+
+    st1         { v8.8h},[x2],x3
+    st1         { v8.8h},[x9],x3
+    bgt         core_loop_16
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+    b           endloop
+
+core_loop_8:
+    ldrb        w14,[x12],#1                //pu1_ref[two_nt]
+    sxtw        x14,w14
+    //vld1.8        {q15},[x12]                        @pu1_ref[two_nt + 1 + col]
+
+    dup         v28.8b,w14
+    sub         x12,x12,#17
+    ld1         { v0.16b},[x12]
+
+    sub         x12,x12,#16
+//    ld1 { v30.16b},[x12]
+    dup         v10.8h, v0.4h[7]
+    //vmovl.u8    q13,d26
+
+    dup         v2.8h, v0.4h[6]
+    //vsubl.u8    q12,d30,d28
+
+    dup         v4.8h, v0.4h[5]
+    //vshr.s16    q12,q12,#1
+
+    dup         v6.8h, v0.4h[4]
+    //vqadd.s16    q11,q13,q12
+
+    dup         v8.8h, v0.4h[3]
+    //vqmovun.s16 d22,q11
+
+    st1         { v10.8h},[x2],x3
+
+    dup         v10.8h, v0.4h[2]
+    //vsubl.u8    q12,d31,d28
+
+    dup         v12.8h, v0.4h[1]
+    //vshr.s16    q12,q12,#1
+
+    dup         v14.8h, v0.4h[0]
+    //vqadd.s16    q11,q13,q12
+
+    dup         v16.8h, v0.4h[3]
+    //vqmovun.s16 d22,q11
+
+    st1         { v2.8h},[x2],x3
+    //sub            x2,x2,#8
+
+    st1         { v4.8h},[x2],x3
+
+    st1         { v6.8h},[x2],x3
+    st1         { v8.8h},[x2],x3
+    st1         { v10.8h},[x2],x3
+
+    //vdup.8        q1,d0[2]
+    st1         { v12.8h},[x2],x3
+
+    //vdup.8        q2,d0[1]
+    st1         { v14.8h},[x2],x3
+
+    //vdup.8        q3,d0[0]
+    //vst1.8        {q7},[x2],x3
+
+    //vdup.8        q4,d0[3]
+    //vst1.8        {q8},[x2],x3
+
+    //vdup.8        q5,d0[2]
+    //vst1.8        {q1},[x2],x3
+
+    //vdup.8        q6,d0[1]
+    //vst1.8        {q2},[x2],x3
+
+    //vdup.8        q7,d0[0]
+    //vst1.8        {q3},[x2],x3
+
+    //vst1.8        {q4},[x2],x3
+    //vst1.8        {q5},[x2],x3
+    //vst1.8        {q6},[x2],x3
+    //vst1.8        {q7},[x2],x3
+
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+    b           endloop
+
+
+core_loop_4:
+    ldrb        w14,[x12]                   //pu1_ref[two_nt]
+    sxtw        x14,w14
+    add         x12,x12,#1                  //pu1_ref[two_nt + 1]
+    //vld1.8        {d30},[x12]                        @pu1_ref[two_nt + 1 + col]
+
+    sub         x12,x12,#9
+    ld1         {v0.8b},[x12]
+    sub         x12,x12,#8
+    ld1         {v30.8b},[x12]
+    dup         v26.4h, v0.4h[3]
+    dup         v28.8b,w14
+
+    dup         v3.4h, v0.4h[2]
+    uxtl        v26.8h, v26.8b
+
+    dup         v4.4h, v0.4h[1]
+    usubl       v24.8h, v30.8b, v28.8b
+
+    dup         v5.4h, v0.4h[0]
+    sshr        v24.8h, v24.8h,#1
+
+    dup         v6.4h, v0.4h[3]
+    sqadd       v22.8h,  v26.8h ,  v24.8h
+
+    dup         v7.4h, v0.4h[2]
+    sqxtun      v22.8b, v22.8h
+
+    st1         {v6.8b},[x2],x3
+    st1         {v3.8b},[x2],x3
+
+    dup         v8.4h, v0.4h[1]
+    st1         {v4.8b},[x2],x3
+    st1         {v5.8b},[x2],x3
+
+    dup         v9.4h, v0.4h[0]
+    //vst1.8        {d6},[x2],x3
+    //vst1.8        {d7},[x2],x3
+
+    //vst1.8        {d8},[x2],x3
+    //vst1.8        {d9},[x2],x3
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+    b           endloop
+
+
+//core_loop_4
+    ldrb        w14,[x12]                   //pu1_ref[two_nt]
+    sxtw        x14,w14
+    add         x12,x12,#1                  //pu1_ref[two_nt + 1]
+    ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col]
+
+    sub         x12,x12,#5
+    ld1         {v0.8b},[x12]
+    dup         v28.8b,w14
+    dup         v26.8b, v0.8b[3]
+    uxtl        v26.8h, v26.8b
+
+    dup         v3.8b, v0.8b[2]
+    usubl       v24.8h, v30.8b, v28.8b
+
+    dup         v4.8b, v0.8b[1]
+    sshr        v24.8h, v24.8h,#1
+
+    dup         v5.8b, v0.8b[0]
+    sqadd       v22.8h,  v26.8h ,  v24.8h
+
+    sqxtun      v22.8b, v22.8h
+
+    st1         {v22.s}[0],[x2],x3
+    st1         {v3.s}[0],[x2],x3
+    st1         {v4.s}[0],[x2],x3
+    st1         {v5.s}[0],[x2],x3
+
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+endloop:
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode2.s b/common/arm64/ihevc_intra_pred_chroma_mode2.s
new file mode 100644
index 0000000..d2c0730
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_mode2.s
@@ -0,0 +1,312 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_chroma_mode2.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction mode 2 filtering.
+//* functions are coded using neon  intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//*  yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    chroma intraprediction filter for mode 2 input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] pi1_coeff
+//*  word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode2(uword8 *pu1_ref,
+//                                 word32 src_strd,
+//                                 uword8 *pu1_dst,
+//                                 word32 dst_strd,
+//                                 word32 nt,
+//                                 word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+//    nt
+//    mode
+//    pi1_coeff
+
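+//a scalar sketch of mode 2 (a hypothetical form patterned on the luma
+//mode2 kernel, not code from this patch): the prediction walks the left
+//reference one cb/cr pair further down per row and per column, which is
+//why the kernels below reverse their loads with rev64 before storing:
+//
+//    for(row = 0; row < nt; row++)
+//        for(col = 0; col < nt; col++)
+//        {
+//            ref_idx = 2 * nt - 2 - (row + col);   /* pixel index, assumed form */
+//            pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[2 * ref_idx];     /* cb */
+//            pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[2 * ref_idx + 1]; /* cr */
+//        }
+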
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_chroma_mode2_av8
+
+.type ihevc_intra_pred_chroma_mode2_av8, %function
+
+ihevc_intra_pred_chroma_mode2_av8:
+
+    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x8,#-4
+
+    cmp         x4,#4
+    beq         mode2_4
+
+    add         x0,x0,x4,lsl #2
+
+    sub         x0,x0,#0x12                 //src[1]
+    add         x10,x0,#-2
+
+prologue_cpy_32:
+
+    ld2         {v0.8b, v1.8b},[x0],x8
+
+    mov         x11,x4
+    rev64       v16.8b,  v0.8b
+    rev64       v17.8b,  v1.8b
+
+    ld2         {v2.8b, v3.8b},[x10],x8
+    mov         x6, x2
+
+    ld2         {v4.8b, v5.8b},[x0],x8
+    ld2         {v6.8b, v7.8b},[x10],x8
+    lsr         x1, x4, #3
+
+    ld2         {v8.8b, v9.8b},[x0],x8
+    ld2         {v10.8b, v11.8b},[x10],x8
+    ld2         {v12.8b, v13.8b},[x0],x8
+    mul         x1, x4, x1
+
+    ld2         {v14.8b, v15.8b},[x10],x8
+    add         x7,x6,x3
+
+    rev64       v18.8b,  v2.8b
+    rev64       v19.8b,  v3.8b
+    lsl         x5, x3, #2
+
+    rev64       v20.8b,  v4.8b
+    rev64       v21.8b,  v5.8b
+    add         x9,x7,x3
+
+    rev64       v22.8b,  v6.8b
+    rev64       v23.8b,  v7.8b
+
+    rev64       v24.8b,  v8.8b
+    rev64       v25.8b,  v9.8b
+
+    rev64       v26.8b,  v10.8b
+    subs        x1,x1,#8
+
+    rev64       v27.8b,  v11.8b
+
+    rev64       v28.8b,  v12.8b
+    rev64       v29.8b,  v13.8b
+
+    rev64       v30.8b,  v14.8b
+    add         x14,x9,x3
+    rev64       v31.8b,  v15.8b
+
+    beq         epilogue_mode2
+
+    sub         x12,x4,#8
+
+kernel_mode2:
+
+    st2         {v16.8b, v17.8b},[x6],x5
+    st2         {v18.8b, v19.8b},[x7],x5
+    subs        x11,x11,#8
+    st2         {v20.8b, v21.8b},[x9],x5
+    st2         {v22.8b, v23.8b},[x14],x5
+    st2         {v24.8b, v25.8b},[x6],x5
+    add         x20,x2,#16
+    csel        x2, x20, x2,gt
+    st2         {v26.8b, v27.8b},[x7],x5
+    st2         {v28.8b, v29.8b},[x9],x5
+    st2         {v30.8b, v31.8b},[x14],x5
+
+    ld2         {v0.8b, v1.8b},[x0],x8
+    csel        x11, x4, x11,le
+
+    ld2         {v2.8b, v3.8b},[x10],x8
+    ld2         {v4.8b, v5.8b},[x0],x8
+    add         x20, x2, x3, lsl #2
+    csel        x2, x20, x2,le
+    ld2         {v6.8b, v7.8b},[x10],x8
+    rev64       v16.8b,  v0.8b
+
+    ld2         {v8.8b, v9.8b},[x0],x8
+    ld2         {v10.8b, v11.8b},[x10],x8
+    sub         x20, x6,#16
+    csel        x2, x20, x2,le
+    ld2         {v12.8b, v13.8b},[x0],x8
+    rev64       v17.8b,  v1.8b
+    ld2         {v14.8b, v15.8b},[x10],x8
+
+    subs        x12,x12,#8
+    mov         x6, x2
+    add         x20, x0, x4,lsl #1
+    csel        x0, x20, x0,le
+    add         x7, x6, x3
+
+    rev64       v18.8b,  v2.8b
+    sub         x20, x0, #16
+    csel        x0, x20, x0,le
+    rev64       v19.8b,  v3.8b
+
+    rev64       v20.8b,  v4.8b
+    csel        x12, x4, x12,le
+    rev64       v21.8b,  v5.8b
+
+    rev64       v22.8b,  v6.8b
+    add         x9, x7, x3
+    rev64       v23.8b,  v7.8b
+
+    rev64       v24.8b,  v8.8b
+    add         x10,x0,#-2
+    rev64       v25.8b,  v9.8b
+
+    rev64       v26.8b,  v10.8b
+    subs        x1, x1, #8
+    rev64       v27.8b,  v11.8b
+
+    rev64       v28.8b,  v12.8b
+    rev64       v29.8b,  v13.8b
+
+    rev64       v30.8b,  v14.8b
+    add         x14, x9, x3
+    rev64       v31.8b,  v15.8b
+
+    bne         kernel_mode2
+
+epilogue_mode2:
+
+    st2         {v16.8b, v17.8b},[x6],x5
+    st2         {v18.8b, v19.8b},[x7],x5
+    st2         {v20.8b, v21.8b},[x9],x5
+    st2         {v22.8b, v23.8b},[x14],x5
+    st2         {v24.8b, v25.8b},[x6],x5
+    st2         {v26.8b, v27.8b},[x7],x5
+    st2         {v28.8b, v29.8b},[x9],x5
+    st2         {v30.8b, v31.8b},[x14],x5
+
+    b           end_func
+
+mode2_4:
+
+    lsl         x12,x4,#1
+    add         x0,x0,x12
+    sub         x0,x0,#2
+
+    ld2         {v12.8b, v13.8b},[x0],x8
+    shl         d0, d12,#32
+    add         x10,x0,#2
+    shl         d1, d13,#32
+
+    rev64       v0.8b,  v0.8b
+    ld2         {v14.8b, v15.8b},[x10],x8
+    shl         d2, d14,#32
+
+    rev64       v1.8b,  v1.8b
+    shl         d3, d15,#32
+    zip1        v0.8b, v0.8b, v1.8b
+    zip2        v1.8b, v0.8b, v1.8b
+    st1         {v0.8b},[x2],x3
+
+    rev64       v2.8b,  v2.8b
+    ld2         {v16.8b, v17.8b},[x0],x8
+    shl         d4, d16,#32
+    rev64       v3.8b,  v3.8b
+    shl         d5, d17,#32
+    zip1        v2.8b, v2.8b, v3.8b
+    zip2        v3.8b, v2.8b, v3.8b
+    rev64       v4.8b,  v4.8b
+    rev64       v5.8b,  v5.8b
+    st1         {v2.8b},[x2],x3
+
+
+    ld2         {v18.8b, v19.8b},[x10],x8
+    shl         d6, d18,#32
+
+    zip1        v4.8b, v4.8b, v5.8b
+    zip2        v5.8b, v4.8b, v5.8b
+    shl         d7, d19,#32
+    rev64       v6.8b,  v6.8b
+    st1         {v4.8b},[x2],x3
+
+    rev64       v7.8b,  v7.8b
+    zip1        v6.8b, v6.8b, v7.8b
+    zip2        v7.8b, v6.8b, v7.8b
+    st1         {v6.8b},[x2],x3
+
+end_func:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s b/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s
new file mode 100644
index 0000000..52fc702
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s
@@ -0,0 +1,198 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_chroma_mode_18_34.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction modes 18 and 34.
+//* functions are coded using neon  intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//*  yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    chroma intraprediction filter for modes 18 and 34
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] pi1_coeff
+//*  word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_mode_18_34(uword8 *pu1_ref,
+//                                      word32 src_strd,
+//                                      uword8 *pu1_dst,
+//                                      word32 dst_strd,
+//                                      word32 nt,
+//                                      word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+//    nt
+//    mode
+//    pi1_coeff
+
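+//modes 18 and 34 always have fract == 0, so no two-tap interpolation is
+//needed: every output row is a plain copy of the reference shifted by one
+//pixel per row (a sketch under that assumption; 'step' and 'ref_row_start'
+//are hypothetical names, not symbols from this patch):
+//
+//    for(row = 0; row < nt; row++)
+//        memcpy(pu1_dst + row * dst_strd,
+//               ref_row_start + 2 * row * step,    /* step = +1 (mode 34), -1 (mode 18) */
+//               2 * nt);                           /* 2 bytes per chroma pixel */
+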
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_chroma_mode_18_34_av8
+
+.type ihevc_intra_pred_chroma_mode_18_34_av8, %function
+
+ihevc_intra_pred_chroma_mode_18_34_av8:
+
+    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+
+    cmp         x4,#4
+    beq         mode2_4
+
+    mov         x12,x4
+    mov         x11,x4
+    add         x0,x0,x4,lsl #2
+
+    cmp         x5,#0x22
+    mov         x10,x2
+
+    add         x0,x0,#4
+
+    sub         x20,x0,#4
+    csel        x0, x20, x0,ne
+    mov         x20,#2
+    csel        x6, x20, x6,eq
+    mov         x20,#-2
+    csel        x6, x20, x6,ne
+    mov         x8,x0
+
+
+kernel:
+
+
+    ld1         {v0.8b, v1.8b},[x8],x6
+    st1         {v0.8b, v1.8b},[x10],x3
+    ld1         {v2.8b, v3.8b},[x8],x6
+    st1         {v2.8b, v3.8b},[x10],x3
+    ld1         {v4.8b, v5.8b},[x8],x6
+    st1         {v4.8b, v5.8b},[x10],x3
+    ld1         {v6.8b, v7.8b},[x8],x6
+    st1         {v6.8b, v7.8b},[x10],x3
+    ld1         {v8.8b, v9.8b},[x8],x6
+    st1         {v8.8b, v9.8b},[x10],x3
+    ld1         {v10.8b, v11.8b},[x8],x6
+    st1         {v10.8b, v11.8b},[x10],x3
+    ld1         {v12.8b, v13.8b},[x8],x6
+    st1         {v12.8b, v13.8b},[x10],x3
+    ld1         {v14.8b, v15.8b},[x8],x6
+    st1         {v14.8b, v15.8b},[x10],x3
+
+    subs        x12,x12,#8
+    bne         kernel
+
+    cmp         x11,#16
+    add         x8,x0,#16
+    add         x10,x2,#16
+    sub         x11, x11,#16
+    mov         x12,#16
+    beq         kernel
+    b           end_func
+
+mode2_4:
+
+    add         x0,x0,#20
+    cmp         x5,#0x22
+    sub         x20,x0,#4
+    csel        x0, x20, x0,ne
+
+    mov         x20,#2
+    csel        x8, x20, x8,eq
+    mov         x20,#-2
+    csel        x8, x20, x8,ne
+
+    ld1         {v0.8b},[x0],x8
+    st1         {v0.2s},[x2],x3
+
+    ld1         {v0.8b},[x0],x8
+    st1         {v0.2s},[x2],x3
+
+    ld1         {v0.8b},[x0],x8
+    st1         {v0.2s},[x2],x3
+
+    ld1         {v0.8b},[x0],x8
+    st1         {v0.2s},[x2],x3
+
+end_func:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
new file mode 100644
index 0000000..1df4ad0
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
@@ -0,0 +1,551 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_chroma_mode_27_to_33.s
+//*
+//* @brief
+//*  contains function definition for intra prediction  interpolation filters
+//*
+//*
+//* @author
+//*  parthiban v
+//*
+//* @par list of functions:
+//*  - ihevc_intra_pred_chroma_mode_27_to_33()
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*  intraprediction for mode 27 to 33  (positive angle, vertical mode ) with
+//*  neighboring samples location pointed by 'pu1_ref' to the  tu
+//* block location pointed by 'pu1_dst'
+//*
+//* @par description:
+//*
+//*
+//* @param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* @param[in] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  integer transform block size
+//*
+//* @param[in] mode
+//*  integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//.if intra_pred_chroma_27_to_33 == c
+//void ihevc_intra_pred_chroma_mode_27_to_33(uword8 *pu1_ref,
+//                                        word32 src_strd,
+//                                         uword8 *pu1_dst,
+//                                         word32 dst_strd,
+//                                         word32 nt,
+//                                         word32 mode)
+
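+//per-row interpolation sketch matching the comments in the kernel below
+//(illustrative only; 'ref_main' is a hypothetical alias for the top
+//reference row starting at pu1_ref + four_nt + 2):
+//
+//    for(row = 0; row < nt; row++)
+//    {
+//        pos   = (row + 1) * intra_pred_ang;
+//        idx   = pos >> 5;
+//        fract = pos & 31;
+//        for(col = 0; col < 2 * nt; col += 2)      /* interleaved cb/cr */
+//            pu1_dst[row * dst_strd + col] =
+//                ((32 - fract) * ref_main[col + 2 * idx]
+//               +  fract       * ref_main[col + 2 * idx + 2] + 16) >> 5;
+//                /* the cr byte at col + 1 is filtered the same way */
+//    }
+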
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_chroma_mode_27_to_33_av8
+.extern gai4_ihevc_ang_table
+.extern gau1_ihevc_planar_factor
+
+.type ihevc_intra_pred_chroma_mode_27_to_33_av8, %function
+
+ihevc_intra_pred_chroma_mode_27_to_33_av8:
+
+    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    adrp        x6,  :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
+    ldr         x6,  [x6, #:got_lo12:gai4_ihevc_ang_table]
+
+    lsl         x7,x4,#2                    //four_nt
+
+    add         x8,x6,x5,lsl #2             //*gai4_ihevc_ang_table[mode]
+    ldr         w9, [x8]                    //intra_pred_ang = gai4_ihevc_ang_table[mode]
+    sxtw        x9,w9
+    adrp        x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
+    ldr         x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
+    add         x6,x1,#1
+
+    tst         x4,#7
+    add         x8,x0,x7                    //pu1_ref + four_nt
+    mov         x14,#0                      //row
+    mov         x12,x4
+    bne         core_loop_4
+    lsl         x4,x4,#1
+    b           core_loop_8
+
+core_loop_8:
+    add         x8,x8,#2                    //pu1_ref_main_idx += (four_nt + 1)
+    dup         v0.8b,w9                    //intra_pred_ang
+    lsr         x12, x4, #4                 //divide by 16 (x4 = 2*nt here, so x12 = nt/8)
+
+    movi        v1.8b, #32
+    mul         x7, x4, x12
+
+    movi        v6.8h, #31
+
+    mov         x1,x8
+    mov         x5,x4
+    mov         x11,#2
+
+prologue:
+    ld1         {v3.8b},[x6]                //loads the row value
+    umull       v2.8h, v3.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+    xtn         v4.8b,  v4.8h
+    shrn        v5.8b, v2.8h,#5             //idx = pos >> 5
+
+    dup         v31.8b, v4.8b[0]
+    add         x0,x2,x3
+
+    smov        x14, v5.2s[0]               //(i row)extract idx to the r register
+    lsl         x14,x14,#1
+
+    dup         v29.8b, v4.8b[1]            //(ii)
+    and         x9,x14,#0xff                //(i row) get the last byte
+
+    add         x10,x8,x9                   //(i row)*pu1_ref[ref_main_idx]
+
+    asr         x14,x14,#8                  //(ii)shift by 8
+    ld1         {v8.8b},[x10],x11           //(i row)ref_main_idx
+    and         x9,x14,#0xff                //(ii)get the last byte
+
+    asr         x14,x14,#8                  //(iii)
+    ld1         {v9.8b},[x10]               //(i row)ref_main_idx_1
+    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]
+
+    and         x9,x14,#0xff                //(iii)
+    sub         v30.8b,  v1.8b ,  v31.8b    //32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+    asr         x14,x14,#8                  //(iv)
+
+    dup         v27.8b, v4.8b[2]            //(iii)
+    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
+    and         x9,x14,#0xff                //(iv)
+
+    dup         v25.8b, v4.8b[3]            //(iv)
+    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]
+
+    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
+    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
+    rshrn       v10.8b, v10.8h,#5           //(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
+    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)
+
+    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
+
+    dup         v31.8b, v4.8b[4]            //(v)
+    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    smov        x14, v5.2s[1]               //extract idx to the r register
+    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    lsl         x14,x14,#1
+
+    st1         {v10.8b},[x2],#8            //(i row)
+    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    and         x9,x14,#0xff                //(v)
+    dup         v29.8b, v4.8b[5]            //(vi)
+    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]
+
+    ld1         {v8.8b},[x10],x11           //(v)ref_main_idx
+    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
+
+    asr         x14,x14,#8                  //(vi)
+    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    and         x9,x14,#0xff                //(vi)
+
+    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
+    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v14.8b},[x0],x3            //(ii)
+    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
+    dup         v27.8b, v4.8b[6]            //(vii)
+    asr         x14,x14,#8                  //(vii)
+
+    and         x9,x14,#0xff                //(vii)
+    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v18.8b},[x0],x3            //(iii)
+    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    asr         x14,x14,#8                  //(viii)
+    dup         v25.8b, v4.8b[7]            //(viii)
+    and         x9,x14,#0xff                //(viii)
+
+    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
+    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)
+
+    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
+    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
+    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subs        x7,x7,#8
+
+    st1         {v22.8b},[x0],x3            //(iv)
+    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
+    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
+
+    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
+    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         x20,x8,#8
+    csel        x8, x20, x8,gt
+    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    sub         x20,x4,#8
+    csel        x4, x20, x4,gt
+
+    st1         {v10.8b},[x0],x3            //(v)
+    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+    beq         epilogue
+
+    ld1         {v5.8b},[x6]                //loads the row value
+    umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+    xtn         v4.8b,  v4.8h
+    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
+    smov        x14, v3.2s[0]               //(i)extract idx to the r register
+    lsl         x14,x14,#1
+    and         x9,x14,#0xff                //(i)
+    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+    asr         x14,x14,#8                  //(ii)
+    dup         v31.8b, v4.8b[0]
+    subs        x4,x4,#8
+
+    ld1         {v8.8b},[x10],x11           //(i)ref_main_idx
+    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
+    and         x9,x14,#0xff                //(ii)
+    add         x20,x6,#8                   //increment the row value
+    csel        x6, x20, x6,le
+
+    ld1         {v9.8b},[x10]               //(i)ref_main_idx_1
+    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]
+
+    ld1         {v5.8b},[x6]                //loads the row value
+    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    asr         x14,x14,#8                  //(iii)
+
+    dup         v29.8b, v4.8b[1]            //(ii)
+    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+    and         x9,x14,#0xff                //(iii)
+
+    st1         {v14.8b},[x0],x3            //(vi)
+    sub         v30.8b,  v1.8b ,  v31.8b    //(i)32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+    asr         x14,x14,#8                  //(iv)
+
+    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+    and         x9,x14,#0xff                //(iv)
+
+    smov        x14, v3.2s[1]               //extract idx to the r register
+    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    dup         v27.8b, v4.8b[2]            //(iii)
+    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
+    csel        x4, x5, x4,le               //reload nt
+
+    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
+    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]
+
+    st1         {v18.8b},[x0],x3            //(vii)
+    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
+    rshrn       v10.8b, v10.8h,#5           //(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+    dup         v25.8b, v4.8b[3]            //(iv)
+    umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+
+    st1         {v22.8b},[x0]               //(viii)
+    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)
+
+    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
+    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    lsl         x14,x14,#1
+
+    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
+    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         x0,x2,x3
+
+    dup         v31.8b, v4.8b[4]            //(v)
+    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+    and         x9,x14,#0xff                //(v)
+
+    st1         {v10.8b},[x2],#8            //(i)
+    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]
+
+    dup         v29.8b, v4.8b[5]            //(vi)
+    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    asr         x14,x14,#8                  //(vi)
+
+    dup         v27.8b, v4.8b[6]            //(vii)
+    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+    and         x9,x14,#0xff                //(vi)
+
+    dup         v25.8b, v4.8b[7]            //(viii)
+    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
+
+    ld1         {v8.8b},[x10],x11           //(v)ref_main_idx
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+    asr         x14,x14,#8                  //(vii)
+
+    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
+    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
+    and         x9,x14,#0xff                //(vii)
+
+    st1         {v14.8b},[x0],x3            //(ii)
+    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+    asr         x14,x14,#8                  //(viii)
+
+    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
+    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]
+
+    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
+    umull       v10.8h, v8.8b, v30.8b       //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+    and         x9,x14,#0xff                //(viii)
+
+    smov        x14, v3.2s[0]               //(i)extract idx to the r register
+    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
+
+    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
+    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)
+
+    st1         {v18.8b},[x0],x3            //(iii)
+    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+    csel        x8, x1, x8,le               //reload the source to pu1_src+2nt
+
+    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
+    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         x20,x8,#8                   //increment the source next set 8 columns in same row
+    csel        x8, x20, x8,gt
+
+    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
+    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
+    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
+    lsl         x20, x3,#3
+    csel        x12,x20,x12,le
+
+    st1         {v22.8b},[x0],x3            //(iv)
+    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    sub         x20,x12,x5
+    csel        x12, x20, x12,le
+
+    st1         {v10.8b},[x0],x3            //(v)
+    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         x20,x2,x12                  //increment the dst pointer to 8*dst_strd - nt
+    csel        x2, x20, x2,le
+
+    xtn         v4.8b,  v4.8h
+    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+    lsl         x14,x14,#1
+
+    and         x9,x14,#0xff                //(i)
+    subs        x7,x7,#8
+    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]
+
+    bne         kernel_8_rows
+
+epilogue:
+    st1         {v14.8b},[x0],x3            //(vi)
+    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
+    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v18.8b},[x0],x3            //(vii)
+    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    st1         {v22.8b},[x0],x3            //(viii)
+    b           end_loops
+
+core_loop_4:
+    add         x10,x8,#2                   //pu1_ref_main_idx += (four_nt + 1)
+    add         x11,x8,#4                   //pu1_ref_main_idx_1 += (four_nt + 2)
+    mov         x8,#0
+
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    and         x5,x5,#31                   //fract = pos & (31)
+    cmp         x14,x5                      //if(fract_prev > fract)
+    add         x20,x10,#2                  //pu1_ref_main_idx += 2
+    csel        x10, x20, x10,gt
+    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 2
+    dup         v0.8b,w5                    //dup_const_fract
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v1.8b,w4                    //dup_const_32_fract
+
+//inner_loop_4
+    ld1         {v2.8b},[x10]               //ref_main_idx
+    add         x8,x8,#1
+    mov         x14,x5                      //fract_prev = fract
+
+    ld1         {v3.8b},[x11]               //ref_main_idx_1
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    and         x5,x5,#31                   //fract = pos & (31)
+    cmp         x14,x5                      //if(fract_prev > fract)
+    add         x20,x10,#2                  //pu1_ref_main_idx += 1
+    csel        x10, x20, x10,gt
+    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 1
+
+    dup         v6.8b,w5                    //dup_const_fract
+    umull       v4.8h, v2.8b, v1.8b         //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v7.8b,w4                    //dup_const_32_fract
+    umlal       v4.8h, v3.8b, v0.8b         //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v8.8b},[x10]               //ref_main_idx
+    add         x8,x8,#1
+
+    ld1         {v9.8b},[x11]               //ref_main_idx_1
+    rshrn       v4.8b, v4.8h,#5             //shift_res = vrshrn_n_u16(add_res, 5)
+
+    mov         x14,x5                      //fract_prev = fract
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    and         x5,x5,#31                   //fract = pos & (31)
+    cmp         x14,x5                      //if(fract_prev > fract)
+    add         x20,x10,#2                  //pu1_ref_main_idx += 1
+    csel        x10, x20, x10,gt
+    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 1
+
+    dup         v12.8b,w5                   //dup_const_fract
+    umull       v10.8h, v8.8b, v7.8b        //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v13.8b,w4                   //dup_const_32_fract
+    umlal       v10.8h, v9.8b, v6.8b        //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v14.8b},[x10]              //ref_main_idx
+    add         x8,x8,#1
+
+    st1         {v4.8b},[x2],x3
+    rshrn       v10.8b, v10.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v15.8b},[x11]              //ref_main_idx_1
+    mov         x14,x5                      //fract_prev = fract
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    and         x5,x5,#31                   //fract = pos & (31)
+    cmp         x14,x5                      //if(fract_prev > fract)
+    add         x20,x10,#2                  //pu1_ref_main_idx += 1
+    csel        x10, x20, x10,gt
+    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 1
+
+    dup         v18.8b,w5                   //dup_const_fract
+    umull       v16.8h, v14.8b, v13.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v19.8b,w4                   //dup_const_32_fract
+    umlal       v16.8h, v15.8b, v12.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v20.8b},[x10]              //ref_main_idx
+
+    st1         {v10.8b},[x2],x3
+    rshrn       v16.8b, v16.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
+    ld1         {v21.8b},[x11]              //ref_main_idx_1
+
+    umull       v22.8h, v20.8b, v19.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)
+    umlal       v22.8h, v21.8b, v18.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v16.8b},[x2],x3
+    rshrn       v22.8b, v22.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
+
+    st1         {v22.8b},[x2],x3
+
+end_loops:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s b/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s
new file mode 100644
index 0000000..3c8746c
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s
@@ -0,0 +1,495 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_chroma_mode_3_to_9.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction modes 3 to 9.
+//* functions are coded using neon  intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//*  parthiban v
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    chroma intraprediction filter for modes 3 to 9
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
+//                                       word32 src_strd,
+//                                       uword8 *pu1_dst,
+//                                       word32 dst_strd,
+//                                       word32 nt,
+//                                       word32 mode)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+//    nt
+//    mode
+
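+//modes 3 to 9 use negative angles on the left reference; a sketch of the
+//per-column two-tap filter the tbl-based kernel implements (index form
+//assumed from the hm reference, not taken from this patch; 'left_ref(k)'
+//is a hypothetical accessor for the k-th cb/cr pair down the left
+//reference column, and idx is negative for these modes):
+//
+//    for(col = 0; col < nt; col++)
+//    {
+//        pos   = (col + 1) * intra_pred_ang;
+//        idx   = pos >> 5;
+//        fract = pos & 31;
+//        for(row = 0; row < nt; row++)
+//            pred(row, col) = ((32 - fract) * left_ref(row + idx + 1)
+//                            +  fract       * left_ref(row + idx + 2) + 16) >> 5;
+//    }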
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_chroma_mode_3_to_9_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_chroma
+.extern idx_neg_idx_chroma_3_9
+
+.type ihevc_intra_pred_chroma_mode_3_to_9_av8, %function
+
+ihevc_intra_pred_chroma_mode_3_to_9_av8:
+
+    // stmfd sp!, {x4-x12, x14}        //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    adrp        x7,  :got:gai4_ihevc_ang_table
+    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+    adrp        x8,  :got:gai4_ihevc_inv_ang_table
+    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+    add         x7, x7, x5, lsl #2          //gai4_ihevc_ang_table[mode]
+    ldr         w7,  [x7]                   //intra_pred_ang
+    sxtw        x7,w7
+    dup         v30.8b,w7                   //intra_pred_ang
+
+    adrp        x14,  :got:col_for_intra_chroma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
+
+prologue_8_16_32:
+    lsr         x10, x4, #3
+    ld1         {v31.8b},[x14],#8
+    mul         x10, x4, x10                //block counter (dec by #8)
+
+    lsl         x11, x4, #1                 //col counter to be inc/dec by #8
+    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+
+    sub         x7, x5, #3
+    adrp        x12,  :got:idx_neg_idx_chroma_3_9 //load most idx table
+    ldr         x12, [x12,  #:got_lo12:idx_neg_idx_chroma_3_9]
+
+    add         x12, x12, x7, lsl #4
+    mov         x8, x12
+
+    mov         x7, #8
+    sub         x7, x7, x3, lsl #3          //x7 = 8-8x3
+
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+    lsl         x9, x9, #1
+    add         x1, x0, x4, lsl #2          //pu1_ref + 4*nt
+
+    xtn         v6.8b,  v22.8h
+    dup         v26.8b,w9                   //most idx added to final idx values
+    sub         x1, x1, #26                 //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
+
+    sub         x6, x1, x9
+
+    ld1         {v0.16b, v1.16b}, [x6]      //stores the 32 values reqd based on indices values (from most idx)
+    sshr        v22.8h, v22.8h,#5
+
+    movi        v29.8b, #31                 //contains #31 for vand operation
+
+    movi        v28.8b, #32
+
+    sqxtn       v8.8b,  v22.8h
+    shl         v8.8b, v8.8b,#1             // 2 * idx
+
+    and         v6.8b,  v6.8b ,  v29.8b     //fract values in d1/ idx values in d0
+    movi        v29.8b, #2                  //contains #2 for adding to get ref_main_idx + 1
+
+    mov         x0,#0x302                   // idx value for v is +1 of u
+    dup         v27.4h,w0
+    mov         x0,#0
+
+    movi        v9.8b, #22                  //row 0 to 7
+
+    sub         v8.8b,  v8.8b ,  v27.8b     //ref_main_idx (sub row)
+    sub         v8.8b,  v26.8b ,  v8.8b     //ref_main_idx (row 0)
+    add         v8.8b,  v8.8b ,  v9.8b      //to compensate the pu1_src idx incremented by 8
+    sub         v9.8b,  v8.8b ,  v29.8b     //ref_main_idx + 1 (row 0)
+    tbl         v12.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
+    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
+
+    tbl         v13.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
+    sub         v4.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 1)
+    sub         v5.8b,  v9.8b ,  v29.8b     //ref_main_idx + 1 (row 1)
+
+    movi        v29.8b, #4
+
+    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
+
+    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
+    sub         v8.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 2)
+    sub         v9.8b,  v9.8b ,  v29.8b     //ref_main_idx + 1 (row 2)
+
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
+
+    tbl         v14.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
+
+    tbl         v15.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 3)
+    sub         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 3)
+
+    st1         {v24.8b},[x2], x3           //st (row 0)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
+
+    tbl         v10.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
+
+    tbl         v11.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+    sub         v8.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 4)
+    sub         v9.8b,  v9.8b ,  v29.8b     //ref_main_idx + 1 (row 4)
+
+    st1         {v22.8b},[x2], x3           //st (row 1)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
+
+    tbl         v12.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 3)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 3)
+
+    tbl         v13.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
+    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 5)
+    sub         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 5)
+
+    st1         {v20.8b},[x2], x3           //st (row 2)
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
+
+    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
+
+    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
+    sub         v8.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 6)
+    sub         v9.8b,  v9.8b ,  v29.8b     //ref_main_idx + 1 (row 6)
+
+    st1         {v18.8b},[x2], x3           //st (row 3)
+    cmp         x4,#4
+    beq         end_func
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
+
+    tbl         v14.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
+
+    tbl         v15.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 7)
+    sub         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 7)
+
+    st1         {v24.8b},[x2], x3           //st (row 4)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
+
+    tbl         v10.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    tbl         v11.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    st1         {v22.8b},[x2], x3           //st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
+
+    st1         {v20.8b},[x2], x3           //st (row 6)
+
+    subs        x10, x10, #4                //subtract 8 and go to end if 8x8
+
+    st1         {v18.8b},[x2], x3           //st (row 7)
+
+    beq         end_func
+
+    subs        x11, x11, #8                //decrement the processed col
+    add         x20, x8, #4
+    csel        x8, x20, x8,gt
+    add         x20, x2, x7
+    csel        x2, x20, x2,gt
+    csel        x8, x12, x8,le
+    sub         x20, x2, x4
+    csel        x2, x20, x2,le
+    add         x20, x2, #8
+    csel        x2, x20, x2,le
+    lsl         x20, x4,  #1
+    csel        x11,x20,x11,le
+    bgt         lbl284
+    adrp        x14,  :got:col_for_intra_chroma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
+lbl284:
+    add         x20, x0, #8
+    csel        x0, x20, x0,le
+
+    ld1         {v31.8b},[x14],#8
+    smull       v12.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+    xtn         v10.8b,  v12.8h
+    sshr        v12.8h, v12.8h,#5
+    sqxtn       v11.8b,  v12.8h
+    shl         v11.8b, v11.8b,#1
+    mov         x5, #0x302                  //idx value for v is +1 of u
+    dup         v27.4h,w5                   //row value inc or reset accordingly
+    ldr         w9,  [x8]                   //loads index value
+    sxtw        x9,w9
+    lsl         x9, x9, #1
+    mov         x5, #22
+    sub         x5, x5, x0, lsl #1
+    dup         v16.8b,w5
+    dup         v26.8b,w9
+
+    mov         x5,x2
+    sub         v11.8b,  v11.8b ,  v27.8b   //ref_main_idx (sub row)
+
+kernel_8_16_32:
+    movi        v29.8b, #2                  //contains #2 for adding to get ref_main_idx + 1
+    sub         v8.8b,  v26.8b ,  v11.8b    //ref_main_idx
+    mov         v26.8b, v10.8b
+
+    subs        x11, x11, #8
+    sub         x6, x1, x9
+    tbl         v10.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+    add         v8.8b,  v8.8b ,  v16.8b     //to compensate the pu1_src idx incremented by 8
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    tbl         v11.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx - 1 (row 7)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    add         x20, x0, #8
+    csel        x0, x20, x0,le
+    sub         v9.8b,  v8.8b ,  v29.8b     //ref_main_idx - 2
+    add         x20, x8, #4
+    csel        x8, x20, x8,gt
+
+    ld1         {v0.16b, v1.16b}, [x6]      //loads the 32 values required, based on the index values (from max idx)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
+
+    bgt         lbl326
+    adrp        x14,  :got:col_for_intra_chroma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
+lbl326:
+    st1         {v24.8b},[x5], x3           //st (row 4)
+    csel        x8, x12, x8,le
+
+    mov         x9,#0x302
+    dup         v27.4h,w9                   //row value inc or reset accordingly
+    sub         v4.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 1)
+
+    sub         v5.8b,  v9.8b ,  v29.8b     //ref_main_idx - 1 (row 1)
+    tbl         v12.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
+    movi        v29.8b, #31                 //contains #31 for vand operation
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    tbl         v13.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    ld1         {v31.8b},[x14],#8
+    and         v6.8b,  v29.8b ,  v26.8b    //fract values in v6/ idx values in v26
+
+    lsl         x20, x4,  #1
+    csel        x11,x20,x11,le
+    movi        v29.8b, #4                  //contains #4 to advance ref_main_idx by two rows
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+
+    st1         {v22.8b},[x5], x3           //(from previous loop)st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
+
+    sub         v8.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 2)
+    tbl         v10.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
+    sub         v9.8b,  v9.8b ,  v29.8b     //ref_main_idx - 1 (row 2)
+
+    lsl         x9, x9, #1
+    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
+
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
+    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
+
+    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
+    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
+
+    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 3)
+    tbl         v14.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+    sub         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx - 1 (row 3)
+
+    umull       v22.8h, v10.8b, v7.8b       //mul (row 1)
+    tbl         v15.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
+
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
+    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)
+
+    sub         v8.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 4)
+    tbl         v10.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+    sub         v9.8b,  v9.8b ,  v29.8b     //ref_main_idx - 1 (row 4)
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
+    tbl         v11.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
+
+    add         x5,x2,x3,lsl#2
+    smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+    add         x9, x9, x0, lsl #1
+
+    st1         {v24.8b},[x2], x3           //st (row 0)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
+
+    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 5)
+    tbl         v12.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
+    sub         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx - 1 (row 5)
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 3)
+    tbl         v13.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 3)
+
+    st1         {v22.8b},[x2], x3           //st (row 1)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
+
+    xtn         v10.8b,  v14.8h
+    sshr        v14.8h, v14.8h,#5
+
+    sub         v8.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 6)
+    tbl         v21.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
+    sub         v9.8b,  v9.8b ,  v29.8b     //ref_main_idx - 1 (row 6)
+
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
+    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
+    sqxtn       v11.8b,  v14.8h
+
+    st1         {v20.8b},[x2], x3           //st (row 2)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
+
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
+    dup         v26.8b,w9
+
+    sub         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 7)
+    tbl         v14.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+    sub         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx - 1 (row 7)
+
+    mov         x6, #22                     //to compensate the 2*row value
+    shl         v11.8b, v11.8b,#1
+    sub         x6, x6, x0, lsl #1
+
+    umull       v22.8h, v21.8b, v7.8b       //mul (row 5)
+    tbl         v15.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
+
+    st1         {v18.8b},[x2], x3           //st (row 3)
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
+
+    add         x2,x2,x3, lsl #2
+    dup         v16.8b,w6
+    add         x20, x7, x2
+    csel        x2, x20, x2,gt
+
+    sub         x20, x2, x4
+    csel        x2, x20, x2,le
+    sub         v11.8b,  v11.8b ,  v27.8b   //ref_main_idx (add row)
+    sub         x20,x2,#8
+    csel        x2, x20, x2,le
+
+    subs        x10, x10, #4                //subtract 4 and go to end if 8x8
+
+    bne         kernel_8_16_32
+
+epil_8_16_32:
+    tbl         v10.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    tbl         v11.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    st1         {v24.8b},[x5], x3           //st (row 4)
+    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
+
+    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
+    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
+
+    st1         {v18.8b},[x5], x3           //st (row 7)
+
+end_func:
+    // ldmfd sp!,{x4-x12,x15}          //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
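+
+//For reference, a minimal C sketch of the two-tap filter that each
+//umull/umlal/rshrn triplet above implements (illustrative names only;
+//s0/s1 are the two neighbouring chroma samples fetched via tbl):
+//
+//    static inline uword8 two_tap(uword8 s0, uword8 s1, word32 fract)
+//    {
+//        return (uword8)(((32 - fract) * s0 + fract * s1 + 16) >> 5);
+//    }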
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_planar.s b/common/arm64/ihevc_intra_pred_chroma_planar.s
new file mode 100644
index 0000000..ac6b362
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_planar.s
@@ -0,0 +1,377 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_chroma_planar.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction chroma planar filtering.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//*  akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    chroma intra prediction filter for planar input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] pi1_coeff
+//*  word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_planar(uword8* pu1_ref,
+//                                    word32 src_strd,
+//                                    uword8* pu1_dst,
+//                                    word32 dst_strd,
+//                                    word32 nt,
+//                                    word32 mode,
+//                                    word32 pi1_coeff)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+//    nt
+//    mode
+//    pi1_coeff
+
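+//A minimal C sketch of the planar blend computed below, per output byte
+//and ignoring the UV interleave handled by the zip/ld2 code (names taken
+//from the register comments; illustrative only):
+//
+//    for(row = 0; row < nt; row++)
+//        for(col = 0; col < nt; col++)
+//            dst[row * dst_strd + col] =
+//                ((row + 1)        * src[nt - 1]
+//                 + (nt - 1 - row) * src[2 * nt + 1 + col]
+//                 + (col + 1)      * src[3 * nt + 1]
+//                 + (nt - 1 - col) * src[2 * nt - 1 - row]
+//                 + nt) >> shr;    /* shr = log2(nt) + 1 */
+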
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_chroma_planar_av8
+.extern gau1_ihevc_planar_factor
+
+
+.type ihevc_intra_pred_chroma_planar_av8, %function
+
+ihevc_intra_pred_chroma_planar_av8:
+
+    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    adrp        x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
+    ldr         x11, [x11, #:got_lo12:gau1_ihevc_planar_factor]
+
+    clz         w5,w4
+    sub         x20, x5, #32
+    neg         x5, x20
+    dup         v14.8h,w5
+    neg         v14.8h, v14.8h              //shr value (so vneg)
+    dup         v2.8b,w4                    //nt
+    dup         v16.8h,w4                   //nt
+
+    sub         x6, x4, #1                  //nt-1
+    add         x6, x0,x6,lsl #1            //2*(nt-1)
+    ldr         w7,  [x6]
+    sxtw        x7,w7
+    dup         v0.4h,w7                    //src[nt-1]
+
+    add         x6, x4, x4,lsl #1           //3nt
+    add         x6, x6, #1                  //3nt + 1
+    lsl         x6,x6,#1                    //2*(3nt + 1)
+
+    add         x6, x6, x0
+    ldr         w7,  [x6]
+    sxtw        x7,w7
+    dup         v1.4h,w7                    //src[3nt+1]
+
+
+    add         x6, x4, x4                  //2nt
+    add         x14, x6, #1                 //2nt+1
+    lsl         x14,x14,#1                  //2*(2nt+1)
+    sub         x6, x6, #1                  //2nt-1
+    lsl         x6,x6,#1                    //2*(2nt-1)
+    add         x6, x6, x0                  //&src[2nt-1]
+    add         x14, x14, x0                //&src[2nt+1]
+
+    mov         x8, #1                      //row+1 (row is first 0)
+    sub         x9, x4, x8                  //nt-1-row (row is first 0)
+
+    dup         v5.8b,w8                    //row + 1
+    dup         v6.8b,w9                    //nt - 1 - row
+    mov         v7.8b, v5.8b                //move #1 to v7, to be used as inc for row+1 and dec for nt-1-row
+
+    add         x12, x11, #1                //coeffs (to be reloaded after every row)
+    mov         x1, x4                      //nt (row counter) (dec after every row)
+    mov         x5, x2                      //dst (to be reloaded after every row and inc by dst_strd)
+    mov         x10, #8                     //increment for the coeffs
+    mov         x0, x14                     //&src[2nt+1] (to be reloaded after every row)
+
+    cmp         x4, #4
+    beq         tf_sz_4
+
+
+
+    mov         x10,x6
+tf_sz_8_16:
+    ld1         {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col]
+    ld1         {v8.8b},[x12],#8
+    mov         v9.8b, v8.8b
+    zip1        v29.8b, v8.8b, v9.8b
+    zip2        v9.8b, v8.8b, v9.8b
+    mov         v8.d[0], v29.d[0]
+    sub         v30.8b,  v2.8b ,  v8.8b     //[nt-1-col]
+    sub         v31.8b,  v2.8b ,  v9.8b
+
+
+
+
+loop_sz_8_16:
+
+    ldr         w7,  [x6], #-2              //src[2nt-1-row] (dec to take into account row)
+    sxtw        x7,w7
+    umull       v12.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
+    ldr         w11,  [x6], #-2             //src[2nt-1-row] (dec to take into account row)
+    sxtw        x11,w11
+    umlal       v12.8h, v6.8b, v10.8b       //(nt-1-row)    *    src[2nt+1+col]
+    dup         v4.4h,w7                    //src[2nt-1-row]
+    umlal       v12.8h, v8.8b, v1.8b        //(col+1)    *    src[3nt+1]
+    dup         v3.4h,w11                   //src[2nt-1-row]
+    umlal       v12.8h, v30.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
+
+
+
+    umull       v28.8h, v5.8b, v0.8b
+    ldr         w7,  [x6], #-2              //src[2nt-1-row] (dec to take into account row)
+    sxtw        x7,w7
+    umlal       v28.8h, v6.8b, v11.8b
+    add         v18.8b,  v5.8b ,  v7.8b     //row++ [(row+1)++]c
+
+
+    umlal       v28.8h, v31.8b, v4.8b
+    sub         v19.8b,  v6.8b ,  v7.8b     //[nt-1-row]--
+    umlal       v28.8h, v9.8b, v1.8b
+    dup         v4.4h,w7                    //src[2nt-1-row]
+
+    umull       v26.8h, v18.8b, v0.8b       //(row+1)    *    src[nt-1]
+    add         v12.8h,  v12.8h ,  v16.8h   //add (nt)
+    umlal       v26.8h, v19.8b, v10.8b      //(nt-1-row)    *    src[2nt+1+col]
+    sshl        v12.8h, v12.8h, v14.8h      //shr
+    umlal       v26.8h, v8.8b, v1.8b        //(col+1)    *    src[3nt+1]
+    add         v28.8h,  v28.8h ,  v16.8h
+    umlal       v26.8h, v30.8b, v3.8b       //(nt-1-col)    *    src[2nt-1-row]
+    sshl        v28.8h, v28.8h, v14.8h
+
+
+
+
+
+    umull       v24.8h, v18.8b, v0.8b
+    add         v5.8b,  v18.8b ,  v7.8b     //row++ [(row+1)++]
+    umlal       v24.8h, v19.8b, v11.8b
+    sub         v6.8b,  v19.8b ,  v7.8b     //[nt-1-row]--
+    umlal       v24.8h, v9.8b, v1.8b
+    xtn         v12.8b,  v12.8h
+    umlal       v24.8h, v31.8b, v3.8b
+    xtn         v13.8b,  v28.8h
+
+
+
+
+    add         v26.8h,  v26.8h ,  v16.8h   //add (nt)
+    umull       v22.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
+    sshl        v26.8h, v26.8h, v14.8h      //shr
+    umlal       v22.8h, v6.8b, v10.8b       //(nt-1-row)    *    src[2nt+1+col]
+    st1         {v12.2s, v13.2s}, [x2], x3
+    umlal       v22.8h, v8.8b, v1.8b        //(col+1)    *    src[3nt+1]
+    add         v24.8h,  v24.8h ,  v16.8h
+    umlal       v22.8h, v30.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
+    sshl        v24.8h, v24.8h, v14.8h
+
+    umull       v20.8h, v5.8b, v0.8b
+    add         v18.8b,  v5.8b ,  v7.8b     //row++ [(row+1)++]c
+    umlal       v20.8h, v6.8b, v11.8b
+    sub         v19.8b,  v6.8b ,  v7.8b     //[nt-1-row]--
+    umlal       v20.8h, v31.8b, v4.8b
+
+    ldr         w11,  [x6], #-2             //src[2nt-1-row] (dec to take into account row)
+    sxtw        x11,w11
+    umlal       v20.8h, v9.8b, v1.8b
+    dup         v3.4h,w11                   //src[2nt-1-row]
+    add         v22.8h,  v22.8h ,  v16.8h   //add (nt)
+
+    umull       v12.8h, v18.8b, v0.8b       //(row+1)    *    src[nt-1]
+    xtn         v26.8b,  v26.8h
+    umlal       v12.8h, v19.8b, v10.8b      //(nt-1-row)    *    src[2nt+1+col]
+    xtn         v27.8b,  v24.8h
+
+    umlal       v12.8h, v8.8b, v1.8b        //(col+1)    *    src[3nt+1]
+    sshl        v22.8h, v22.8h, v14.8h      //shr
+
+    umlal       v12.8h, v30.8b, v3.8b       //(nt-1-col)    *    src[2nt-1-row]
+    add         v20.8h,  v20.8h ,  v16.8h
+
+    umull       v28.8h, v18.8b, v0.8b
+    st1         {v26.2s, v27.2s}, [x2], x3
+
+    umlal       v28.8h, v19.8b, v11.8b
+    add         v5.8b,  v18.8b ,  v7.8b     //row++ [(row+1)++]
+
+    sub         v6.8b,  v19.8b ,  v7.8b     //[nt-1-row]--
+    umlal       v28.8h, v9.8b, v1.8b
+
+    umlal       v28.8h, v31.8b, v3.8b
+    sshl        v20.8h, v20.8h, v14.8h
+
+
+    add         v12.8h,  v12.8h ,  v16.8h   //add (nt)
+    xtn         v22.8b,  v22.8h
+
+
+    add         v28.8h,  v28.8h ,  v16.8h
+    xtn         v23.8b,  v20.8h
+
+
+    sshl        v12.8h, v12.8h, v14.8h      //shr
+    st1         {v22.2s, v23.2s}, [x2], x3
+    sshl        v28.8h, v28.8h, v14.8h
+
+
+
+
+
+    xtn         v20.8b,  v12.8h
+    xtn         v21.8b,  v28.8h
+
+    st1         {v20.2s, v21.2s}, [x2], x3
+
+
+    subs        x1, x1, #4
+
+    bne         loop_sz_8_16
+
+
+
+
+    cmp         x4,#16
+
+    bne         end_loop
+
+
+    sub         x4, x4,#16
+    dup         v5.8b,w8                    //row + 1
+    dup         v6.8b,w9                    //nt - 1 - row
+    mov         v7.8b, v5.8b                //move #1 to v7, to be used as inc for row+1 and dec for nt-1-row
+
+    mov         x6,x10
+    mov         x1,#16
+    sub         x2,x2,x3,lsl #4
+    add         x2,x2,#16
+
+    ld1         {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col]
+    ld1         {v8.8b},[x12],#8
+    mov         v9.8b, v8.8b
+    zip1        v29.8b, v8.8b, v9.8b
+    zip2        v9.8b, v8.8b, v9.8b
+    mov         v8.d[0], v29.d[0]
+    sub         v30.8b,  v2.8b ,  v8.8b     //[nt-1-col]
+    sub         v31.8b,  v2.8b ,  v9.8b
+
+    beq         loop_sz_8_16
+
+
+
+tf_sz_4:
+    ld1         {v10.8b},[x14]              //load src[2nt+1+col]
+    ld1         {v8.8b},[x12], x10          //load 8 coeffs [col+1]
+    mov         v9.8b, v8.8b
+    zip1        v29.8b, v8.8b, v9.8b
+    zip2        v9.8b, v8.8b, v9.8b
+    mov         v8.d[0], v29.d[0]
+loop_sz_4:
+    //mov        x10, #4                @reduce inc to #4 for 4x4
+    ldr         w7,  [x6], #-2              //src[2nt-1-row] (dec to take into account row)
+    sxtw        x7,w7
+    dup         v4.4h,w7                    //src[2nt-1-row]
+
+    sub         v9.8b,  v2.8b ,  v8.8b      //[nt-1-col]
+
+    umull       v12.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
+    umlal       v12.8h, v6.8b, v10.8b       //(nt-1-row)    *    src[2nt+1+col]
+    umlal       v12.8h, v8.8b, v1.8b        //(col+1)    *    src[3nt+1]
+    umlal       v12.8h, v9.8b, v4.8b        //(nt-1-col)    *    src[2nt-1-row]
+//    vadd.i16    q6, q6, q8            @add (nt)
+//    vshl.s16     q6, q6, q7            @shr
+//    vmovn.i16     d12, q6
+    rshrn       v12.8b, v12.8h,#3
+
+    st1         {v12.2s},[x2], x3
+
+    add         v5.8b,  v5.8b ,  v7.8b      //row++ [(row+1)++]
+    sub         v6.8b,  v6.8b ,  v7.8b      //[nt-1-row]--
+    subs        x1, x1, #1
+
+    bne         loop_sz_4
+
+end_loop:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_ver.s b/common/arm64/ihevc_intra_pred_chroma_ver.s
new file mode 100644
index 0000000..8d1daf7
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_ver.s
@@ -0,0 +1,232 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_chroma_ver_neon.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction vertical filtering.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//*  yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    chroma intra prediction filter for vertical input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_ver(uword8 *pu1_ref,
+//        word32 src_strd,
+//        uword8 *pu1_dst,
+//        word32 dst_strd,
+//        word32 nt,
+//        word32 mode)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+//    nt
+//    mode
+
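+//A minimal C sketch of this routine (illustrative): vertical prediction
+//copies the reference row above the block into every output row; each
+//column is an interleaved UV byte pair, hence the ld2/st2 pairs below.
+//
+//    for(row = 0; row < nt; row++)
+//        for(byte = 0; byte < 2 * nt; byte++)
+//            pu1_dst[row * dst_strd + byte] = pu1_ref[4 * nt + 2 + byte];
+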
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_chroma_ver_av8
+
+.type ihevc_intra_pred_chroma_ver_av8, %function
+
+ihevc_intra_pred_chroma_ver_av8:
+
+    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    lsl         x5, x4, #2                  //4nt
+
+
+    cmp         x4, #8
+    beq         blk_8
+    blt         blk_4
+
+copy_16:
+    add         x5, x5, #2                  //2nt+2
+    add         x6, x0, x5                  //&src[2nt+1]
+
+    add         x5, x2, x3                  //pu1_dst + dst_strd
+    ld2         {v20.8b, v21.8b}, [x6],#16  //16 loads (col 0:15)
+    add         x8, x5, x3
+
+    add         x10, x8, x3
+    ld2         {v22.8b, v23.8b}, [x6]      //16 loads (col 16:31)
+    lsl         x11, x3, #2
+
+    add         x11, x11, #-16
+
+
+    st2         {v20.8b, v21.8b}, [x2],#16
+    st2         {v20.8b, v21.8b}, [x5],#16
+    st2         {v20.8b, v21.8b}, [x8],#16
+    st2         {v20.8b, v21.8b}, [x10],#16
+
+    st2         {v22.8b, v23.8b}, [x2], x11
+    st2         {v22.8b, v23.8b}, [x5], x11
+    st2         {v22.8b, v23.8b}, [x8], x11
+    st2         {v22.8b, v23.8b}, [x10], x11
+
+    subs        x4, x4, #4
+
+kernel_copy_16:
+    st2         {v20.8b, v21.8b}, [x2],#16
+    st2         {v20.8b, v21.8b}, [x5],#16
+    st2         {v20.8b, v21.8b}, [x8],#16
+    st2         {v20.8b, v21.8b}, [x10],#16
+
+    st2         {v22.8b, v23.8b}, [x2], x11
+    st2         {v22.8b, v23.8b}, [x5], x11
+    st2         {v22.8b, v23.8b}, [x8], x11
+    st2         {v22.8b, v23.8b}, [x10], x11
+
+    subs        x4, x4, #4
+
+
+    st2         {v20.8b, v21.8b}, [x2],#16
+    st2         {v20.8b, v21.8b}, [x5],#16
+    st2         {v20.8b, v21.8b}, [x8],#16
+    st2         {v20.8b, v21.8b}, [x10],#16
+
+    st2         {v22.8b, v23.8b}, [x2], x11
+    st2         {v22.8b, v23.8b}, [x5], x11
+    st2         {v22.8b, v23.8b}, [x8], x11
+    st2         {v22.8b, v23.8b}, [x10], x11
+
+    subs        x4, x4, #4
+
+    st2         {v20.8b, v21.8b}, [x2],#16
+    st2         {v20.8b, v21.8b}, [x5],#16
+    st2         {v20.8b, v21.8b}, [x8],#16
+    st2         {v20.8b, v21.8b}, [x10],#16
+
+    st2         {v22.8b, v23.8b}, [x2], x11
+    st2         {v22.8b, v23.8b}, [x5], x11
+    st2         {v22.8b, v23.8b}, [x8], x11
+    st2         {v22.8b, v23.8b}, [x10], x11
+
+    subs        x4, x4, #4
+    bne         kernel_copy_16
+
+    b           end_func
+
+blk_8:
+
+    add         x5, x5, #2                  //2nt+2
+    add         x6, x0, x5                  //&src[2nt+1]
+
+    add         x5, x2, x3                  //pu1_dst + dst_strd
+    ld2         {v20.8b, v21.8b}, [x6],#16  //16 loads (col 0:15)
+    add         x8, x5, x3
+
+    add         x10, x8, x3
+    ld2         {v22.8b, v23.8b}, [x6]      //16 loads (col 16:31)
+
+    lsl         x11,x3,#2
+
+    st2         {v20.8b, v21.8b}, [x2],x11
+    st2         {v20.8b, v21.8b}, [x5],x11
+    st2         {v20.8b, v21.8b}, [x8],x11
+    st2         {v20.8b, v21.8b}, [x10],x11
+
+    st2         {v20.8b, v21.8b}, [x2]
+    st2         {v20.8b, v21.8b}, [x5]
+    st2         {v20.8b, v21.8b}, [x8]
+    st2         {v20.8b, v21.8b}, [x10]
+
+    subs        x4, x4, #8
+    beq         end_func
+
+blk_4:
+
+    //lsl        x5, x4, #2            @4nt
+    add         x5, x5, #2                  //2nt+2
+    add         x6, x0, x5                  //&src[2nt+1]
+
+    ld1         {v0.8b},[x6]
+    add         x5, x2, x3                  //pu1_dst + dst_strd
+
+    st1         {v0.8b},[x2]
+    add         x8, x5, x3
+    st1         {v0.8b},[x5]
+    add         x10, x8, x3
+    st1         {v0.8b},[x8]
+    st1         {v0.8b},[x10]
+
+
+
+end_func:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
new file mode 100644
index 0000000..e9f83ff
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
@@ -0,0 +1,623 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_chroma_mode_11_to_17.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction chroma mode 11 to 17
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//*  akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    chroma intra prediction filter for modes 11 to 17
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_mode_11_to_17(uword8* pu1_ref,
+//                               word32 src_strd,
+//                               uword8* pu1_dst,
+//                               word32 dst_strd,
+//                               word32 nt,
+//                               word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+//    nt
+//    mode
+
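+//A hedged C sketch of the reference construction done below (illustrative
+//names; src_uv/ref_neg view the buffers as uword16 UV pairs): the left
+//reference is copied in reverse, and when (nt * intra_pred_ang) >> 5
+//drops below -1, top samples are projected onto it using inv_ang:
+//
+//    word32 ref_idx = (nt * intra_pred_ang) >> 5;    /* <= -2 here */
+//    word32 inv_sum = 128;
+//    word32 count   = -(ref_idx + 1);
+//    while(count--)
+//    {
+//        inv_sum += inv_ang;
+//        *ref_neg-- = src_uv[2 * nt + (inv_sum >> 8)];  /* project top */
+//    }
+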
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_chroma_mode_11_to_17_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_chroma
+.extern idx_neg_idx_chroma_11_17
+
+.type ihevc_intra_pred_chroma_mode_11_to_17_av8, %function
+
+ihevc_intra_pred_chroma_mode_11_to_17_av8:
+
+    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    adrp        x7,  :got:gai4_ihevc_ang_table
+    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+    adrp        x8,  :got:gai4_ihevc_inv_ang_table
+    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+    add         x7, x7, x5, lsl #2          //gai4_ihevc_ang_table[mode]
+    add         x8, x8, x5, lsl #2          //gai4_ihevc_inv_ang_table[mode - 11]
+    sub         x8, x8, #44
+
+    ldr         w7,  [x7]                   //intra_pred_ang
+    sxtw        x7,w7
+    sub         sp, sp, #132                //ref_temp[2 * max_cu_size + 2]
+
+    ldr         w8,  [x8]                   //inv_ang
+    sxtw        x8,w8
+    add         x6, sp, x4, lsl #1          //ref_temp + 2 * nt
+
+    mul         x9, x4, x7                  //nt*intra_pred_ang
+
+    sub         x6, x6, #2                  //ref_temp + 2*nt - 2
+
+    add         x1, x0, x4, lsl #2          //x1 = &src[4nt]
+    dup         v30.8b,w7                   //intra_pred_ang
+
+    mov         x7, x4
+
+    sub         x1,x1,#6                    //address calculation for copying 4 halfwords
+
+    asr         x9, x9, #5
+
+    ld1         {v0.8b},[x1]
+    rev64       v0.4h,  v0.4h
+    st1         {v0.8b},[x6],#8
+
+    sub         x1, x1,#8
+
+    subs        x7, x7, #4
+    add         x20, x1,#8
+    csel        x1, x20, x1,eq
+    beq         end_loop_copy
+    subs        x7,x7,#4
+    beq         loop_copy_8
+    subs        x7,x7,#8
+    beq         loop_copy_16
+
+loop_copy_32:
+    sub         x1, x1,#24
+    ld1         {v0.16b, v1.16b},[x1]
+
+    sub         x1, x1,#24
+    ld1         {v0.16b, v1.16b},[x1],#32
+
+    rev64       v6.4h,  v6.4h
+    rev64       v5.4h,  v5.4h
+    rev64       v4.4h,  v4.4h
+    rev64       v3.4h,  v3.4h
+    rev64       v2.4h,  v2.4h
+    rev64       v1.4h,  v1.4h
+    rev64       v0.4h,  v0.4h
+
+    st1         {v6.8b},[x6],#8
+    st1         {v5.8b},[x6],#8
+    st1         {v4.8b},[x6],#8
+    st1         {v3.8b},[x6],#8
+    st1         {v2.8b},[x6],#8
+    st1         {v1.8b},[x6],#8
+    st1         {v0.8b},[x6],#8
+
+    ld1         {v4.8b, v5.8b, v6.8b},[x1],#24
+    b           end_loop_copy
+
+loop_copy_16:
+    sub         x1, x1,#16
+    ld1         {v0.8b, v1.8b, v2.8b},[x1]
+
+    rev64       v2.4h,  v2.4h
+    rev64       v1.4h,  v1.4h
+    rev64       v0.4h,  v0.4h
+
+    st1         {v2.8b},[x6],#8
+    st1         {v1.8b},[x6],#8
+    st1         {v0.8b},[x6],#8
+
+    b           end_loop_copy
+loop_copy_8:
+    ld1         {v0.8b},[x1]
+    rev64       v0.4h,  v0.4h
+    st1         {v0.8b},[x6],#8
+end_loop_copy:
+    sub         x1, x1,#2
+
+    ldrh        w11, [x1], #-2
+    sxtw        x11,w11
+    strh        w11, [x6], #2
+    sxtw        x11,w11
+
+    cmp         x9, #-1
+    bge         prologue_8_16_32
+
+    add         x6, sp, x4, lsl #1          //ref_temp + 2 * nt
+    sub         x6, x6, #4                  //ref_temp + 2 * nt - 2 - 2
+
+    mov         x12, #-1
+
+    sub         x20, x9, x12                //count to take care of ref_idx
+    neg         x9, x20
+
+    add         x1, x0, x4, lsl #2          //x1 = &src[4nt]
+
+    mov         x7, #128                    //inv_ang_sum
+
+loop_copy_ref_idx:
+
+    add         x7, x7, x8                  //inv_ang_sum += inv_ang
+
+    lsr         x0, x7, #8
+    lsl         x0, x0, #1
+
+    ldrh        w11, [x1, x0]
+    sxtw        x11,w11
+    strh        w11, [x6], #-2
+    sxtw        x11,w11
+
+    subs        x9, x9, #1
+
+    bne         loop_copy_ref_idx
+
+prologue_8_16_32:
+
+    adrp        x14,  :got:col_for_intra_chroma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
+
+    lsr         x10, x4, #3
+    ld1         {v31.8b},[x14],#8
+    mul         x10, x4, x10                //block counter (dec by #4)
+
+    lsl         x11, x4, #1                 //col counter to be inc/dec by #8
+    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+
+    sub         x7, x5, #11
+
+    adrp        x12, :got:idx_neg_idx_chroma_11_17 //load least idx table
+    ldr         x12, [x12, #:got_lo12:idx_neg_idx_chroma_11_17]
+
+    add         x12, x12, x7, lsl #4
+    mov         x8, x12
+
+    mov         x7, #8
+    sub         x7, x7, x3, lsl #3          //x7 = 8 - 8*dst_strd
+
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+    lsl         x9, x9, #1
+    add         x1, sp, x4, lsl #1          //ref_temp + 2nt
+
+    xtn         v6.8b,  v22.8h
+    dup         v26.8b,w9                   //least idx added to final idx values
+    sub         x1, x1, #2                  //ref_temp + 2nt - 2
+
+    add         x6, x1, x9
+
+    ld1         {v0.16b, v1.16b}, [x6]      //loads the 32 values required, based on the index values (from least idx)
+    sshr        v22.8h, v22.8h,#5
+
+//    mov        x0, #31
+    movi        v29.8b, #31                 //contains #31 for vand operation
+
+//    mov        x0, #32
+    movi        v28.8b, #32
+
+    sqxtn       v8.8b,  v22.8h
+    shl         v8.8b, v8.8b,#1             // 2 * idx
+
+    and         v6.8b,  v6.8b ,  v29.8b     //fract values in v6/ idx values in v8
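+    //net effect of the sequence above, for the first 8 columns c:
+    //    pos   = (c + 1) * intra_pred_angle
+    //    fract = pos & 31            (v6: interpolation weight)
+    //    idx   = (pos >> 5) << 1     (v8: byte offset, x2 for UV pairs)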
+
+//    mov        x0, #2
+    movi        v29.8b, #2                  //contains #2 for adding to get ref_main_idx + 1
+
+    mov         x0,#0x100                   // idx value for v is +1 of u
+    dup         v27.4h,w0
+    add         v27.8b,  v27.8b ,  v29.8b
+    mov         x0,#0
+
+    add         v8.8b,  v8.8b ,  v27.8b     //ref_main_idx (add row)
+    sub         v8.8b,  v8.8b ,  v26.8b     //ref_main_idx (row 0)
+    add         v9.8b,  v8.8b ,  v29.8b     //ref_main_idx + 1 (row 0)
+    tbl         v12.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
+    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
+
+    tbl         v13.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
+    add         v4.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 1)
+    add         v5.8b,  v9.8b ,  v29.8b     //ref_main_idx + 1 (row 1)
+
+//    mov        x0, #4                @ 2 *(row * 2 )
+    movi        v29.8b, #4
+
+    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
+
+    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
+    add         v8.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 2)
+    add         v9.8b,  v9.8b ,  v29.8b     //ref_main_idx + 1 (row 2)
+
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
+
+    tbl         v14.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
+
+    tbl         v15.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 3)
+    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 3)
+
+    st1         {v24.8b},[x2], x3           //st (row 0)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
+
+    tbl         v10.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
+
+    tbl         v11.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+    add         v8.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 4)
+    add         v9.8b,  v9.8b ,  v29.8b     //ref_main_idx + 1 (row 4)
+
+    st1         {v22.8b},[x2], x3           //st (row 1)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
+
+    tbl         v12.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 3)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 3)
+
+    tbl         v13.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
+    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 5)
+    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 5)
+
+    st1         {v20.8b},[x2], x3           //st (row 2)
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
+
+    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
+
+    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
+    add         v8.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 6)
+    add         v9.8b,  v9.8b ,  v29.8b     //ref_main_idx + 1 (row 6)
+
+    st1         {v18.8b},[x2], x3           //st (row 3)
+    cmp         x4,#4
+    beq         end_func
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
+
+    tbl         v14.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
+
+    tbl         v15.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 7)
+    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 7)
+
+    st1         {v24.8b},[x2], x3           //st (row 4)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
+
+    tbl         v10.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    tbl         v11.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    st1         {v22.8b},[x2], x3           //st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
+
+    st1         {v20.8b},[x2], x3           //st (row 6)
+
+    subs        x10, x10, #4                //subtract 4 and go to end if 8x8
+
+    st1         {v18.8b},[x2], x3           //st (row 7)
+
+    beq         end_func
+
+    subs        x11, x11, #8
+    add         x20, x8, #4
+    csel        x8, x20, x8,gt
+    add         x20, x2, x7
+    csel        x2, x20, x2,gt
+    csel        x8, x12, x8,le
+    sub         x20, x2, x4
+    csel        x2, x20, x2,le
+    add         x20, x2, #8
+    csel        x2, x20, x2,le
+    lsl         x20, x4,  #1
+    csel        x11,x20,x11,le
+    bgt         lbl400
+    adrp        x14,  :got:col_for_intra_chroma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
+lbl400:
+    add         x20, x0, #8
+    csel        x0, x20, x0,le
+
+    ld1         {v31.8b},[x14],#8
+    smull       v12.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+    xtn         v10.8b,  v12.8h
+    sshr        v12.8h, v12.8h,#5
+    sqxtn       v11.8b,  v12.8h
+    shl         v11.8b, v11.8b,#1
+    orr         x5,x0,x0, lsl#8
+    add         x5, x5,#0x002
+    add         x5, x5,#0x300
+    dup         v27.4h,w5                   //row value inc or reset accordingly
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+    lsl         x9, x9, #1
+    add         x9, x9, x0, lsl #1
+//    sub        x9, x9, #1
+    dup         v26.8b,w9
+    add         v8.8b,  v27.8b ,  v11.8b    //ref_main_idx (add row)
+    mov         x5,x2
+
+//    sub        x4,x4,#8
+
+kernel_8_16_32:
+    movi        v29.8b, #2                  //contains #2 for adding to get ref_main_idx + 1
+
+    sub         v8.8b,  v8.8b ,  v26.8b     //ref_main_idx
+    mov         v26.8b, v10.8b
+
+    subs        x11, x11, #8
+    add         x6, x1, x9
+    tbl         v10.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+    add         v9.8b,  v29.8b ,  v8.8b     //ref_main_idx + 1
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    tbl         v11.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    add         x20, x0, #8
+    csel        x0, x20, x0,le
+    add         x20, x8, #4
+    csel        x8, x20, x8,gt
+    ld1         {v0.16b, v1.16b}, [x6]      //loads the 32 values required, based on the index values (from least idx)
+
+    st1         {v24.8b},[x5], x3           //st (row 4)
+    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)
+
+    csel        x8, x12, x8,le
+    orr         x9,x0,x0, lsl#8
+    lsl         x9, x9, #1
+    add         x9, x9,#0x002
+    add         x9, x9,#0x300
+    dup         v27.4h,w9                   //row value inc or reset accordingly
+
+    bgt         lbl452
+    adrp        x14,  :got:col_for_intra_chroma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
+lbl452:
+
+    add         v4.8b,  v29.8b ,  v8.8b     //ref_main_idx (row 1)
+    tbl         v12.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
+    add         v5.8b,  v29.8b ,  v9.8b     //ref_main_idx + 1 (row 1)
+
+    movi        v29.8b, #31                 //contains #31 for vand operation
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    tbl         v13.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    ld1         {v31.8b},[x14],#8
+    and         v6.8b,  v29.8b ,  v26.8b    //fract values in v6/ idx values in v26
+
+    movi        v29.8b, #4                  //contains #4 to advance ref_main_idx by two rows
+
+    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
+
+    add         v8.8b,  v29.8b ,  v8.8b     //ref_main_idx (row 2)
+    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
+    add         v9.8b,  v29.8b ,  v9.8b     //ref_main_idx + 1 (row 2)
+
+    lsl         x20, x4,  #1
+    csel        x11,x20,x11,le
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+    lsl         x9, x9, #1
+    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
+
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
+    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
+
+    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
+    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
+
+    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 3)
+    tbl         v14.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 3)
+
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
+    tbl         v15.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
+
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
+    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)
+
+    add         v8.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 4)
+    tbl         v10.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+    add         v9.8b,  v9.8b ,  v29.8b     //ref_main_idx + 1 (row 4)
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
+    tbl         v11.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
+
+    smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+    add         x5,x2,x3,lsl#2
+    add         x9, x9, x0, lsl #1
+
+
+    st1         {v24.8b},[x2], x3           //st (row 0)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
+
+    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 5)
+    tbl         v12.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
+    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 5)
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 3)
+    tbl         v13.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 3)
+
+    st1         {v22.8b},[x2], x3           //st (row 1)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
+
+    xtn         v10.8b,  v14.8h
+    sshr        v14.8h, v14.8h,#5
+
+    add         v8.8b,  v8.8b ,  v29.8b     //ref_main_idx (row 6)
+    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
+    add         v9.8b,  v9.8b ,  v29.8b     //ref_main_idx + 1 (row 6)
+
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
+    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
+
+    st1         {v20.8b},[x2], x3           //st (row 2)
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
+
+//    sub        x9, x9, #1
+    sqxtn       v11.8b,  v14.8h
+
+    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 7)
+    tbl         v14.8b, {  v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 7)
+
+    shl         v11.8b, v11.8b,#1
+
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
+    tbl         v15.8b, {  v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
+
+    add         v8.8b,  v27.8b ,  v11.8b    //ref_main_idx (add row)
+    dup         v26.8b,w9
+
+    st1         {v18.8b},[x2], x3           //st (row 3)
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
+
+
+    add         x2, x2, x3, lsl #2
+    add         x20, x7, x2
+    csel        x2, x20, x2,gt
+    sub         x20, x2, x4, lsl #1
+    csel        x2, x20, x2,le
+    add         x20,x2,#8
+    csel        x2, x20, x2,le
+
+    subs        x10, x10, #4                //subtract 4 and go to end if 8x8
+
+    bne         kernel_8_16_32
+epil_8_16_32:
+
+    tbl         v10.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    tbl         v11.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    st1         {v24.8b},[x5], x3           //st (row 4)
+    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
+
+    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
+    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
+
+    st1         {v18.8b},[x5], x3           //st (row 7)
+
+end_func:
+    add         sp, sp, #132
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
new file mode 100644
index 0000000..3af2da7
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
@@ -0,0 +1,575 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_chroma_mode_19_to_25.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction chroma mode 19 to 25.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//*  naveen sr
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    chroma intra prediction filter for modes 19 to 25
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_mode_19_to_25(uword8* pu1_ref,
+//                               word32 src_strd,
+//                               uword8* pu1_dst,
+//                               word32 dst_strd,
+//                               word32 nt,
+//                               word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+//    nt
+//    mode
+
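+//A hedged C sketch of the reference construction done below (illustrative
+//names; src_uv/ref_neg view the buffers as uword16 UV pairs): the top
+//reference is copied straight, then left samples are projected in front
+//of it using inv_ang when (nt * intra_pred_ang) >> 5 drops below -1:
+//
+//    for(k = 0; k < nt + 1; k++)
+//        ref_temp[k] = src_uv[2 * nt + k];           /* straight copy */
+//    word32 inv_sum = 128;
+//    word32 count   = -(((nt * intra_pred_ang) >> 5) + 1);
+//    while(count-- > 0)
+//    {
+//        inv_sum += inv_ang;
+//        *ref_neg-- = src_uv[2 * nt - (inv_sum >> 8)];  /* project left */
+//    }
+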
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_chroma_mode_19_to_25_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern gau1_ihevc_planar_factor
+
+.type ihevc_intra_pred_chroma_mode_19_to_25_av8, %function
+
+ihevc_intra_pred_chroma_mode_19_to_25_av8:
+
+    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    adrp        x7,  :got:gai4_ihevc_ang_table
+    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+    adrp        x8,  :got:gai4_ihevc_inv_ang_table
+    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+    add         x7, x7, x5, lsl #2          //gai4_ihevc_ang_table[mode]
+    add         x8, x8, x5, lsl #2          //gai4_ihevc_inv_ang_table
+    sub         x8, x8, #48                 //gai4_ihevc_inv_ang_table[mode - 12]
+
+    ldr         w7,  [x7]                   //intra_pred_ang
+    sxtw        x7,w7
+    sub         sp, sp, #132                //ref_temp[2 * max_cu_size + 2]
+
+    ldr         w8,  [x8]                   //inv_ang
+    sxtw        x8,w8
+    add         x6, sp, x4 , lsl #1         //ref_temp + 2 * nt
+
+    mul         x9, x4, x7                  //nt*intra_pred_ang
+
+    sub         x6, x6, #2                  //ref_temp + 2*nt - 2
+
+    add         x1, x0, x4, lsl #2          //x1 = &src[4nt]
+    dup         v30.8b,w7                   //intra_pred_ang
+
+    mov         x7, x4
+
+    asr         x9, x9, #5
+
+    ld1         {v0.2s},[x1],#8             // pu1_ref[two_nt + k]
+
+    st1         {v0.2s},[x6],#8             //ref_temp[k + nt - 1] = pu1_ref[two_nt + k]//
+
+    subs        x7, x7, #4
+    beq         end_loop_copy
+    subs        x7,x7,#4
+    beq         loop_copy_8
+    subs        x7,x7,#8
+    beq         loop_copy_16
+
+loop_copy_32:
+    ld1         {v0.8b, v1.8b, v2.8b, v3.8b},[x1],#32
+    ld1         {v4.8b, v5.8b, v6.8b},[x1],#24
+
+    st1         {v0.8b, v1.8b, v2.8b, v3.8b},[x6],#32
+
+
+    st1         {v4.8b, v5.8b, v6.8b},[x6],#24
+    b           end_loop_copy
+
+loop_copy_16:
+    ld1         {v0.8b, v1.8b, v2.8b},[x1],#24
+    st1         {v0.8b, v1.8b, v2.8b},[x6],#24
+
+    b           end_loop_copy
+
+loop_copy_8:
+    ld1         {v0.8b},[x1],#8
+    st1         {v0.8b},[x6],#8
+
+end_loop_copy:
+
+    ldrh        w11, [x1]
+    sxtw        x11,w11
+    strh        w11, [x6]
+    sxtw        x11,w11
+
+    cmp         x9, #-1
+    bge         linear_filtering
+
+    add         x6, sp, x4 ,lsl #1          //ref_temp + 2 * nt
+    sub         x6, x6, #4                  //ref_temp + 2 * nt - 2 - 2
+
+    mov         x12, #-1
+
+    sub         x20, x9, x12                //count to take care of ref_idx
+    neg         x9, x20
+
+    add         x1, x0, x4, lsl #2          //x1 = &src[4nt]
+
+    mov         x7, #128                    //inv_ang_sum
+
+loop_copy_ref_idx:
+
+    add         x7, x7, x8                  //inv_ang_sum += inv_ang
+    lsr         x0, x7, #8
+    lsl         x0, x0, #1
+    neg         x20,x0
+    ldrh        w11, [x1, x20]
+    sxtw        x11,w11
+    strh        w11, [x6], #-2
+    sxtw        x11,w11
+
+    subs        x9, x9, #1
+
+    bne         loop_copy_ref_idx
+
+
+linear_filtering:
+//    after copy
+//    below code is taken from mode 27 to 33 and modified
+
+
+    adrp        x6,  :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
+    ldr         x6,  [x6, #:got_lo12:gai4_ihevc_ang_table]
+
+    lsl         x7,x4,#2                    //four_nt
+
+    add         x8,x6,x5,lsl #2             //*gai4_ihevc_ang_table[mode]
+    ldr         w9, [x8]                    //intra_pred_ang = gai4_ihevc_ang_table[mode]
+    sxtw        x9,w9
+    adrp        x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
+    ldr         x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
+
+    add         x6,x1,#1
+
+    add         x8, sp, x4, lsl #1          //ref_temp + 2 * nt
+    sub         x8, x8,#2                   //ref_temp + 2*nt -2
+
+    mov         x14,#0                      //row
+    mov         x12,x4
+    lsl         x4,x4,#1
+
+core_loop_8:
+    add         x8,x8,#2                    //pu1_ref_main_idx += (four_nt + 1)
+    dup         v0.8b,w9                    //intra_pred_ang
+    lsr         x12, x4, #4                 //x12 = (2*nt) >> 4 = nt/8
+
+    movi        v1.8b, #32
+    mul         x7, x4, x12
+
+    movi        v6.8h, #31
+
+
+    mov         x1,x8
+
+    mov         x5,x4
+    mov         x11,#2
+
+prologue:
+    ld1         {v3.8b},[x6]                //loads the row value
+    smull       v2.8h, v3.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+    xtn         v4.8b,  v4.8h
+    shrn        v5.8b, v2.8h,#5             //idx = pos >> 5
+    shl         v5.8b, v5.8b,#1
+
+    dup         v31.8b, v4.8b[0]
+    add         x0,x2,x3
+
+    smov        x14, v5.2s[0]               //(i row)extract idx to the r register
+//    lsl            x14,x14,#1
+
+    dup         v29.8b, v4.8b[1]            //(ii)
+    sbfx        x9,x14,#0,#8
+
+    add         x10,x8,x9                   //(i row)*pu1_ref[ref_main_idx]
+
+    ld1         {v8.8b},[x10],x11           //(i row)ref_main_idx
+    sbfx        x9,x14,#8,#8
+
+    ld1         {v9.8b},[x10]               //(i row)ref_main_idx_1
+    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]
+
+    sbfx        x9,x14,#16,#8
+    sub         v30.8b,  v1.8b ,  v31.8b    //32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    dup         v27.8b, v4.8b[2]            //(iii)
+    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
+    sbfx        x9,x14,#24,#8
+
+    dup         v25.8b, v4.8b[3]            //(iv)
+    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]
+
+    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
+    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
+    rshrn       v10.8b, v10.8h,#5           //(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
+    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)
+
+    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
+
+    dup         v31.8b, v4.8b[4]            //(v)
+    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    smov        x14, v5.2s[1]               //extract idx to the r register
+    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+//    lsl            x14,x14,#1
+
+    st1         {v10.8b},[x2],#8            //(i row)
+    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sbfx        x9,x14,#0,#8
+    dup         v29.8b, v4.8b[5]            //(vi)
+    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]
+
+    ld1         {v8.8b},[x10],x11           //(v)ref_main_idx
+    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
+
+    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    sbfx        x9,x14,#8,#8
+
+    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
+    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v14.8b},[x0],x3            //(ii)
+    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
+    dup         v27.8b, v4.8b[6]            //(vii)
+
+    sbfx        x9,x14,#16,#8
+    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v18.8b},[x0],x3            //(iii)
+    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    dup         v25.8b, v4.8b[7]            //(viii)
+    sbfx        x9,x14,#24,#8
+
+    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
+    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)
+
+    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
+    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
+    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subs        x7,x7,#8
+
+    st1         {v22.8b},[x0],x3            //(iv)
+    cmp         x4,#8                       // go to end if 4x4
+    beq         end_loops
+
+    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
+    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
+
+    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
+    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         x20,x8,#8
+    csel        x8, x20, x8,gt
+    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    sub         x20,x4,#8
+    csel        x4, x20, x4,gt
+
+    st1         {v10.8b},[x0],x3            //(v)
+    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+    beq         epilogue
+
+    ld1         {v5.8b},[x6]                //loads the row value
+    smull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+    xtn         v4.8b,  v4.8h
+    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
+    shl         v3.8b, v3.8b,#1
+    smov        x14, v3.2s[0]               //(i)extract idx to the r register
+//    lsl            x14,x14,#1
+    sbfx        x9,x14,#0,#8
+    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+    dup         v31.8b, v4.8b[0]
+    subs        x4,x4,#8
+    sbfx        x9,x14,#8,#8
+
+    ld1         {v8.8b},[x10],x11           //(i)ref_main_idx
+    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
+
+    add         x20,x6,#8                   //increment the row value
+    csel        x6, x20, x6,le
+    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]
+
+    ld1         {v9.8b},[x10]               //(i)ref_main_idx_1
+    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v5.8b},[x6]                //loads the row value
+    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    dup         v29.8b, v4.8b[1]            //(ii)
+    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sbfx        x9,x14,#16,#8
+
+    st1         {v14.8b},[x0],x3            //(vi)
+    sub         v30.8b,  v1.8b ,  v31.8b    //(i)32-fract(dup_const_32_fract)
+
+    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    sbfx        x9,x14,#24,#8
+    csel        x4, x5, x4,le               //reload nt
+
+    smov        x14, v3.2s[1]               //extract idx to the r register
+    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    dup         v27.8b, v4.8b[2]            //(iii)
+    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
+    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]
+
+    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
+    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    st1         {v18.8b},[x0],x3            //(vii)
+    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
+    rshrn       v10.8b, v10.8h,#5           //(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+    dup         v25.8b, v4.8b[3]            //(iv)
+    smull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+
+    st1         {v22.8b},[x0]               //(viii)
+    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)
+
+    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
+    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+//    lsl            x14,x14,#1
+
+    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
+    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    sbfx        x9,x14,#0,#8
+    add         x0,x2,x3
+
+    dup         v31.8b, v4.8b[4]            //(v)
+    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]
+    sbfx        x9,x14,#8,#8
+
+    st1         {v10.8b},[x2],#8            //(i)
+    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
+
+    dup         v29.8b, v4.8b[5]            //(vi)
+    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    dup         v27.8b, v4.8b[6]            //(vii)
+    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
+    sbfx        x9,x14,#16,#8
+
+    dup         v25.8b, v4.8b[7]            //(viii)
+    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v8.8b},[x10],x11           //(v)ref_main_idx
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+
+    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
+    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
+
+    st1         {v14.8b},[x0],x3            //(ii)
+    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]
+    sbfx        x9,x14,#24,#8
+
+    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
+    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
+
+    shl         v3.8b, v3.8b,#1
+
+    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
+    umull       v10.8h, v8.8b, v30.8b       //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    smov        x14, v3.2s[0]               //(i)extract idx to the r register
+    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
+    csel        x8, x1, x8,le               //reload the source to pu1_src+2nt
+
+    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
+    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)
+
+    st1         {v18.8b},[x0],x3            //(iii)
+    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
+    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
+    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
+    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
+
+    add         x20,x8,#8                   //increment the source next set 8 columns in same row
+    csel        x8, x20, x8,gt
+    lsl         x20, x3,#3
+    csel        x12,x20,x12,le
+    sub         x20,x12,x5
+    csel        x12, x20, x12,le
+
+    st1         {v22.8b},[x0],x3            //(iv)
+    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    st1         {v10.8b},[x0],x3            //(v)
+    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    add         x20,x2,x12                  //increment the dst pointer to 8*dst_strd - nt
+    csel        x2, x20, x2,le
+    sbfx        x9,x14,#0,#8
+
+    xtn         v4.8b,  v4.8h
+    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+//    lsl            x14,x14,#1
+
+    subs        x7,x7,#8
+    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]
+
+    bne         kernel_8_rows
+
+epilogue:
+    st1         {v14.8b},[x0],x3            //(vi)
+    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
+    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v18.8b},[x0],x3            //(vii)
+    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    st1         {v22.8b},[x0],x3            //(viii)
+    b           end_loops
+
+core_loop_4:
+
+end_loops:
+    add         sp, sp, #132
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
new file mode 100644
index 0000000..1502ad6
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
@@ -0,0 +1,697 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_luma_mode_11_to_17.s
+//*
+//* @brief
+//*  contains function definitions for luma intra prediction for modes 11 to 17.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* @author
+//*  akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    luma intra prediction filter for modes 11 to 17
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  intra prediction mode (between 11 and 17)
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode_11_to_17(uword8* pu1_ref,
+//                               word32 src_strd,
+//                               uword8* pu1_dst,
+//                               word32 dst_strd,
+//                               word32 nt,
+//                               word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//x4 => nt
+//x5 => mode
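+
+//for orientation, a rough C equivalent of this routine - a sketch only,
+//with names taken from the comments in this file, not a verbatim copy of
+//the reference C decoder:
+//
+//    ref_idx = (nt * intra_pred_ang) >> 5;
+//    // reverse the left reference into ref_temp and, when ref_idx < -1,
+//    // extend it with inverse-angle samples (see loop_copy_ref_idx)
+//    for (col = 0; col < nt; col++) {
+//        pos   = (col + 1) * intra_pred_ang;
+//        idx   = pos >> 5;
+//        fract = pos & 31;
+//        for (row = 0; row < nt; row++)
+//            pu1_dst[row * dst_strd + col] =
+//                (uword8)((ref_main[idx + row] * (32 - fract)
+//                        + ref_main[idx + row + 1] * fract + 16) >> 5);
+//    }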
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode_11_to_17_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_luma
+.extern idx_11_17
+
+.type ihevc_intra_pred_luma_mode_11_to_17_av8, %function
+
+ihevc_intra_pred_luma_mode_11_to_17_av8:
+
+    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    adrp        x7,  :got:gai4_ihevc_ang_table
+    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+    adrp        x8,  :got:gai4_ihevc_inv_ang_table
+    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+    add         x7, x7, x5, lsl #2          //gai4_ihevc_ang_table[mode]
+    add         x8, x8, x5, lsl #2          //gai4_ihevc_inv_ang_table[mode - 11]
+    sub         x8, x8, #44
+
+    ldr         w7,  [x7]                   //intra_pred_ang
+    sxtw        x7,w7
+    sub         sp, sp, #132                //ref_temp[2 * max_cu_size + 1]
+
+    ldr         w8,  [x8]                   //inv_ang
+    sxtw        x8,w8
+    add         x6, sp, x4                  //ref_temp + nt
+
+    mul         x9, x4, x7                  //nt*intra_pred_ang
+
+    sub         x6, x6, #1                  //ref_temp + nt - 1
+
+    add         x1, x0, x4, lsl #1          //x1 = &src[2nt]
+    dup         v30.8b,w7                   //intra_pred_ang
+
+    mov         x7, x4
+
+    ldrb        w11, [x1], #-1
+
+    asr         x9, x9, #5
+
+    ldrb        w12, [x1], #-1
+    ldrb        w10, [x1], #-1
+    ldrb        w14, [x1], #-1
+
+    strb        w11, [x6], #1
+    strb        w12, [x6], #1
+    strb        w10, [x6], #1
+    strb        w14, [x6], #1
+
+    subs        x7, x7, #4
+    beq         end_loop_copy
+
+    sub         x6, x6,#4
+    sub         x1, x1,#3
+
+    subs        x7,x7,#4
+    beq         loop_copy_8
+    subs        x7,x7,#8
+    beq         loop_copy_16
+
+loop_copy_32:
+    ld1         {v0.8b},[x1]
+    sub         x1, x1,#8
+    ld1         {v1.8b},[x1]
+    sub         x1, x1,#8
+    ld1         {v2.8b},[x1]
+    sub         x1, x1,#8
+    ld1         {v3.8b},[x1]
+
+    rev64       v0.8b,  v0.8b
+    rev64       v1.8b,  v1.8b
+    st1         {v0.8b},[x6],#8
+    rev64       v2.8b,  v2.8b
+    st1         {v1.8b},[x6],#8
+    rev64       v3.8b,  v3.8b
+    st1         {v2.8b},[x6],#8
+    st1         {v3.8b},[x6],#8
+    sub         x1, x1,#1
+    b           end_loop_copy
+
+loop_copy_16:
+    ld1         {v0.8b},[x1]
+    sub         x1, x1,#8
+    ld1         {v1.8b},[x1]
+
+    rev64       v0.8b,  v0.8b
+    rev64       v1.8b,  v1.8b
+
+    st1         {v0.8b},[x6],#8
+    st1         {v1.8b},[x6],#8
+    sub         x1, x1,#1
+    b           end_loop_copy
+
+loop_copy_8:
+    ld1         {v0.8b},[x1]
+    rev64       v0.8b,  v0.8b
+    st1         {v0.8b},[x6],#8
+    sub         x1, x1,#1
+end_loop_copy:
+
+    ldrb        w11, [x1], #-1
+    strb        w11, [x6], #1
+
+    cmp         x9, #-1
+    bge         prologue_8_16_32
+
+    add         x6, sp, x4                  //ref_temp + nt
+    sub         x6, x6, #2                  //ref_temp + nt - 2
+
+    mov         x12, #-1
+
+    sub         x20, x9, x12                //count to take care of ref_idx
+    neg         x9, x20
+
+    add         x1, x0, x4, lsl #1          //x1 = &src[2nt]
+
+    mov         x7, #128                    //inv_ang_sum
+
+loop_copy_ref_idx:
+
+    add         x7, x7, x8                  //inv_ang_sum += inv_ang
+
+    lsr         x20, x7, #8
+    ldrb        w11, [x1, x20]
+    strb        w11, [x6], #-1
+
+    subs        x9, x9, #1
+
+    bne         loop_copy_ref_idx
+
+prologue_8_16_32:
+    cmp         x4, #4
+    beq         sz_4_proc
+    adrp        x14,  :got:col_for_intra_luma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
+
+    lsr         x10, x4, #3
+    ld1         {v31.8b},[x14],#8
+    mul         x10, x4, x10                //block counter (dec by #8)
+
+    mov         x11, x4                     //col counter to be inc/dec by #8
+    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+    mov         x0, #1
+
+    sub         x7, x5, #11
+    dup         v2.8b,w0                    //contains #1 for adding to get ref_main_idx + 1
+
+    adrp        x12, :got:idx_neg_idx_11_17 //load least idx table
+    ldr         x12, [x12, #:got_lo12:idx_neg_idx_11_17]
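+
+    //(a reading of the code, not a documented contract: idx_neg_idx_11_17
+    //appears to hold, 16 bytes per mode, the least reference index each
+    //8-column group can touch, so one 16-byte load of ref_temp at that
+    //offset lets tbl gather every needed sample, roughly:
+    //    out[i] = ref16[ref_main_idx[i] - least_idx];   // i = 0..7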
+
+    mov         x0, #2
+    dup         v3.8b,w0
+
+    add         x12, x12, x7, lsl #4
+    mov         x8, x12
+
+    mov         x7, #8
+    sub         x7, x7, x3, lsl #3          //x7 = 8 - 8*dst_strd
+
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+    add         x1, sp, x4                  //ref_temp + nt
+
+    xtn         v6.8b,  v22.8h
+    dup         v26.8b,w9                   //least idx added to final idx values
+    sub         x1, x1, #1                  //ref_temp + nt - 1
+
+    add         x6, x1, x9
+
+    ld1         {v0.16b}, [x6]              //loads the 16 reference bytes required, based on the index values (from least idx)
+    sshr        v22.8h, v22.8h,#5
+
+    mov         x0, #31
+    dup         v29.8b,w0                   //contains #31 for vand operation
+
+    mov         x0, #32
+    dup         v28.8b,w0
+
+    sqxtn       v8.8b,  v22.8h
+
+    and         v6.8b,  v6.8b ,  v29.8b     //fract values in d1/ idx values in d0
+
+    mov         x0, #1
+    dup         v27.8b,w0                   //row value inc or reset accordingly
+
+    add         v8.8b,  v8.8b ,  v27.8b     //ref_main_idx (add row)
+    sub         v8.8b,  v8.8b ,  v26.8b     //ref_main_idx (row 0)
+    add         v9.8b,  v8.8b ,  v2.8b      //ref_main_idx + 1 (row 0)
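+    //successive rows bump these index vectors by +1 (v2) or +2 (v3), so
+    //row r of each column blends ref_main[idx + r] with
+    //ref_main[idx + r + 1] - roughly (32 - fract) * a + fract * b,
+    //rounded and shifted down by 5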
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 0)
+    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
+
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 0)
+    add         v4.8b,  v8.8b ,  v2.8b      //ref_main_idx (row 1)
+    add         v5.8b,  v9.8b ,  v2.8b      //ref_main_idx + 1 (row 1)
+
+    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
+
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
+    add         v8.8b,  v8.8b ,  v3.8b      //ref_main_idx (row 2)
+    add         v9.8b,  v9.8b ,  v3.8b      //ref_main_idx + 1 (row 2)
+
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
+
+    tbl         v14.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 2)
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
+
+    tbl         v15.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 2)
+    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 3)
+    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 3)
+
+    st1         {v24.8b},[x2], x3           //st (row 0)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
+
+    tbl         v10.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
+
+    tbl         v11.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
+    add         v8.8b,  v8.8b ,  v3.8b      //ref_main_idx (row 4)
+    add         v9.8b,  v9.8b ,  v3.8b      //ref_main_idx + 1 (row 4)
+
+    st1         {v22.8b},[x2], x3           //st (row 1)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
+
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 4)
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 3)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 3)
+
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 4)
+    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 5)
+    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 5)
+
+    st1         {v20.8b},[x2], x3           //st (row 2)
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
+
+    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
+
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
+    add         v8.8b,  v8.8b ,  v3.8b      //ref_main_idx (row 6)
+    add         v9.8b,  v9.8b ,  v3.8b      //ref_main_idx + 1 (row 6)
+
+    st1         {v18.8b},[x2], x3           //st (row 3)
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
+
+    tbl         v14.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 6)
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
+
+    tbl         v15.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 6)
+    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 7)
+    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 7)
+
+    st1         {v24.8b},[x2], x3           //st (row 4)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
+
+    tbl         v10.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    tbl         v11.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    st1         {v22.8b},[x2], x3           //st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
+
+    st1         {v20.8b},[x2], x3           //st (row 6)
+
+    subs        x10, x10, #8                //subtract 8 and go to end if 8x8
+
+    st1         {v18.8b},[x2], x3           //st (row 7)
+
+    beq         end_func
+
+    subs        x11, x11, #8
+    add         x20, x8, #4
+    csel        x8, x20, x8,gt
+    add         x20, x2, x7
+    csel        x2, x20, x2,gt
+    csel        x8, x12, x8,le
+    sub         x20, x2, x4
+    csel        x2, x20, x2,le
+    add         x20, x2, #8
+    csel        x2, x20, x2,le
+    csel        x11, x4, x11,le
+    bgt         lbl390
+    adrp        x14,  :got:col_for_intra_luma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
+lbl390:
+    add         x20, x0, #8
+    csel        x0, x20, x0,le
+
+    mov         x5,x2
+    ld1         {v31.8b},[x14],#8
+    smull       v12.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+    xtn         v10.8b,  v12.8h
+    sshr        v12.8h, v12.8h,#5
+    sqxtn       v11.8b,  v12.8h
+    dup         v27.8b,w0                   //row value inc or reset accordingly
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+    add         x9, x0, x9
+    sub         x9, x9, #1
+    dup         v26.8b,w9
+    add         v8.8b,  v27.8b ,  v11.8b    //ref_main_idx (add row)
+
+    sub         x4,x4,#8
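+
+    //note: the kernel below is software pipelined - the stores and round
+    //shifts tagged "(from previous loop)" retire rows 4..7 of the
+    //previous 8-row strip while rows 0..3 of the next strip are computed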
+
+kernel_8_16_32:
+
+    sub         v8.8b,  v8.8b ,  v26.8b     //ref_main_idx
+    mov         v26.8b, v10.8b
+
+    subs        x11, x11, #8
+    add         x6, x1, x9
+    tbl         v10.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
+    add         v9.8b,  v2.8b ,  v8.8b      //ref_main_idx + 1
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    tbl         v11.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    add         x20, x0, #8
+    csel        x0, x20, x0,le
+    add         x20, x8, #4
+    csel        x8, x20, x8,gt
+    ld1         {v0.16b}, [x6]              //loads the 16 reference bytes required, based on the index values (from least idx)
+
+    st1         {v24.8b},[x5], x3           //st (row 4)
+    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)
+
+    bgt         lbl429
+    adrp        x14,  :got:col_for_intra_luma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
+lbl429:
+    csel        x8, x12, x8,le
+    dup         v27.8b,w0                   //row value inc or reset accordingly
+
+    add         v4.8b,  v2.8b ,  v8.8b      //ref_main_idx (row 1)
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 0)
+    add         v5.8b,  v2.8b ,  v9.8b      //ref_main_idx + 1 (row 1)
+
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 0)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    ld1         {v31.8b},[x14],#8
+    and         v6.8b,  v29.8b ,  v26.8b    //fract values in d1/ idx values in d0
+
+    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
+
+    add         v8.8b,  v3.8b ,  v8.8b      //ref_main_idx (row 2)
+    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
+    add         v9.8b,  v3.8b ,  v9.8b      //ref_main_idx + 1 (row 2)
+
+    add         x20, x4, #8
+    csel        x11, x20, x11,le
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
+
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
+
+    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
+    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
+
+    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 3)
+    tbl         v14.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 2)
+    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 3)
+
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
+    tbl         v15.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 2)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
+
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
+    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)
+
+    add         v8.8b,  v8.8b ,  v3.8b      //ref_main_idx (row 4)
+    tbl         v10.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
+    add         v9.8b,  v9.8b ,  v3.8b      //ref_main_idx + 1 (row 4)
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
+    tbl         v11.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
+
+    smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+    add         x5,x2,x3,lsl#2
+    add         x9, x0, x9
+
+
+    st1         {v24.8b},[x2], x3           //st (row 0)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
+
+    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 5)
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 4)
+    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 5)
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 3)
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 4)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 3)
+
+    st1         {v22.8b},[x2], x3           //st (row 1)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
+
+    xtn         v10.8b,  v14.8h
+    sshr        v14.8h, v14.8h,#5
+
+    add         v8.8b,  v8.8b ,  v3.8b      //ref_main_idx (row 6)
+    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
+    add         v9.8b,  v9.8b ,  v3.8b      //ref_main_idx + 1 (row 6)
+
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
+
+    st1         {v20.8b},[x2], x3           //st (row 2)
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
+
+    sub         x9, x9, #1
+    sqxtn       v11.8b,  v14.8h
+
+    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 7)
+    tbl         v14.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 6)
+    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 7)
+
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
+    tbl         v15.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 6)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
+
+    add         v8.8b,  v27.8b ,  v11.8b    //ref_main_idx (add row)
+    dup         v26.8b,w9
+
+    st1         {v18.8b},[x2], x3           //st (row 3)
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
+
+
+    add         x2, x2, x3, lsl #2
+    add         x20, x7, x2
+    csel        x2, x20, x2,gt
+    sub         x20, x2, x4
+    csel        x2, x20, x2,le
+
+    subs        x10, x10, #8                //subtract 8 and go to end if 8x8
+
+    bne         kernel_8_16_32
+epil_8_16_32:
+
+    tbl         v10.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    tbl         v11.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    st1         {v24.8b},[x5], x3           //st (row 4)
+    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
+
+    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
+    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
+
+    st1         {v18.8b},[x5], x3           //st (row 7)
+
+
+    b           end_func
+
+sz_4_proc:
+    adrp        x14,  :got:col_for_intra_luma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
+
+    ld1         {v31.8b},[x14]
+    mov         x12, #1
+
+    dup         v2.8b,w12                   //contains #1 for adding to get ref_main_idx + 1
+    mov         x0, #2
+
+    dup         v3.8b,w0
+    adrp        x12, :got:idx_neg_idx_11_17 //load least idx table
+    ldr         x12, [x12, #:got_lo12:idx_neg_idx_11_17]
+
+    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+    sub         x7, x5, #11
+
+    add         x12, x12, x7, lsl #4
+    mov         x8, x12
+
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+
+    dup         v26.8b,w9                   //least idx added to final idx values
+    add         x6, sp, x4                  //ref_temp + nt
+
+    sub         x6, x6, #1                  //ref_temp + nt - 1
+    xtn         v6.8b,  v22.8h
+    add         x6, x6, x9
+
+    ld1         {v0.16b}, [x6]              //loads the 16 reference bytes required, based on the index values (from least idx)
+    mov         x0, #31
+
+    dup         v29.8b,w0                   //contains #31 for vand operation
+    mov         x1, #32
+
+    dup         v28.8b,w1
+
+    sshr        v22.8h, v22.8h,#5
+    sqxtn       v8.8b,  v22.8h
+
+    and         v6.8b,  v6.8b ,  v29.8b     //fract values in d1/ idx values in d0
+    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
+
+    add         v8.8b,  v8.8b ,  v2.8b      //ref_main_idx (add 1)
+    sub         v8.8b,  v8.8b ,  v26.8b     //ref_main_idx
+    add         v9.8b,  v8.8b ,  v2.8b      //ref_main_idx + 1
+
+    add         v4.8b,  v8.8b ,  v2.8b      //row 1 ref_main_idx
+    add         v5.8b,  v9.8b ,  v2.8b
+
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 0)
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 0)
+
+
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
+    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx    (row 1)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
+
+    add         v8.8b,  v8.8b ,  v3.8b      //idx (row 2)
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
+    add         v9.8b,  v9.8b ,  v3.8b      //idx+1 (row 2)
+
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx    (row 2)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
+
+    rshrn       v24.8b, v24.8h,#5           //round shift (row 0)
+
+    add         v4.8b,  v4.8b ,  v3.8b      //idx (row 3)
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 2)
+    add         v5.8b,  v5.8b ,  v3.8b      //idx+1 (row 3)
+
+    umull       v20.8h, v12.8b, v7.8b       //mul (row 2)
+    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
+    umlal       v20.8h, v13.8b, v6.8b       //mul (row 2)
+
+    st1         {v24.s}[0],[x2], x3         //st row 0
+    rshrn       v22.8b, v22.8h,#5           //round shift (row 1)
+
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
+
+    umull       v18.8h, v16.8b, v7.8b       //mul (row 3)
+    umlal       v18.8h, v17.8b, v6.8b       //mul (row 3)
+
+    st1         {v22.s}[0],[x2], x3         //st row 1
+    rshrn       v20.8b, v20.8h,#5           //round shift (row 2)
+
+    st1         {v20.s}[0],[x2], x3         //st row 2
+
+    rshrn       v18.8b, v18.8h,#5           //round shift (row 3)
+
+    st1         {v18.s}[0],[x2], x3         //st (row 3)
+
+end_func:
+    add         sp, sp, #132
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
new file mode 100644
index 0000000..fe7ac11
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
@@ -0,0 +1,665 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_luma_mode_19_to_25.s
+//*
+//* @brief
+//*  contains function definitions for luma intra prediction for modes 19 to 25.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* @author
+//*  naveen sr
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    luma intra prediction filter for modes 19 to 25
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  intra prediction mode (between 19 and 25)
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode_19_to_25(uword8* pu1_ref,
+//                               word32 src_strd,
+//                               uword8* pu1_dst,
+//                               word32 dst_strd,
+//                               word32 nt,
+//                               word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//x4 => nt
+//x5 => mode
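+
+//for orientation, a rough C equivalent of this routine (a sketch only,
+//not a verbatim copy of the reference C decoder):
+//
+//    ref_idx = (nt * intra_pred_ang) >> 5;
+//    // copy the top reference row into ref_temp and, when ref_idx < -1,
+//    // extend it to the left with inverse-angle samples from pu1_ref
+//    for (row = 0; row < nt; row++) {
+//        pos   = (row + 1) * intra_pred_ang;
+//        idx   = pos >> 5;
+//        fract = pos & 31;
+//        for (col = 0; col < nt; col++)
+//            pu1_dst[row * dst_strd + col] =
+//                (uword8)((ref_main[idx + col] * (32 - fract)
+//                        + ref_main[idx + col + 1] * fract + 16) >> 5);
+//    }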
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode_19_to_25_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern gau1_ihevc_planar_factor
+
+.type ihevc_intra_pred_luma_mode_19_to_25_av8, %function
+
+ihevc_intra_pred_luma_mode_19_to_25_av8:
+
+    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    adrp        x7,  :got:gai4_ihevc_ang_table
+    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+    adrp        x8,  :got:gai4_ihevc_inv_ang_table
+    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+    add         x7, x7, x5, lsl #2          //gai4_ihevc_ang_table[mode]
+    add         x8, x8, x5, lsl #2          //gai4_ihevc_inv_ang_table
+    sub         x8, x8, #48                 //gai4_ihevc_inv_ang_table[mode - 12]
+
+    ldr         w7,  [x7]                   //intra_pred_ang
+    sxtw        x7,w7
+    sub         sp, sp, #132                //ref_temp[2 * max_cu_size + 1]
+
+    ldr         w8,  [x8]                   //inv_ang
+    sxtw        x8,w8
+    add         x6, sp, x4                  //ref_temp + nt
+
+    mul         x9, x4, x7                  //nt*intra_pred_ang
+
+    sub         x6, x6, #1                  //ref_temp + nt - 1
+
+    add         x1, x0, x4, lsl #1          //x1 = &src[2nt]
+    dup         v30.8b,w7                   //intra_pred_ang
+
+    mov         x7, x4
+
+    asr         x9, x9, #5
+
+    ld1         {v0.s}[0],[x1],#4           // pu1_ref[two_nt + k]
+
+    st1         {v0.s}[0],[x6],#4           //ref_temp[k + nt - 1] = pu1_ref[two_nt + k]//
+
+    subs        x7, x7, #4
+    beq         end_loop_copy
+    sub         x1, x1,#4
+    sub         x6, x6,#4
+    subs        x7,x7,#4
+    beq         loop_copy_8
+    subs        x7,x7,#8
+    beq         loop_copy_16
+
+loop_copy_32:
+    ld1         {v0.8b},[x1],#8
+    ld1         {v1.8b},[x1],#8
+    ld1         {v2.8b},[x1],#8
+    ld1         {v3.8b},[x1],#8
+
+    st1         {v0.8b},[x6],#8
+    st1         {v1.8b},[x6],#8
+    st1         {v2.8b},[x6],#8
+    st1         {v3.8b},[x6],#8
+    b           end_loop_copy
+
+loop_copy_16:
+    ld1         {v0.8b},[x1],#8
+    ld1         {v1.8b},[x1],#8
+
+    st1         {v0.8b},[x6],#8
+    st1         {v1.8b},[x6],#8
+    b           end_loop_copy
+
+loop_copy_8:
+    ld1         {v0.8b},[x1],#8
+    st1         {v0.8b},[x6],#8
+
+end_loop_copy:
+
+    ldrb        w11, [x1]
+    strb        w11, [x6]
+
+    cmp         x9, #-1
+    bge         linear_filtering
+
+    add         x6, sp, x4                  //ref_temp + nt
+    sub         x6, x6, #2                  //ref_temp + nt - 2
+
+    mov         x12, #-1
+
+    sub         x20, x9, x12                //count to take care of ref_idx
+    neg         x9, x20
+
+    add         x1, x0, x4, lsl #1          //x1 = &src[2nt]
+
+    mov         x7, #128                    //inv_ang_sum
+
+loop_copy_ref_idx:
+
+    add         x7, x7, x8                  //inv_ang_sum += inv_ang
+    lsr         x14, x7, #8
+    neg         x20,x14
+    ldrb        w11, [x1, x20]
+//    ldrb        x11, [x1, -x7, lsr #8]
+    strb        w11, [x6], #-1
+
+    subs        x9, x9, #1
+
+    bne         loop_copy_ref_idx
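+
+//    i.e. roughly, in C (a sketch): for (i = -1; i > ref_idx; i--) {
+//        inv_ang_sum += inv_ang;
+//        *pu1_ref_temp-- = pu1_ref[two_nt - (inv_ang_sum >> 8)];
+//    }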
+
+
+linear_filtering:
+//    after copy
+//    below code is taken from mode 27 to 33 and modified
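+//
+//    per row r of each 8-row strip the kernel below computes, roughly
+//    (a sketch): idx_r = pos_r >> 5, fract_r = pos_r & 31, then
+//        dst[r][c] = (ref_main[idx_r + c] * (32 - fract_r)
+//                   + ref_main[idx_r + c + 1] * fract_r + 16) >> 5;
+//    with eight rows kept in flight to hide the load latencies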
+
+    adrp        x6,  :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
+    ldr         x6,  [x6, #:got_lo12:gai4_ihevc_ang_table]
+
+    add         x8,x6,x5,lsl #2             //*gai4_ihevc_ang_table[mode]
+    ldr         w9, [x8]                    //intra_pred_ang = gai4_ihevc_ang_table[mode]
+    sxtw        x9,w9
+    adrp        x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
+    ldr         x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
+    add         x6,x1,#1
+
+    add         x8, sp, x4                  //ref_temp + nt
+    sub         x8, x8,#1                   //ref_temp + nt -1
+
+    tst         x4,#7
+    mov         x14,#0                      //row
+    mov         x12,x4
+    bne         core_loop_4
+
+core_loop_8:
+    add         x8,x8,#1                    //pu1_ref_main_idx += (two_nt + 1)
+    dup         v0.8b,w9                    //intra_pred_ang
+    lsr         x12, x4, #3                 //divide by 8
+
+    movi        v1.8b, #32
+    mul         x7, x4, x12
+
+    movi        v6.8h, #31
+    //lsl            x12,x3,#3
+
+    mov         x1,x8
+    //sub            x12,x12,x4
+    mov         x5,x4
+    mov         x11,#1
+
+prologue:
+    ld1         {v3.8b},[x6]                //loads the row value
+    smull       v2.8h, v3.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+    xtn         v4.8b,  v4.8h
+    shrn        v5.8b, v2.8h,#5             //idx = pos >> 5
+
+    dup         v31.8b, v4.8b[0]
+    add         x0,x2,x3
+
+    umov        w14, v5.2s[0]               //(i row)extract idx to the r register
+    sxtw        x14,w14
+
+    dup         v29.8b, v4.8b[1]            //(ii)
+    sbfx        x9,x14,#0,#8
+
+    add         x10,x8,x9                   //(i row)*pu1_ref[ref_main_idx]
+
+    ld1         {v8.8b},[x10],x11           //(i row)ref_main_idx
+    sbfx        x9,x14,#8,#8
+
+    ld1         {v9.8b},[x10]               //(i row)ref_main_idx_1
+    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]
+
+    sbfx        x9,x14,#16,#8
+    sub         v30.8b,  v1.8b ,  v31.8b    //32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    dup         v27.8b, v4.8b[2]            //(iii)
+    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
+    sbfx        x9,x14,#24,#8
+
+    dup         v25.8b, v4.8b[3]            //(iv)
+    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]
+
+    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
+    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
+    rshrn       v10.8b, v10.8h,#5           //(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
+    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)
+
+    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
+
+    dup         v31.8b, v4.8b[4]            //(v)
+    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    umov        w14, v5.2s[1]               //extract idx to the r register
+    sxtw        x14,w14
+    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v10.8b},[x2],#8            //(i row)
+    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sbfx        x9,x14,#0,#8
+    dup         v29.8b, v4.8b[5]            //(vi)
+    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]
+
+    ld1         {v8.8b},[x10],x11           //(v)ref_main_idx
+    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
+
+    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    sbfx        x9,x14,#8,#8
+
+    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
+    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v14.8b},[x0],x3            //(ii)
+    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
+    dup         v27.8b, v4.8b[6]            //(vii)
+
+    sbfx        x9,x14,#16,#8
+    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v18.8b},[x0],x3            //(iii)
+    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    dup         v25.8b, v4.8b[7]            //(viii)
+    sbfx        x9,x14,#24,#8
+
+    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
+    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)
+
+    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
+    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
+    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subs        x4,x4,#8
+
+    st1         {v22.8b},[x0],x3            //(iv)
+    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
+    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
+
+    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
+    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         x20,x8,#8
+    csel        x8, x20, x8,gt
+    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    sub         x20,x7,#8
+    csel        x7, x20, x7,gt
+
+    st1         {v10.8b},[x0],x3            //(v)
+    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+    beq         epilogue
+
+    ld1         {v5.8b},[x6]                //loads the row value
+    smull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+    xtn         v4.8b,  v4.8h
+    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
+    umov        w14, v3.2s[0]               //(i)extract idx to the r register
+    sxtw        x14,w14
+    sbfx        x9,x14,#0,#8
+    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+    dup         v31.8b, v4.8b[0]
+    subs        x4,x4,#8
+    sbfx        x9,x14,#8,#8
+
+    ld1         {v8.8b},[x10],x11           //(i)ref_main_idx
+    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
+
+    add         x20,x6,#8                   //increment the row value
+    csel        x6, x20, x6,le
+    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]
+
+    ld1         {v9.8b},[x10]               //(i)ref_main_idx_1
+    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v5.8b},[x6]                //loads the row value
+    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    dup         v29.8b, v4.8b[1]            //(ii)
+    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sbfx        x9,x14,#16,#8
+
+    st1         {v14.8b},[x0],x3            //(vi)
+    sub         v30.8b,  v1.8b ,  v31.8b    //(i)32-fract(dup_const_32_fract)
+
+    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    sbfx        x9,x14,#24,#8
+    csel        x4, x5, x4,le               //reload nt
+
+    umov        w14, v3.2s[1]               //extract idx to the r register
+    sxtw        x14,w14
+    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    dup         v27.8b, v4.8b[2]            //(iii)
+    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
+    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]
+
+    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
+    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    st1         {v18.8b},[x0],x3            //(vii)
+    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
+    rshrn       v10.8b, v10.8h,#5           //(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+    dup         v25.8b, v4.8b[3]            //(iv)
+    smull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+
+    st1         {v22.8b},[x0]               //(viii)
+    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)
+
+    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
+    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
+    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    sbfx        x9,x14,#0,#8
+    add         x0,x2,x3
+
+    dup         v31.8b, v4.8b[4]            //(v)
+    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]
+    sbfx        x9,x14,#8,#8
+
+    st1         {v10.8b},[x2],#8            //(i)
+    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
+
+    dup         v29.8b, v4.8b[5]            //(vi)
+    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    dup         v27.8b, v4.8b[6]            //(vii)
+    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
+    sbfx        x9,x14,#16,#8
+
+    dup         v25.8b, v4.8b[7]            //(viii)
+    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v8.8b},[x10],x11           //(v)ref_main_idx
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+
+    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
+    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
+
+    st1         {v14.8b},[x0],x3            //(ii)
+    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]
+    sbfx        x9,x14,#24,#8
+
+    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
+    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
+
+    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
+    umull       v10.8h, v8.8b, v30.8b       //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    umov        w14, v3.2s[0]               //(i)extract idx to the r register
+    sxtw        x14,w14
+    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
+    csel        x8, x1, x8,le               //reload the source to pu1_src+2nt
+
+    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
+    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)
+
+    st1         {v18.8b},[x0],x3            //(iii)
+    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
+    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
+    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
+    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
+
+    add         x20,x8,#8                   //increment the source next set 8 columns in same row
+    csel        x8, x20, x8,gt
+    lsl         x20, x3,#3
+    csel        x12,x20,x12,le
+    sub         x20,x12,x5
+    csel        x12, x20, x12,le
+
+    st1         {v22.8b},[x0],x3            //(iv)
+    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    st1         {v10.8b},[x0],x3            //(v)
+    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    add         x20,x2,x12                  //increment the dst pointer to 8*dst_strd - nt
+    csel        x2, x20, x2,le
+    sbfx        x9,x14,#0,#8
+
+    xtn         v4.8b,  v4.8h
+    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+    subs        x7,x7,#8
+    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]
+
+    bne         kernel_8_rows
+
+epilogue:
+    st1         {v14.8b},[x0],x3            //(vi)
+    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
+    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v18.8b},[x0],x3            //(vii)
+    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    st1         {v22.8b},[x0],x3            //(viii)
+    b           end_loops
+
+core_loop_4:
+    add         x6,x8,#1                    //pu1_ref_main = pu1_ref + two_nt + 1
+    mov         x8,#0                       //row = 0
+
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    asr         x14, x5, #5                 //idx = pos >> 5
+    and         x5,x5,#31                   //fract = pos & (31)
+    add         x10,x6,x14                  //pu1_ref_main_idx = pu1_ref_main + idx
+    add         x11,x10,#1                  //pu1_ref_main_idx_1 = pu1_ref_main_idx + 1
+    dup         v0.8b,w5                    //dup_const_fract
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v1.8b,w4                    //dup_const_32_fract
+
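+//the four unrolled stages below all apply the same two-tap interpolation;
+//a minimal c sketch of one predicted row, assuming ref_main points to the
+//main reference array and dst to the current output row (names are
+//illustrative, not part of this source):
+//
+//    #include <stdint.h>
+//    static void pred_one_row(uint8_t *dst, const uint8_t *ref_main,
+//                             int row, int intra_pred_ang, int nt)
+//    {
+//        int pos   = (row + 1) * intra_pred_ang;
+//        int idx   = pos >> 5;               /* asr x14, x5, #5 */
+//        int fract = pos & 31;               /* and x5, x5, #31 */
+//        for(int col = 0; col < nt; col++)   /* umull/umlal + rshrn #5 */
+//            dst[col] = (uint8_t)(((32 - fract) * ref_main[idx + col]
+//                     + fract * ref_main[idx + col + 1] + 16) >> 5);
+//    }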
+//inner_loop_4
+    ld1         {v2.s}[0],[x10]             //ref_main_idx
+    add         x8,x8,#1
+//    mov            x14,x5                            @fract_prev = fract
+
+    ld1         {v3.s}[0],[x11]             //ref_main_idx_1
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    asr         x14, x5, #5                 //idx = pos >> 5
+    and         x5,x5,#31                   //fract = pos & (31)
+    add         x10,x6,x14                  //pu1_ref_main_idx = pu1_ref_main + idx
+    add         x11,x10,#1                  //pu1_ref_main_idx_1 = pu1_ref_main_idx + 1
+
+    dup         v6.8b,w5                    //dup_const_fract
+    umull       v4.8h, v2.8b, v1.8b         //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v7.8b,w4                    //dup_const_32_fract
+    umlal       v4.8h, v3.8b, v0.8b         //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v8.s}[0],[x10]             //ref_main_idx
+    add         x8,x8,#1
+
+    ld1         {v9.s}[0],[x11]             //ref_main_idx_1
+    rshrn       v4.8b, v4.8h,#5             //shift_res = vrshrn_n_u16(add_res, 5)
+
+//    mov            x14,x5                            @fract_prev = fract
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    asr         x14, x5, #5                 //idx = pos >> 5
+    and         x5,x5,#31                   //fract = pos & (31)
+    add         x10,x6,x14                  //pu1_ref_main_idx = pu1_ref_main + idx
+    add         x11,x10,#1                  //pu1_ref_main_idx_1 = pu1_ref_main_idx + 1
+
+    dup         v12.8b,w5                   //dup_const_fract
+    umull       v10.8h, v8.8b, v7.8b        //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v13.8b,w4                   //dup_const_32_fract
+    umlal       v10.8h, v9.8b, v6.8b        //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v14.s}[0],[x10]            //ref_main_idx
+    add         x8,x8,#1
+
+    st1         {v4.s}[0],[x2],x3
+    rshrn       v10.8b, v10.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v15.s}[0],[x11]            //ref_main_idx_1
+//    mov            x14,x5                            @fract_prev = fract
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    asr         x14, x5, #5                 //idx = pos >> 5
+    and         x5,x5,#31                   //fract = pos & (31)
+    add         x10,x6,x14                  //pu1_ref_main_idx = pu1_ref_main + idx
+    add         x11,x10,#1                  //pu1_ref_main_idx_1 = pu1_ref_main_idx + 1
+
+    dup         v18.8b,w5                   //dup_const_fract
+    umull       v16.8h, v14.8b, v13.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v19.8b,w4                   //dup_const_32_fract
+    umlal       v16.8h, v15.8b, v12.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v20.s}[0],[x10]            //ref_main_idx
+
+    st1         {v10.s}[0],[x2],x3
+    rshrn       v16.8b, v16.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
+    ld1         {v21.s}[0],[x11]            //ref_main_idx_1
+
+    umull       v22.8h, v20.8b, v19.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)
+    umlal       v22.8h, v21.8b, v18.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v16.s}[0],[x2],x3
+    rshrn       v22.8b, v22.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
+
+    st1         {v22.s}[0],[x2],x3
+
+end_loops:
+    add         sp, sp, #132
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_dc.s b/common/arm64/ihevc_intra_pred_luma_dc.s
new file mode 100644
index 0000000..7683266
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_dc.s
@@ -0,0 +1,519 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_luma_dc.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction dc filtering.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* @author
+//*  akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    luma intraprediction filter for dc input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] pi1_coeff
+//*  word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  intra prediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
+//                              word32 src_strd,
+//                              uword8 *pu1_dst,
+//                              word32 dst_strd,
+//                              word32 nt,
+//                              word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+//    nt
+//    mode
+//    pi1_coeff
+
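+//a minimal c sketch of the dc prediction this routine vectorises,
+//assuming nt < 32 so the boundary-filter path is taken (for nt == 32 the
+//block is filled with dc_val alone, see prologue_cpy_32); names are
+//illustrative:
+//
+//    #include <stdint.h>
+//    static void dc_sketch(const uint8_t *pu1_ref, uint8_t *pu1_dst,
+//                          int dst_strd, int nt, int log2nt)
+//    {
+//        int two_nt = 2 * nt, acc = nt, dc;
+//        for(int i = 0; i < nt; i++)          /* add_loop accumulation */
+//            acc += pu1_ref[nt + i] + pu1_ref[two_nt + 1 + i];
+//        dc = acc >> (log2nt + 1);            /* sshl d9, d6, d8 */
+//        pu1_dst[0] = (uint8_t)((pu1_ref[two_nt - 1] + 2 * dc
+//                              + pu1_ref[two_nt + 1] + 2) >> 2);
+//        for(int x = 1; x < nt; x++)          /* filtered first row */
+//            pu1_dst[x] = (uint8_t)((pu1_ref[two_nt + 1 + x] + 3 * dc + 2) >> 2);
+//        for(int y = 1; y < nt; y++)          /* filtered first column */
+//            pu1_dst[y * dst_strd] =
+//                (uint8_t)((pu1_ref[two_nt - 1 - y] + 3 * dc + 2) >> 2);
+//        for(int y = 1; y < nt; y++)          /* interior: plain dc_val */
+//            for(int x = 1; x < nt; x++)
+//                pu1_dst[y * dst_strd + x] = (uint8_t)dc;
+//    }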
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_luma_dc_av8
+
+.type ihevc_intra_pred_luma_dc_av8, %function
+
+ihevc_intra_pred_luma_dc_av8:
+
+    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+
+//********** testing
+    //mov        x6, #128
+    //b        prologue_cpy_32
+//********** testing
+
+    mov         x11, #2                     //mov #2 to x11 (to be used to add to 2dc_val & 3dc_val)
+    mov         x9, #0
+    mov         v17.s[0], w11
+    mov         v17.s[1], w9
+
+    clz         w5,w4
+
+    add         x6, x0, x4                  //&src[nt]
+    sub         x20, x5, #32                //clz(nt) - 32
+    neg         x5, x20                     //log2nt + 1
+    add         x7, x0, x4, lsl #1          //&src[2nt]
+
+    add         x8, x7, #1                  //&src[2nt+1]
+    mvn         x5, x5
+    add         x5, x5, #1                  //x5 = -(log2nt + 1), a right shift when used with sshl
+    dup         v8.2s,w5
+
+    ldrb        w14, [x8]
+    sxtw        x14,w14
+    shl         d8, d8,#32
+
+    sub         x9, x7, #1                  //&src[2nt-1]
+    sshr        d8, d8,#32
+
+    mov         x7, x8                      //x7 also stores 2nt+1
+
+    ldrb        w12, [x9]
+    sxtw        x12,w12
+    add         x14, x14, x12               //src[2nt+1] + src[2nt-1]
+    add         x14, x14, x11               //src[2nt+1] + src[2nt-1] + 2
+
+    cmp         x4, #4
+    beq         dc_4
+
+    mov         x10, x4                     //nt
+
+add_loop:
+    ld1         {v0.8b},[x6],#8             //load from src[nt]
+    mov         x5, #0                      //x5 = 0 (clears upper accumulator lane)
+    ld1         {v1.8b},[x8],#8             //load from src[2nt+1]
+
+    uaddlp      v2.4h,  v0.8b
+
+    mov         v6.s[0], w4
+    mov         v6.s[1], w5                 //store nt to accumulate
+    uaddlp      v3.4h,  v1.8b
+
+    ld1         {v0.8b},[x6],#8             //load from src[nt] (extra load for 8)
+
+    ld1         {v1.8b},[x8],#8             //load from src[2nt+1] (extra load for 8)
+    add         v4.4h,  v2.4h ,  v3.4h
+
+
+    uaddlp      v5.2s,  v4.4h
+
+
+    uadalp      v6.1d,  v5.2s               //accumulate all inp into d6 (end for nt==8)
+
+    subs        x10, x10,#8
+    beq         epil_add_loop
+
+core_loop_add:
+    uaddlp      v2.4h,  v0.8b
+    subs        x10, x10,#8
+    uaddlp      v3.4h,  v1.8b
+
+
+
+    add         v4.4h,  v2.4h ,  v3.4h
+    ld1         {v0.8b},[x6],#8             //load from src[nt] (extra load for 16)
+
+    uaddlp      v5.2s,  v4.4h
+    ld1         {v1.8b},[x8],#8             //load from src[2nt+1] (extra load for 16)
+
+    uadalp      v6.1d,  v5.2s               //accumulate all inp into d6
+    bne         core_loop_add
+
+epil_add_loop:
+
+    sshl        d9, d6, d8                  //(dc_val) shr by log2nt+1
+    cmp         x4, #32
+
+    mov         v28.s[0], w14
+    mov         v28.s[1], w5                //src[2nt+1]+2+src[2nt-1] moved to d28
+    mov         x20,#128
+    csel        x6, x20, x6,eq
+
+    dup         v16.8b, v9.8b[0]            //dc_val
+    shl         d13, d9,#1                  //2*dc
+
+    beq         prologue_cpy_32
+
+    add         d14,  d13 ,  d28            //src[2nt+1]+2+src[2nt-1]+2dc_val
+    mov         x20,#0
+    csel        x6, x20, x6,ne              //nt
+
+    ushr        v15.4h, v14.4h,#2           //final dst[0]'s value in d15[0]
+    csel        x10, x4, x10,ne
+
+    add         d11,  d13 ,  d9             //3*dc
+    sub         x12, x3, x3, lsl #3         //-7*strd
+
+    add         d11,  d11 ,  d17            //3*dc + 2
+    add         x12, x12, #8                //offset after one 8x8 block (-7*strd + 8)
+
+    dup         v24.8h, v11.4h[0]           //3*dc + 2 (moved to all lanes)
+    sub         x0, x3, x4                  //strd - nt
+
+prologue_col:
+    //0th column and 0-7 rows done here
+    //x8 and x9 (2nt+1+col 2nt-1-row)
+
+    mov         x8, x7                      //&src[2nt+1]
+
+    add         x0, x0, #8                  //strd - nt + 8
+    ld1         {v0.8b},[x8],#8             //col 1::7 load (prol)
+    sub         x9, x9, #7                  //&src[2nt-1-row]
+
+    ld1         {v1.8b},[x9]                //row 7::1 (0 also) load (prol)
+    sub         x9, x9, #8
+
+    uxtl        v20.8h, v0.8b
+
+    ld1         {v6.8b},[x8]                //col 8::15 load (prol extra)
+    add         v20.8h,  v20.8h ,  v24.8h   //col 1::7 add 3dc+2 (prol)
+
+    uxtl        v22.8h, v1.8b
+    sqshrun     v2.8b, v20.8h,#2            //columns shx2 movn (prol)
+
+    uxtl        v26.8h, v6.8b
+    add         v22.8h,  v22.8h ,  v24.8h   //row 1::7 add 3dc+2 (prol)
+
+    movi        d19, #0x00000000000000ff    //byte mask for dst[0] (prol)
+    sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)
+
+    bsl         v19.8b,  v15.8b ,  v2.8b    //first row with dst[0]
+    add         v26.8h,  v26.8h ,  v24.8h   //col 8::15 add 3dc+2 (prol extra)
+
+    rev64       v3.8b,  v3.8b
+
+    st1         {v19.8b},[x2], x3           //store row 0 (prol)
+    sshr        d3, d3,#8                   //row 0 shift (prol) (first value to be ignored)
+
+    movi        d20, #0x00000000000000ff    //byte mask row 1 (prol)
+
+loop_again_col_row:
+
+    bsl         v20.8b,  v3.8b ,  v16.8b    //row 1    (prol)
+
+    movi        d21, #0x00000000000000ff    //byte mask row 2 (prol)
+    sshr        d3, d3,#8                   //row 1 shift (prol)
+
+    st1         {v20.8b},[x2], x3           //store row 1 (prol)
+    sqshrun     v4.8b, v26.8h,#2            //columns shx2 movn (prol extra)
+
+
+    bsl         v21.8b,  v3.8b ,  v16.8b    //row 2 (prol)
+
+    movi        d20, #0x00000000000000ff    //byte mask row 3 (prol)
+    sshr        d3, d3,#8                   //row 2 shift (prol)
+
+    st1         {v21.8b},[x2], x3           //store row 2 (prol)
+
+
+    bsl         v20.8b,  v3.8b ,  v16.8b    //row 3    (prol)
+
+    movi        d21, #0x00000000000000ff    //byte mask row 4 (prol)
+    sshr        d3, d3,#8                   //row 3 shift (prol)
+
+    st1         {v20.8b},[x2], x3           //store row 3 (prol)
+
+
+    bsl         v21.8b,  v3.8b ,  v16.8b    //row 4 (prol)
+
+    movi        d20, #0x00000000000000ff    //byte mask row 5 (prol)
+    sshr        d3, d3,#8                   //row 4 shift (prol)
+
+    st1         {v21.8b},[x2], x3           //store row 4 (prol)
+
+
+    bsl         v20.8b,  v3.8b ,  v16.8b    //row 5 (prol)
+
+    movi        d21, #0x00000000000000ff    //byte mask row 6 (prol)
+    sshr        d3, d3,#8                   //row 5 shift (prol)
+
+    st1         {v20.8b},[x2], x3           //store row 5 (prol)
+
+    ld1         {v1.8b},[x9]                //row 8::15 load (prol extra)
+
+    bsl         v21.8b,  v3.8b ,  v16.8b    //row 6 (prol)
+
+    uxtl        v22.8h, v1.8b
+
+    movi        d20, #0x00000000000000ff    //byte mask row 7 (prol)
+    sshr        d3, d3,#8                   //row 6 shift (prol)
+
+    st1         {v21.8b},[x2], x3           //store row 6 (prol)
+
+    bsl         v20.8b,  v3.8b ,  v16.8b    //row 7 (prol)
+    add         v22.8h,  v22.8h ,  v24.8h   //row 8::15 add 3dc+2 (prol extra)
+
+    sshr        d3, d3,#8                   //row 7 shift (prol)
+    st1         {v20.8b},[x2], x12          //store row 7 (prol)
+
+    subs        x10, x10, #8                //counter for cols
+
+    beq         end_func
+    blt         copy_16
+
+
+    movi        d20, #0x00000000000000ff    //byte mask row 9 (prol)
+    sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)
+
+    rev64       v3.8b,  v3.8b
+
+    st1         {v4.8b},[x2], x3            //store 2nd col (for 16x16)
+
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x0           //go to next row for 16
+
+
+    bsl         v20.8b,  v3.8b ,  v16.8b    //row 9    (prol)
+    subs        x10, x10, #8
+
+    st1         {v20.8b},[x2], x3           //store row 9 (prol)
+    sshr        d3, d3,#8                   //row 9 shift (prol)
+
+    movi        d20, #0x00000000000000ff    //byte mask row 9 (prol)
+
+    b           loop_again_col_row
+
+
+copy_16:
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2], x3
+    st1         {v16.8b},[x2]
+
+    b           end_func
+
+prologue_cpy_32:
+    mov         x9, #128
+    //sub        x7, x3, #-24
+    add         x5, x2, x3
+    add         x8, x5, x3
+    add         x10, x8, x3
+    dup         v20.16b, v16.8b[0]
+    lsl         x6, x3, #2
+    add         x6, x6, #-16
+
+    st1         {v20.16b}, [x2],#16
+    st1         {v20.16b}, [x5],#16
+    st1         {v20.16b}, [x8],#16
+    st1         {v20.16b}, [x10],#16
+
+    st1         {v20.16b}, [x2], x6
+    st1         {v20.16b}, [x5], x6
+    st1         {v20.16b}, [x8], x6
+    st1         {v20.16b}, [x10], x6
+
+    sub         x9, x9, #32                 //32x32 prol/epil counter dec
+
+kernel_copy:
+    st1         {v20.16b}, [x2],#16
+    st1         {v20.16b}, [x5],#16
+    st1         {v20.16b}, [x8],#16
+    st1         {v20.16b}, [x10],#16
+
+    st1         {v20.16b}, [x2], x6
+    st1         {v20.16b}, [x5], x6
+    st1         {v20.16b}, [x8], x6
+    st1         {v20.16b}, [x10], x6
+
+    subs        x9, x9, #32
+
+    st1         {v20.16b}, [x2],#16
+    st1         {v20.16b}, [x5],#16
+    st1         {v20.16b}, [x8],#16
+    st1         {v20.16b}, [x10],#16
+
+    st1         {v20.16b}, [x2], x6
+    st1         {v20.16b}, [x5], x6
+    st1         {v20.16b}, [x8], x6
+    st1         {v20.16b}, [x10], x6
+
+    bne         kernel_copy
+
+epilogue_copy:
+    st1         {v20.16b}, [x2],#16
+    st1         {v20.16b}, [x5],#16
+    st1         {v20.16b}, [x8],#16
+    st1         {v20.16b}, [x10],#16
+
+    st1         {v20.16b}, [x2]
+    st1         {v20.16b}, [x5]
+    st1         {v20.16b}, [x8]
+    st1         {v20.16b}, [x10]
+
+    b           end_func
+
+
+dc_4:
+    ld1         {v0.8b},[x6],#8             //load from src[nt]
+    ld1         {v1.8b},[x8],#8             //load from src[2nt+1]
+
+    uaddlp      v2.4h,  v0.8b
+    mov         x5, #0                      //x5 = 0 (clears upper accumulator lane)
+    mov         v6.s[0], w4
+    mov         v6.s[1], w5                 //store nt to accumulate
+    uaddlp      v3.4h,  v1.8b
+
+    add         v4.4h,  v2.4h ,  v3.4h
+
+
+    uaddlp      v5.2s,  v4.4h
+    movi        d30, #0x00000000ffffffff
+
+    and         v5.8b,  v5.8b ,  v30.8b
+
+    mov         v28.s[0], w14
+    mov         v28.s[1], w5                //src[2nt+1]+2+src[2nt-1] moved to d28
+    add         d6,  d6 ,  d5               //accumulate all inp into d6 (end for nt==8)
+
+    sshl        d9, d6, d8                  //(dc_val) shr by log2nt+1
+    mov         x8, x7                      //&src[2nt+1]
+
+    shl         d13, d9,#1                  //2*dc
+    sub         x9, x9, #3                  //&src[2nt-1-row]
+
+    dup         v16.8b, v9.8b[0]            //dc_val
+    add         d14,  d13 ,  d28            //src[2nt+1]+2+src[2nt-1]+2dc_val
+
+    ushr        v15.4h, v14.4h,#2           //final dst[0]'s value in d15[0]
+    sub         x12, x3, x3, lsl #2         //-3*strd
+    add         d11,  d13 ,  d9             //3*dc
+
+    add         d11,  d11 ,  d17            //3*dc + 2
+    add         x12, x12, #4                //offset after one 4x4 block (-3*strd + 4)
+
+    dup         v24.8h, v11.4h[0]           //3*dc + 2 (moved to all lanes)
+    sub         x0, x3, x4                  //strd - nt
+
+
+    ld1         {v0.8b},[x8]                //col 1::3 load (prol)
+    ld1         {v1.8b},[x9]                //row 3::1 (0 also) load (prol)
+
+    uxtl        v20.8h, v0.8b
+
+    uxtl        v22.8h, v1.8b
+    add         v20.8h,  v20.8h ,  v24.8h   //col 1::7 add 3dc+2 (prol)
+
+    add         v22.8h,  v22.8h ,  v24.8h   //row 1::7 add 3dc+2 (prol)
+
+    movi        d19, #0x00000000000000ff    //byte mask for dst[0] (prol)
+    sqshrun     v2.8b, v20.8h,#2            //columns shx2 movn (prol)
+
+    movi        d20, #0x00000000000000ff    //byte mask row 1 (prol)
+    sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)
+
+
+    bsl         v19.8b,  v15.8b ,  v2.8b    //first row with dst[0]
+
+    rev64       v3.8b,  v3.8b
+
+    st1         {v19.s}[0],[x2], x3         //store row 0 (prol)
+    sshr        d3, d3,#40                  //row 0 shift (prol) (first value to be ignored)
+
+    movi        d21, #0x00000000000000ff    //byte mask row 2 (prol)
+
+    bsl         v20.8b,  v3.8b ,  v16.8b    //row 1    (prol)
+    sshr        d3, d3,#8                   //row 1 shift (prol)
+
+    st1         {v20.s}[0],[x2], x3         //store row 1 (prol)
+
+    bsl         v21.8b,  v3.8b ,  v16.8b    //row 2 (prol)
+
+    movi        d20, #0x00000000000000ff    //byte mask row 3 (prol)
+
+    sshr        d3, d3,#8                   //row 2 shift (prol)
+    st1         {v21.s}[0],[x2], x3         //store row 2 (prol)
+
+    bsl         v20.8b,  v3.8b ,  v16.8b    //row 3    (prol)
+    st1         {v20.s}[0],[x2]             //store row 3 (prol)
+
+epilogue_end:
+end_func:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_horz.s b/common/arm64/ihevc_intra_pred_luma_horz.s
new file mode 100644
index 0000000..551fd77
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_horz.s
@@ -0,0 +1,357 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_luma_horz.s
+//*
+//* @brief
+//*  contains function definition for intra prediction  interpolation filters
+//*
+//*
+//* @author
+//*  parthiban v
+//*
+//* @par list of functions:
+//*  - ihevc_intra_pred_luma_horz()
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*     intra prediction interpolation filter for luma horizontal mode.
+//*
+//* @par description:
+//*      horizontal intraprediction (mode 10) with reference samples location
+//*      pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst'. refer
+//*      to section 8.4.4.2.6 in the standard (special case)
+//*
+//* @param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  integer transform block size
+//*
+//* @param[in] mode
+//*  integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
+//                                word32 src_strd,
+//                                uword8 *pu1_dst,
+//                                word32 dst_strd,
+//                                word32 nt,
+//                                word32 mode)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 =>  src_strd
+//x2 => *pu1_dst
+//x3 =>  dst_strd
+
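+//a minimal c sketch of mode 10, assuming nt < 32 (the 32x32 path skips
+//the top-row filter, see core_loop_32); clip_u8 saturates to [0,255] and
+//all names are illustrative:
+//
+//    #include <stdint.h>
+//    static uint8_t clip_u8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }
+//    static void horz_sketch(const uint8_t *pu1_ref, uint8_t *pu1_dst,
+//                            int dst_strd, int nt)
+//    {
+//        int two_nt = 2 * nt;
+//        for(int x = 0; x < nt; x++)      /* filtered row 0: usubl/sshr/sqadd */
+//            pu1_dst[x] = clip_u8(pu1_ref[two_nt - 1]
+//                       + ((pu1_ref[two_nt + 1 + x] - pu1_ref[two_nt]) >> 1));
+//        for(int y = 1; y < nt; y++)      /* other rows replicate the left ref */
+//            for(int x = 0; x < nt; x++)
+//                pu1_dst[y * dst_strd + x] = pu1_ref[two_nt - 1 - y];
+//    }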
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_horz_av8
+
+.type ihevc_intra_pred_luma_horz_av8, %function
+
+ihevc_intra_pred_luma_horz_av8:
+
+    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    //ldr          x5,[sp,#44]                        @loads mode
+
+    lsl         x6,x4,#1                    //two_nt
+
+    add         x12,x0,x6                   //*pu1_ref[two_nt]
+    cmp         x4,#4                       //if nt == 4
+    beq         core_loop_4
+
+    cmp         x4,#8                       //if nt == 8
+    beq         core_loop_8
+
+    cmp         x4,#16                      //if nt == 16
+    beq         core_loop_16
+    sub         x12,x12,#16                 //move to 16th value pointer
+    add         x9,x2,#16
+
+core_loop_32:
+    ld1         { v0.16b},[x12]             //load 16 values. v0.16b[15] will have the 1st value.
+
+    dup         v2.16b, v0.16b[15]          //duplicate the i value.
+
+    dup         v4.16b, v0.16b[14]          //duplicate the ii value.
+    dup         v6.16b, v0.16b[13]          //duplicate the iii value.
+    st1         { v2.16b},[x2],x3           //store in 1st row 0-16 columns
+    st1         { v2.16b},[x9],x3           //store in 1st row 16-32 columns
+
+    dup         v8.16b, v0.16b[12]
+    st1         { v4.16b},[x2],x3
+    st1         { v4.16b},[x9],x3
+
+    dup         v2.16b, v0.16b[11]
+    st1         { v6.16b},[x2],x3
+    st1         { v6.16b},[x9],x3
+
+    dup         v4.16b, v0.16b[10]
+    st1         { v8.16b},[x2],x3
+    st1         { v8.16b},[x9],x3
+
+    dup         v6.16b, v0.16b[9]
+    st1         { v2.16b},[x2],x3
+    st1         { v2.16b},[x9],x3
+
+    dup         v8.16b, v0.16b[8]
+    st1         { v4.16b},[x2],x3
+    st1         { v4.16b},[x9],x3
+
+    dup         v2.16b, v0.8b[7]
+    st1         { v6.16b},[x2],x3
+    st1         { v6.16b},[x9],x3
+
+    dup         v4.16b, v0.8b[6]
+    st1         { v8.16b},[x2],x3
+    st1         { v8.16b},[x9],x3
+
+    dup         v6.16b, v0.8b[5]
+    st1         { v2.16b},[x2],x3
+    st1         { v2.16b},[x9],x3
+
+    dup         v8.16b, v0.8b[4]
+    st1         { v4.16b},[x2],x3
+    st1         { v4.16b},[x9],x3
+
+    dup         v2.16b, v0.8b[3]
+    st1         { v6.16b},[x2],x3
+    st1         { v6.16b},[x9],x3
+
+    dup         v4.16b, v0.8b[2]
+    st1         { v8.16b},[x2],x3
+    st1         { v8.16b},[x9],x3
+
+    dup         v6.16b, v0.8b[1]
+    st1         { v2.16b},[x2],x3
+    st1         { v2.16b},[x9],x3
+    sub         x12,x12,#16                 //move to 16th value pointer
+
+    dup         v8.16b, v0.8b[0]
+    st1         { v4.16b},[x2],x3
+    st1         { v4.16b},[x9],x3
+
+    subs        x4,x4,#16                   //decrement the loop count by 16
+    st1         { v6.16b},[x2],x3
+    st1         { v6.16b},[x9],x3
+
+    st1         { v8.16b},[x2],x3
+    st1         { v8.16b},[x9],x3
+    bgt         core_loop_32
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+    b           end_func
+
+core_loop_16:
+    ldrb        w14,[x12],#1                //pu1_ref[two_nt]
+    sxtw        x14,w14
+    ld1         { v30.8b},[x12],#8          //pu1_ref[two_nt + 1 + col]
+    ld1         { v31.8b},[x12]             //pu1_ref[two_nt + 9 + col]
+    sub         x12,x12,#8
+
+    dup         v28.8b,w14
+    sub         x12,x12,#17
+    ld1         { v0.16b},[x12]
+    dup         v26.8b, v0.16b[15]
+    uxtl        v26.8h, v26.8b
+
+    dup         v2.16b, v0.16b[14]
+    usubl       v24.8h, v30.8b, v28.8b
+
+    dup         v4.16b, v0.16b[13]
+    sshr        v24.8h, v24.8h,#1
+
+    dup         v6.16b, v0.16b[12]
+    sqadd       v22.8h,  v26.8h ,  v24.8h
+
+    dup         v8.16b, v0.16b[11]
+    sqxtun      v22.8b, v22.8h
+
+    st1         {v22.8b},[x2],#8
+
+    dup         v10.16b, v0.16b[10]
+    usubl       v24.8h, v31.8b, v28.8b
+
+    dup         v12.16b, v0.16b[9]
+    sshr        v24.8h, v24.8h,#1
+
+    dup         v14.16b, v0.16b[8]
+    sqadd       v22.8h,  v26.8h ,  v24.8h
+
+    dup         v16.16b, v0.8b[7]
+    sqxtun      v22.8b, v22.8h
+
+    st1         {v22.8b},[x2],x3
+    sub         x2,x2,#8
+
+    st1         { v2.16b},[x2],x3
+
+    st1         { v4.16b},[x2],x3
+    st1         { v6.16b},[x2],x3
+    st1         { v8.16b},[x2],x3
+
+    dup         v2.16b, v0.8b[6]
+    st1         { v10.16b},[x2],x3
+
+    dup         v4.16b, v0.8b[5]
+    st1         { v12.16b},[x2],x3
+
+    dup         v6.16b, v0.8b[4]
+    st1         { v14.16b},[x2],x3
+
+    dup         v8.16b, v0.8b[3]
+    st1         { v16.16b},[x2],x3
+
+    dup         v10.16b, v0.8b[2]
+    st1         { v2.16b},[x2],x3
+
+    dup         v12.16b, v0.8b[1]
+    st1         { v4.16b},[x2],x3
+
+    dup         v14.16b, v0.8b[0]
+    st1         { v6.16b},[x2],x3
+
+    st1         { v8.16b},[x2],x3
+    st1         { v10.16b},[x2],x3
+    st1         { v12.16b},[x2],x3
+    st1         { v14.16b},[x2],x3
+
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+    b           end_func
+
+
+core_loop_8:
+    ldrb        w14,[x12]                   //pu1_ref[two_nt]
+    sxtw        x14,w14
+    add         x12,x12,#1                  //pu1_ref[two_nt + 1]
+    ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col]
+
+    sub         x12,x12,#9
+    ld1         {v0.8b},[x12]
+    dup         v26.8b, v0.8b[7]
+    dup         v28.8b,w14
+
+    dup         v3.8b, v0.8b[6]
+    uxtl        v26.8h, v26.8b
+
+    dup         v4.8b, v0.8b[5]
+    usubl       v24.8h, v30.8b, v28.8b
+
+    dup         v5.8b, v0.8b[4]
+    sshr        v24.8h, v24.8h,#1
+
+    dup         v6.8b, v0.8b[3]
+    sqadd       v22.8h,  v26.8h ,  v24.8h
+
+    dup         v7.8b, v0.8b[2]
+    sqxtun      v22.8b, v22.8h
+
+    st1         {v22.8b},[x2],x3
+    st1         {v3.8b},[x2],x3
+
+    dup         v8.8b, v0.8b[1]
+    st1         {v4.8b},[x2],x3
+    st1         {v5.8b},[x2],x3
+
+    dup         v9.8b, v0.8b[0]
+    st1         {v6.8b},[x2],x3
+    st1         {v7.8b},[x2],x3
+
+    st1         {v8.8b},[x2],x3
+    st1         {v9.8b},[x2],x3
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+    b           end_func
+
+
+core_loop_4:
+    ldrb        w14,[x12]                   //pu1_ref[two_nt]
+    sxtw        x14,w14
+    add         x12,x12,#1                  //pu1_ref[two_nt + 1]
+    ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col]
+
+    sub         x12,x12,#5
+    ld1         {v0.8b},[x12]
+    dup         v28.8b,w14
+    dup         v26.8b, v0.8b[3]
+    uxtl        v26.8h, v26.8b
+
+    dup         v3.8b, v0.8b[2]
+    usubl       v24.8h, v30.8b, v28.8b
+
+    dup         v4.8b, v0.8b[1]
+    sshr        v24.8h, v24.8h,#1
+
+    dup         v5.8b, v0.8b[0]
+    sqadd       v22.8h,  v26.8h ,  v24.8h
+
+    sqxtun      v22.8b, v22.8h
+
+    st1         {v22.s}[0],[x2],x3
+    st1         {v3.s}[0],[x2],x3
+    st1         {v4.s}[0],[x2],x3
+    st1         {v5.s}[0],[x2],x3
+
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+end_func:
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_mode2.s b/common/arm64/ihevc_intra_pred_luma_mode2.s
new file mode 100644
index 0000000..5d7a3c5
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_mode2.s
@@ -0,0 +1,280 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_luma_mode2.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction mode 2.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* @author
+//*  yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    luma intraprediction for mode 2
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] pi1_coeff
+//*  word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  intra prediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode2(uword8 *pu1_ref,
+//                                 word32 src_strd,
+//                                 uword8 *pu1_dst,
+//                                 word32 dst_strd,
+//                                 word32 nt,
+//                                 word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+//    nt
+//    mode
+//    pi1_coeff
+
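+//a minimal c sketch of mode 2 (the 45-degree bottom-left diagonal); the
+//rev64 and negative-stride loads below implement the reversed indexing;
+//names are illustrative:
+//
+//    #include <stdint.h>
+//    static void mode2_sketch(const uint8_t *pu1_ref, uint8_t *pu1_dst,
+//                             int dst_strd, int nt)
+//    {
+//        int two_nt = 2 * nt;
+//        for(int row = 0; row < nt; row++)
+//            for(int col = 0; col < nt; col++)
+//                pu1_dst[row * dst_strd + col] =
+//                    pu1_ref[two_nt - row - col - 2];
+//    }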
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode2_av8
+
+.type ihevc_intra_pred_luma_mode2_av8, %function
+
+ihevc_intra_pred_luma_mode2_av8:
+
+    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x8,#-2
+
+    cmp         x4,#4
+    beq         mode2_4
+
+    add         x0,x0,x4,lsl #1
+
+    sub         x0,x0,#9                    //src[1]
+    add         x10,x0,#-1
+
+prologue_cpy_32:
+
+    ld1         {v0.8b},[x0],x8
+    mov         x11,x4
+
+    ld1         {v1.8b},[x10],x8
+    mov         x6, x2
+
+    ld1         {v2.8b},[x0],x8
+    ld1         {v3.8b},[x10],x8
+    lsr         x1, x4, #3
+
+    ld1         {v4.8b},[x0],x8
+    ld1         {v5.8b},[x10],x8
+    ld1         {v6.8b},[x0],x8
+    mul         x1, x4, x1
+
+    ld1         {v7.8b},[x10],x8
+    add         x7,x6,x3
+
+    rev64       v8.8b,  v0.8b
+    rev64       v9.8b,  v1.8b
+    lsl         x5, x3, #2
+
+    rev64       v10.8b,  v2.8b
+    rev64       v11.8b,  v3.8b
+    add         x9,x7,x3
+
+    rev64       v12.8b,  v4.8b
+    subs        x1,x1,#8
+
+    rev64       v13.8b,  v5.8b
+    rev64       v14.8b,  v6.8b
+    rev64       v15.8b,  v7.8b
+    add         x14,x9,x3
+
+    beq         epilogue_mode2
+
+    sub         x12,x4,#8
+
+kernel_mode2:
+
+    st1         {v8.8b},[x6],x5
+    st1         {v9.8b},[x7],x5
+    subs        x11,x11,#8
+
+    st1         {v10.8b},[x9],x5
+    add         x20,x2,#8
+    csel        x2, x20, x2,gt
+
+    st1         {v11.8b},[x14],x5
+    st1         {v12.8b},[x6],x5
+    csel        x11, x4, x11,le
+
+    st1         {v13.8b},[x7],x5
+    st1         {v14.8b},[x9],x5
+    add         x20, x2, x3, lsl #2
+    csel        x2, x20, x2,le
+
+    st1         {v15.8b},[x14],x5
+    ld1         {v0.8b},[x0],x8
+    sub         x14,x4,#8
+
+    ld1         {v1.8b},[x10],x8
+    ld1         {v2.8b},[x0],x8
+    add         x20, x2, #8
+    csel        x2, x20, x2,le
+
+    ld1         {v3.8b},[x10],x8
+    ld1         {v4.8b},[x0],x8
+    sub         x20, x6, x14
+    csel        x2, x20, x2,le
+
+    ld1         {v5.8b},[x10],x8
+    subs        x12,x12,#8
+
+    ld1         {v6.8b},[x0],x8
+    mov         x6, x2
+
+    ld1         {v7.8b},[x10],x8
+    add         x20, x0, x4
+    csel        x0, x20, x0,le
+
+    rev64       v8.8b,  v0.8b
+    add         x7, x6, x3
+
+    rev64       v9.8b,  v1.8b
+    sub         x20, x0, #8
+    csel        x0, x20, x0,le
+
+    rev64       v10.8b,  v2.8b
+    csel        x12, x4, x12,le
+
+    rev64       v11.8b,  v3.8b
+    add         x9, x7, x3
+
+    rev64       v12.8b,  v4.8b
+    add         x10,x0,#-1
+
+    rev64       v13.8b,  v5.8b
+    subs        x1, x1, #8
+
+    rev64       v14.8b,  v6.8b
+    add         x14, x9, x3
+
+    rev64       v15.8b,  v7.8b
+
+    bne         kernel_mode2
+
+epilogue_mode2:
+
+    st1         {v8.8b},[x6],x5
+    st1         {v9.8b},[x7],x5
+    st1         {v10.8b},[x9],x5
+    st1         {v11.8b},[x14],x5
+    st1         {v12.8b},[x6],x5
+    st1         {v13.8b},[x7],x5
+    st1         {v14.8b},[x9],x5
+    st1         {v15.8b},[x14],x5
+
+    b           end_func
+
+mode2_4:
+
+    mov         x8,#-2
+    sub         x0,x0,#1
+    add         x10,x0,#-1
+
+    ld1         {v0.8b},[x0],x8
+    add         x5,x2,x3
+    ld1         {v2.8b},[x10],x8
+    add         x6,x5,x3
+    ld1         {v4.8b},[x0]
+    add         x7,x6,x3
+    ld1         {v6.8b},[x10]
+
+    rev64       v1.8b,  v0.8b
+    rev64       v3.8b,  v2.8b
+
+
+
+    st1         {v1.s}[0],[x2]
+    rev64       v5.8b,  v4.8b
+    st1         {v3.s}[0],[x5]
+    rev64       v7.8b,  v6.8b
+    st1         {v5.s}[0],[x6]
+    st1         {v7.s}[0],[x7]
+
+end_func:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_mode_18_34.s b/common/arm64/ihevc_intra_pred_luma_mode_18_34.s
new file mode 100644
index 0000000..11e1792
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_mode_18_34.s
@@ -0,0 +1,288 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_luma_mode_18_34.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction modes 18 and 34.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* @author
+//*  yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    luma intraprediction for modes 18 and 34
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] pi1_coeff
+//*  word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  intra prediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode_18_34(uword8 *pu1_ref,
+//                                      word32 src_strd,
+//                                      uword8 *pu1_dst,
+//                                      word32 dst_strd,
+//                                      word32 nt,
+//                                      word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+//    nt
+//    mode
+//    pi1_coeff
+
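+//a minimal c sketch of modes 18 and 34 (angles -32 and +32: pure diagonal
+//copies, no fractional interpolation); names are illustrative:
+//
+//    #include <stdint.h>
+//    static void mode_18_34_sketch(const uint8_t *pu1_ref, uint8_t *pu1_dst,
+//                                  int dst_strd, int nt, int mode)
+//    {
+//        int two_nt = 2 * nt;
+//        int step = (mode == 34) ? 1 : -1;    /* csel x6, #1/#-1 */
+//        for(int row = 0; row < nt; row++)    /* one 8-byte ld1/st1 per row */
+//            for(int col = 0; col < nt; col++)
+//                pu1_dst[row * dst_strd + col] =
+//                    pu1_ref[two_nt + 1 + step * (row + 1) + col];
+//    }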
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode_18_34_av8
+
+.type ihevc_intra_pred_luma_mode_18_34_av8, %function
+
+ihevc_intra_pred_luma_mode_18_34_av8:
+
+    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    cmp         x4,#4
+    beq         mode2_4
+
+    mov         x11,x4
+    mov         x12,x4
+    sub         x14,x4,#8
+
+    add         x0,x0,x4,lsl #1
+
+    cmp         x5,#0x22
+    mov         x10,x2
+
+    add         x0,x0,#2
+    sub         x20,x0,#2
+    csel        x0, x20, x0,ne
+    mov         x20,#1
+    csel        x6, x20, x6,eq
+    mov         x20,#-1
+    csel        x6, x20, x6,ne
+    mov         x8,x0
+
+prologue_cpy_32:
+
+    ld1         {v0.8b},[x8],x6
+    lsr         x1, x4, #3
+    ld1         {v1.8b},[x8],x6
+    mul         x1, x4, x1
+    ld1         {v2.8b},[x8],x6
+    ld1         {v3.8b},[x8],x6
+    subs        x1,x1,#8
+    ld1         {v4.8b},[x8],x6
+    ld1         {v5.8b},[x8],x6
+    ld1         {v6.8b},[x8],x6
+
+    ld1         {v7.8b},[x8],x6
+
+
+    beq         epilogue_mode2
+    sub         x11,x11,#8
+
+    cmp         x5,#0x22
+    add         x20,x0,#8
+    csel        x0, x20, x0,ne
+    csel        x8, x0, x8,ne
+    bne         kernel_mode18
+    //add        x8,x0,#8
+
+kernel_mode2:
+    st1         {v0.8b},[x10],x3
+    st1         {v1.8b},[x10],x3
+    subs        x12,x12,#8
+    st1         {v2.8b},[x10],x3
+    add         x20,x2,#8
+    csel        x2, x20, x2,ne
+    st1         {v3.8b},[x10],x3
+
+    ld1         {v0.8b},[x8],x6
+    st1         {v4.8b},[x10],x3
+
+    st1         {v5.8b},[x10],x3
+    ld1         {v1.8b},[x8],x6
+    st1         {v6.8b},[x10],x3
+    ld1         {v2.8b},[x8],x6
+    st1         {v7.8b},[x10],x3
+
+    ld1         {v3.8b},[x8],x6
+    sub         x20,x10,x14
+    csel        x2, x20, x2,eq
+    ld1         {v4.8b},[x8],x6
+    mov         x10,x2
+    ld1         {v5.8b},[x8],x6
+    csel        x12, x4, x12,eq
+    ld1         {v6.8b},[x8],x6
+    subs        x11,x11,#8
+
+    ld1         {v7.8b},[x8],x6
+
+    add         x20,x0,#8
+    csel        x0, x20, x0,eq
+    csel        x11, x4, x11,eq
+    csel        x8, x0, x8,eq
+
+    subs        x1, x1, #8
+
+    bne         kernel_mode2
+
+    b           epilogue_mode2
+
+kernel_mode18:
+    st1         {v0.8b},[x10],x3
+    st1         {v1.8b},[x10],x3
+    subs        x12,x12,#8
+    st1         {v2.8b},[x10],x3
+    add         x20,x2,#8
+    csel        x2, x20, x2,ne
+    st1         {v3.8b},[x10],x3
+
+    ld1         {v0.8b},[x8],x6
+    st1         {v4.8b},[x10],x3
+
+    st1         {v5.8b},[x10],x3
+    ld1         {v1.8b},[x8],x6
+
+    st1         {v6.8b},[x10],x3
+    ld1         {v2.8b},[x8],x6
+    st1         {v7.8b},[x10],x3
+
+    ld1         {v3.8b},[x8],x6
+    sub         x20,x10,x14
+    csel        x2, x20, x2,eq
+    ld1         {v4.8b},[x8],x6
+    mov         x10,x2
+    ld1         {v5.8b},[x8],x6
+    csel        x12, x4, x12,eq
+    ld1         {v6.8b},[x8],x6
+    subs        x11,x11,#8
+    ld1         {v7.8b},[x8],x6
+
+    add         x20,x0,#8
+    csel        x0, x20, x0,ne
+    csel        x11, x4, x11,eq
+    sub         x20,x8,x14
+    csel        x0, x20, x0,eq
+    subs        x1, x1, #8
+    mov         x8,x0
+
+    bne         kernel_mode18
+
+
+epilogue_mode2:
+
+    st1         {v0.8b},[x10],x3
+    st1         {v1.8b},[x10],x3
+    st1         {v2.8b},[x10],x3
+    st1         {v3.8b},[x10],x3
+    st1         {v4.8b},[x10],x3
+    st1         {v5.8b},[x10],x3
+    st1         {v6.8b},[x10],x3
+    st1         {v7.8b},[x10],x3
+
+    b           end_func
+
+mode2_4:
+
+    add         x0,x0,#10
+    cmp         x5,#0x22
+    sub         x20,x0,#2
+    csel        x0, x20, x0,ne
+
+    mov         x20,#1
+    csel        x8, x20, x8,eq
+    mov         x20,#-1
+    csel        x8, x20, x8,ne
+
+    ld1         {v0.8b},[x0],x8
+    st1         {v0.s}[0],[x2],x3
+
+    ld1         {v0.8b},[x0],x8
+    st1         {v0.s}[0],[x2],x3
+
+    ld1         {v0.8b},[x0],x8
+    st1         {v0.s}[0],[x2],x3
+
+    ld1         {v0.8b},[x0],x8
+    st1         {v0.s}[0],[x2],x3
+
+end_func:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
new file mode 100644
index 0000000..79964f7
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
@@ -0,0 +1,555 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_luma_mode_27_to_33.s
+//*
+//* @brief
+//*  contains function definition for intra prediction  interpolation filters
+//*
+//*
+//* @author
+//*  parthiban v
+//*
+//* @par list of functions:
+//*  - ihevc_intra_pred_luma_mode_27_to_33()
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    intra prediction interpolation filter for luma mode 27 to mode 33
+//*
+//* @par description:
+//*    intraprediction for mode 27 to 33 (positive angle, vertical mode) with
+//*    reference neighboring samples location pointed by 'pu1_ref' to the tu
+//*    block location pointed by 'pu1_dst'
+//*
+//* @param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  integer transform block size
+//*
+//* @param[in] mode
+//*  integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode_27_to_33(uword8 *pu1_ref,
+//                                        word32 src_strd,
+//                                        uword8 *pu1_dst,
+//                                        word32 dst_strd,
+//                                        word32 nt,
+//                                        word32 mode)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 =>  src_strd
+//x2 => *pu1_dst
+//x3 =>  dst_strd
+
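+//a minimal c sketch of the positive-angle vertical prediction implemented
+//below (modes 27 to 33); intra_pred_ang comes from gai4_ihevc_ang_table
+//and names are illustrative:
+//
+//    #include <stdint.h>
+//    static void mode_27_33_sketch(const uint8_t *pu1_ref, uint8_t *pu1_dst,
+//                                  int dst_strd, int nt, int intra_pred_ang)
+//    {
+//        const uint8_t *ref_main = pu1_ref + 2 * nt;  /* add x8,x0,x7 */
+//        for(int row = 0; row < nt; row++)
+//        {
+//            int pos   = (row + 1) * intra_pred_ang;  /* umull v2.8h */
+//            int idx   = pos >> 5;                    /* shrn #5     */
+//            int fract = pos & 31;                    /* and with 31 */
+//            for(int col = 0; col < nt; col++)        /* umull/umlal + rshrn #5 */
+//                pu1_dst[row * dst_strd + col] = (uint8_t)
+//                    (((32 - fract) * ref_main[idx + col + 1]
+//                    + fract * ref_main[idx + col + 2] + 16) >> 5);
+//        }
+//    }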
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode_27_to_33_av8
+.extern gai4_ihevc_ang_table
+.extern gau1_ihevc_planar_factor
+
+.type ihevc_intra_pred_luma_mode_27_to_33_av8, %function
+
+ihevc_intra_pred_luma_mode_27_to_33_av8:
+
+    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    adrp        x6,  :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
+    ldr         x6,  [x6, #:got_lo12:gai4_ihevc_ang_table]
+
+    lsl         x7,x4,#1                    //two_nt
+
+    add         x8,x6,x5,lsl #2             //*gai4_ihevc_ang_table[mode]
+    ldr         w9, [x8]                    //intra_pred_ang = gai4_ihevc_ang_table[mode]
+    sxtw        x9,w9
+    adrp        x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
+    ldr         x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
+    add         x6,x1,#1
+
+    tst         x4,#7
+    add         x8,x0,x7                    //pu1_ref + two_nt
+    mov         x14,#0                      //row
+    mov         x12,x4
+    bne         core_loop_4
+
+core_loop_8:
+    add         x8,x8,#1                    //pu1_ref_main_idx += (two_nt + 1)
+    dup         v0.8b,w9                    //intra_pred_ang
+    lsr         x12, x4, #3                 //divide by 8
+
+    movi        v1.8b, #32
+    mul         x7, x4, x12
+
+    movi        v6.8h, #31
+    //lsl            x12,x3,#3
+
+    mov         x1,x8
+    //sub            x12,x12,x4
+    mov         x5,x4
+    mov         x11,#1
+
+prologue:
+    ld1         {v3.8b},[x6]                //loads the row value
+    umull       v2.8h, v3.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+    xtn         v4.8b,  v4.8h
+    shrn        v5.8b, v2.8h,#5             //idx = pos >> 5
+
+    dup         v31.8b, v4.8b[0]
+    add         x0,x2,x3
+
+    umov        w14, v5.2s[0]               //(i row)extract idx to the r register
+    sxtw        x14,w14
+
+    dup         v29.8b, v4.8b[1]            //(ii)
+    and         x9,x14,#0xff                //(i row) get the last byte
+
+    add         x10,x8,x9                   //(i row)*pu1_ref[ref_main_idx]
+
+    asr         x14,x14,#8                  //(ii)shift by 8
+    ld1         {v8.8b},[x10],x11           //(i row)ref_main_idx
+    and         x9,x14,#0xff                //(ii)get the last byte
+
+    asr         x14,x14,#8                  //(iii)
+    ld1         {v9.8b},[x10]               //(i row)ref_main_idx_1
+    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]
+
+    and         x9,x14,#0xff                //(iii)
+    sub         v30.8b,  v1.8b ,  v31.8b    //32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+    asr         x14,x14,#8                  //(iv)
+
+    dup         v27.8b, v4.8b[2]            //(iii)
+    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
+    and         x9,x14,#0xff                //(iv)
+
+    dup         v25.8b, v4.8b[3]            //(iv)
+    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]
+
+    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
+    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
+    rshrn       v10.8b, v10.8h,#5           //(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
+    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)
+
+    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
+
+    dup         v31.8b, v4.8b[4]            //(v)
+    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    umov        w14, v5.2s[1]               //extract idx to the r register
+    sxtw        x14,w14
+    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v10.8b},[x2],#8            //(i row)
+    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    and         x9,x14,#0xff                //(v)
+    dup         v29.8b, v4.8b[5]            //(vi)
+    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]
+
+    ld1         {v8.8b},[x10],x11           //(v)ref_main_idx
+    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
+
+    asr         x14,x14,#8                  //(vi)
+    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    and         x9,x14,#0xff                //(vi)
+
+    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
+    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v14.8b},[x0],x3            //(ii)
+    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
+    dup         v27.8b, v4.8b[6]            //(vii)
+    asr         x14,x14,#8                  //(vii)
+
+    and         x9,x14,#0xff                //(vii)
+    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v18.8b},[x0],x3            //(iii)
+    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+    asr         x14,x14,#8                  //(viii)
+    dup         v25.8b, v4.8b[7]            //(viii)
+    and         x9,x14,#0xff                //(viii)
+
+    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
+    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)
+
+    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
+    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
+    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    subs        x4,x4,#8
+
+    st1         {v22.8b},[x0],x3            //(iv)
+    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
+    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
+
+    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
+    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    add         x20,x8,#8
+    csel        x8, x20, x8,gt
+    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    sub         x20,x7,#8
+    csel        x7, x20, x7,gt
+
+    st1         {v10.8b},[x0],x3            //(v)
+    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+    beq         epilogue
+
+    ld1         {v5.8b},[x6]                //loads the row value
+    umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+    xtn         v4.8b,  v4.8h
+    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
+    umov        w14, v3.2s[0]               //(i)extract idx to the r register
+    sxtw        x14,w14
+    and         x9,x14,#0xff                //(i)
+    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]
+
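+//  software-pipelined kernel: the multiplies, round-shifts and stores for
+//  rows (vi)..(viii) of the previous 8x8 strip drain while the loads and
+//  index arithmetic for rows (i)..(viii) of the next strip are issued.
+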
+kernel_8_rows:
+    asr         x14,x14,#8                  //(ii)
+    dup         v31.8b, v4.8b[0]
+    subs        x4,x4,#8
+
+    ld1         {v8.8b},[x10],x11           //(i)ref_main_idx
+    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
+    and         x9,x14,#0xff                //(ii)
+    add         x20,x6,#8                   //increment the row value
+    csel        x6, x20, x6,le
+
+    ld1         {v9.8b},[x10]               //(i)ref_main_idx_1
+    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]
+
+    ld1         {v5.8b},[x6]                //loads the row value
+    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    asr         x14,x14,#8                  //(iii)
+
+    dup         v29.8b, v4.8b[1]            //(ii)
+    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+    and         x9,x14,#0xff                //(iii)
+
+    st1         {v14.8b},[x0],x3            //(vi)
+    sub         v30.8b,  v1.8b ,  v31.8b    //(i)32-fract(dup_const_32_fract)
+    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
+    umull       v10.8h, v8.8b, v30.8b       //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+    asr         x14,x14,#8                  //(iv)
+
+    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
+    umlal       v10.8h, v9.8b, v31.8b       //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+    and         x9,x14,#0xff                //(iv)
+
+    umov        w14, v3.2s[1]               //extract idx to the r register
+    sxtw        x14,w14
+    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    dup         v27.8b, v4.8b[2]            //(iii)
+    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
+    csel        x4, x5, x4,le               //reload nt
+
+    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
+    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]
+
+    st1         {v18.8b},[x0],x3            //(vii)
+    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
+    rshrn       v10.8b, v10.8h,#5           //(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+    dup         v25.8b, v4.8b[3]            //(iv)
+    umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
+
+    st1         {v22.8b},[x0]               //(viii)
+    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)
+
+    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
+    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    add         x0,x2,x3
+
+    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
+    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    and         x9,x14,#0xff                //(v)
+
+    dup         v31.8b, v4.8b[4]            //(v)
+    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]
+
+    st1         {v10.8b},[x2],#8            //(i)
+    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
+    asr         x14,x14,#8                  //(vi)
+
+    dup         v29.8b, v4.8b[5]            //(vi)
+    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+    and         x9,x14,#0xff                //(vi)
+
+    dup         v27.8b, v4.8b[6]            //(vii)
+    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
+
+    dup         v25.8b, v4.8b[7]            //(viii)
+    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+    asr         x14,x14,#8                  //(vii)
+
+    ld1         {v8.8b},[x10],x11           //(v)ref_main_idx
+    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
+    and         x9,x14,#0xff                //(vii)
+
+    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
+    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
+    asr         x14,x14,#8                  //(viii)
+
+    st1         {v14.8b},[x0],x3            //(ii)
+    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]
+
+    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
+    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
+    and         x9,x14,#0xff                //(viii)
+
+    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
+    umull       v10.8h, v8.8b, v30.8b       //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    umov        w14, v3.2s[0]               //(i)extract idx to the r register
+    sxtw        x14,w14
+    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
+
+    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
+    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)
+
+    st1         {v18.8b},[x0],x3            //(iii)
+    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+    csel        x8, x1, x8,le               //reload the source to pu1_src+2nt
+
+    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
+    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         x20,x8,#8                   //increment the source next set 8 columns in same row
+    csel        x8, x20, x8,gt
+
+    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
+    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
+    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
+    lsl         x20, x3,#3
+    csel        x12,x20,x12,le
+
+    st1         {v22.8b},[x0],x3            //(iv)
+    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    sub         x20,x12,x5
+    csel        x12, x20, x12,le
+
+    st1         {v10.8b},[x0],x3            //(v)
+    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+    add         x20,x2,x12                  //increment the dst pointer to 8*dst_strd - nt
+    csel        x2, x20, x2,le
+
+    xtn         v4.8b,  v4.8h
+    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+    and         x9,x14,#0xff                //(i)
+
+    subs        x7,x7,#8
+    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]
+
+    bne         kernel_8_rows
+
+epilogue:
+    st1         {v14.8b},[x0],x3            //(vi)
+    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
+    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v18.8b},[x0],x3            //(vii)
+    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+    st1         {v22.8b},[x0],x3            //(viii)
+    b           end_loops
+
+core_loop_4:
+    add         x10,x8,#1                   //pu1_ref_main_idx += (two_nt + 1)
+    add         x11,x8,#2                   //pu1_ref_main_idx_1 += (two_nt + 2)
+    mov         x8,#0
+
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    and         x5,x5,#31                   //fract = pos & (31)
+    cmp         x14,x5                      //if(fract_prev > fract)
+    add         x20,x10,#1                  //pu1_ref_main_idx += 1
+    csel        x10, x20, x10,gt
+    add         x11,x10,#1                  //pu1_ref_main_idx_1 += 1
+    dup         v0.8b,w5                    //dup_const_fract
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v1.8b,w4                    //dup_const_32_fract
+
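+//  nt == 4 path: intra_pred_ang is small enough here that idx advances by at
+//  most one per row, so instead of recomputing pos >> 5 the code tracks the
+//  fraction - a wrap-around (fract_prev > fract) means the reference
+//  pointers step forward by one.
+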
+//inner_loop_4
+    ld1         {v2.s}[0],[x10]             //ref_main_idx
+    add         x8,x8,#1
+    mov         x14,x5                      //fract_prev = fract
+
+    ld1         {v3.s}[0],[x11]             //ref_main_idx_1
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    and         x5,x5,#31                   //fract = pos & (31)
+    cmp         x14,x5                      //if(fract_prev > fract)
+    add         x20,x10,#1                  //pu1_ref_main_idx += 1
+    csel        x10, x20, x10,gt
+    add         x11,x10,#1                  //pu1_ref_main_idx_1 += 1
+
+    dup         v6.8b,w5                    //dup_const_fract
+    umull       v4.8h, v2.8b, v1.8b         //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v7.8b,w4                    //dup_const_32_fract
+    umlal       v4.8h, v3.8b, v0.8b         //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v8.s}[0],[x10]             //ref_main_idx
+    add         x8,x8,#1
+
+    ld1         {v9.s}[0],[x11]             //ref_main_idx_1
+    rshrn       v4.8b, v4.8h,#5             //shift_res = vrshrn_n_u16(add_res, 5)
+
+    mov         x14,x5                      //fract_prev = fract
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    and         x5,x5,#31                   //fract = pos & (31)
+    cmp         x14,x5                      //if(fract_prev > fract)
+    add         x20,x10,#1                  //pu1_ref_main_idx += 1
+    csel        x10, x20, x10,gt
+    add         x11,x10,#1                  //pu1_ref_main_idx_1 += 1
+
+    dup         v12.8b,w5                   //dup_const_fract
+    umull       v10.8h, v8.8b, v7.8b        //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v13.8b,w4                   //dup_const_32_fract
+    umlal       v10.8h, v9.8b, v6.8b        //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v14.s}[0],[x10]            //ref_main_idx
+    add         x8,x8,#1
+
+    st1         {v4.s}[0],[x2],x3
+    rshrn       v10.8b, v10.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
+
+    ld1         {v15.s}[0],[x11]            //ref_main_idx_1
+    mov         x14,x5                      //fract_prev = fract
+    add         x5,x8,#1                    //row + 1
+    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
+    and         x5,x5,#31                   //fract = pos & (31)
+    cmp         x14,x5                      //if(fract_prev > fract)
+    add         x20,x10,#1                  //pu1_ref_main_idx += 1
+    csel        x10, x20, x10,gt
+    add         x11,x10,#1                  //pu1_ref_main_idx_1 += 1
+
+    dup         v18.8b,w5                   //dup_const_fract
+    umull       v16.8h, v14.8b, v13.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+    sub         x20,x5,#32
+    neg         x4, x20
+    dup         v19.8b,w4                   //dup_const_32_fract
+    umlal       v16.8h, v15.8b, v12.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    ld1         {v20.s}[0],[x10]            //ref_main_idx
+
+    st1         {v10.s}[0],[x2],x3
+    rshrn       v16.8b, v16.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
+    ld1         {v21.s}[0],[x11]            //ref_main_idx_1
+
+    umull       v22.8h, v20.8b, v19.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)
+    umlal       v22.8h, v21.8b, v18.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+    st1         {v16.s}[0],[x2],x3
+    rshrn       v22.8b, v22.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
+
+    st1         {v22.s}[0],[x2],x3
+
+end_loops:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
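+//  a scalar sketch of what the pipelined loops above compute (reconstructed
+//  from the comments in this file - illustrative names, not the exact
+//  reference code):
+//
+//      uword8 *ref_main = pu1_ref + two_nt + 1;
+//      for(row = 0; row < nt; row++)
+//      {
+//          word32 pos   = (row + 1) * intra_pred_ang;
+//          word32 idx   = pos >> 5;       /* integer step along ref_main  */
+//          word32 fract = pos & 31;       /* 1/32-pel fractional position */
+//          for(col = 0; col < nt; col++)
+//              pu1_dst[row * dst_strd + col] = (uword8)
+//                  ((ref_main[idx + col]     * (32 - fract) +
+//                    ref_main[idx + col + 1] * fract + 16) >> 5);
+//      }
+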
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s b/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s
new file mode 100644
index 0000000..b6e8601
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s
@@ -0,0 +1,567 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_luma_mode_3_to_9.s
+//*
+//* @brief
+//*  contains function definitions for luma intra prediction modes 3 to 9.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//*  parthiban v
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    luma intra prediction filter for angular modes 3 to 9
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  intra prediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode_3_to_9(uword8* pu1_ref,
+//                               word32 src_strd,
+//                               uword8* pu1_dst,
+//                               word32 dst_strd,
+//                               word32 nt,
+//                               word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//x4 => nt
+//x5 => mode
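+
+//  a scalar sketch of the computation below (reconstructed from the comments
+//  in this file - illustrative, not the exact reference code). modes 3 to 9
+//  use a negative intra_pred_ang, so fract/idx are driven by the column and
+//  the reference index steps back by one every row:
+//
+//      for(col = 0; col < nt; col++)
+//      {
+//          word32 pos   = (col + 1) * intra_pred_ang;  /* ang < 0 */
+//          word32 idx   = pos >> 5;
+//          word32 fract = pos & 31;
+//          for(row = 0; row < nt; row++)
+//              pred(row, col) = (ref(idx - row)     * (32 - fract) +
+//                                ref(idx - row - 1) * fract + 16) >> 5;
+//      }
+//
+//  the vector code avoids scalar gathers: a 16-byte window of pu1_ref around
+//  the least reachable index (from idx_neg_idx_3_9) is loaded into v0 once
+//  per strip and tbl is used as a vector gather, with the lane offsets
+//  stepped by the constants 1 (v2) and 2 (v3) as the rows advance.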
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode_3_to_9_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_luma
+.extern idx_neg_idx_3_9
+
+
+.type ihevc_intra_pred_luma_mode_3_to_9_av8, %function
+
+ihevc_intra_pred_luma_mode_3_to_9_av8:
+
+    // stmfd sp!, {x4-x12, x14}        //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    adrp        x7,  :got:gai4_ihevc_ang_table
+    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+    adrp        x8,  :got:gai4_ihevc_inv_ang_table
+    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+    add         x7, x7, x5, lsl #2          //gai4_ihevc_ang_table[mode]
+    ldr         w7,  [x7]                   //intra_pred_ang
+    sxtw        x7,w7
+    dup         v30.8b,w7                   //intra_pred_ang
+
+    adrp        x14,  :got:col_for_intra_luma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
+
+    cmp         x4, #4
+
+    beq         sz_4_proc
+    b           prologue_8_16_32
+
+prologue_8_16_32:
+    lsr         x10, x4, #3
+    ld1         {v31.8b},[x14],#8
+    mul         x10, x4, x10                //block counter (dec by #8)
+
+    mov         x11, x4                     //col counter to be inc/dec by #8
+    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+
+    sub         x7, x5, #3
+    movi        v2.8b, #1                   //contains #1 for adding to get ref_main_idx + 1
+    adrp        x12, :got:idx_neg_idx_3_9   //load least idx table
+    ldr         x12, [x12, #:got_lo12:idx_neg_idx_3_9]
+    movi        v3.8b, #2
+
+    add         x12, x12, x7, lsl #4
+    mov         x8, x12
+
+    mov         x7, #8
+    sub         x7, x7, x3, lsl #3          //x7 = 8 - 8*dst_strd
+
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+    add         x1, x0, x4, lsl #1          //pu1_ref + 2nt
+
+    xtn         v6.8b,  v22.8h
+    dup         v26.8b,w9                   //least idx added to final idx values
+    sub         x1, x1, #9                  //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
+
+    sub         x6, x1, x9
+
+    ld1         {v0.16b}, [x6]              //stores the 32 values reqd based on indices values (from least idx)
+    sshr        v22.8h, v22.8h,#5
+
+    movi        v29.8b, #31                 //contains #31 for vand operation
+
+    movi        v28.8b, #32
+
+    sqxtn       v8.8b,  v22.8h
+
+    and         v6.8b,  v6.8b ,  v29.8b     //fract values in d1/ idx values in d0
+
+    mov         x0, #1
+
+    movi        v27.8b, #7                  //row 0 to 7
+
+    sub         v8.8b,  v8.8b ,  v2.8b      //ref_main_idx (sub row)
+    sub         v8.8b,  v26.8b ,  v8.8b     //ref_main_idx (row 0)
+    add         v8.8b,  v8.8b ,  v27.8b     //to compensate the pu1_src idx incremented by 8
+    sub         v9.8b,  v8.8b ,  v2.8b      //ref_main_idx + 1 (row 0)
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 0)
+    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
+
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 0)
+    sub         v4.8b,  v8.8b ,  v2.8b      //ref_main_idx (row 1)
+    sub         v5.8b,  v9.8b ,  v2.8b      //ref_main_idx + 1 (row 1)
+
+    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
+
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
+    sub         v8.8b,  v8.8b ,  v3.8b      //ref_main_idx (row 2)
+    sub         v9.8b,  v9.8b ,  v3.8b      //ref_main_idx + 1 (row 2)
+
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
+
+    tbl         v14.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 2)
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
+
+    tbl         v15.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 2)
+    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 3)
+    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 3)
+
+    st1         {v24.8b},[x2], x3           //st (row 0)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
+
+    tbl         v10.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
+
+    tbl         v11.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
+    sub         v8.8b,  v8.8b ,  v3.8b      //ref_main_idx (row 4)
+    sub         v9.8b,  v9.8b ,  v3.8b      //ref_main_idx + 1 (row 4)
+
+    st1         {v22.8b},[x2], x3           //st (row 1)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
+
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 4)
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 3)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 3)
+
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 4)
+    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 5)
+    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 5)
+
+    st1         {v20.8b},[x2], x3           //st (row 2)
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
+
+    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
+
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
+    sub         v8.8b,  v8.8b ,  v3.8b      //ref_main_idx (row 6)
+    sub         v9.8b,  v9.8b ,  v3.8b      //ref_main_idx + 1 (row 6)
+
+    st1         {v18.8b},[x2], x3           //st (row 3)
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
+
+    tbl         v14.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 6)
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
+
+    tbl         v15.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 6)
+    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 7)
+    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 7)
+
+    st1         {v24.8b},[x2], x3           //st (row 4)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
+
+    tbl         v10.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    tbl         v11.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    st1         {v22.8b},[x2], x3           //st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
+
+    st1         {v20.8b},[x2], x3           //st (row 6)
+
+    subs        x10, x10, #8                //subtract 8 and go to end if 8x8
+
+    st1         {v18.8b},[x2], x3           //st (row 7)
+
+    beq         end_func
+
+    subs        x11, x11, #8
+    add         x20, x8, #4
+    csel        x8, x20, x8,gt
+    add         x20, x2, x7
+    csel        x2, x20, x2,gt
+    csel        x8, x12, x8,le
+    sub         x20, x2, x4
+    csel        x2, x20, x2,le
+    add         x20, x2, #8
+    csel        x2, x20, x2,le
+    csel        x11, x4, x11,le
+    bgt         lbl284
+    adrp        x14,  :got:col_for_intra_luma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
+lbl284:
+    add         x20, x0, #8
+    csel        x0, x20, x0,le
+
+    mov         x5,x2
+    ld1         {v31.8b},[x14],#8
+    smull       v12.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+    xtn         v10.8b,  v12.8h
+    sshr        v12.8h, v12.8h,#5
+    sqxtn       v11.8b,  v12.8h
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+    add         x9, x0, x9
+    sub         x9, x9, #1
+    dup         v26.8b,w9
+    movi        v16.8b, #8
+
+    sub         x4,x4,#8
+
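+//  pipelined kernel: v0 is reloaded each pass with the 16-byte reference
+//  window for the next 8x8 strip while the multiplies, round-shifts and
+//  stores of rows 4..7 of the previous strip drain through x5.
+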
+kernel_8_16_32:
+
+    sub         v8.8b,  v26.8b ,  v11.8b    //ref_main_idx
+    mov         v26.8b, v10.8b
+
+    subs        x11, x11, #8
+    sub         x6, x1, x9
+    tbl         v10.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
+    add         v8.8b,  v8.8b ,  v16.8b     //to compensate the pu1_src idx incremented by 8
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    tbl         v11.8b, {v0.16b},v5.8b      //load from ref_main_idx - 1 (row 7)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    sub         v9.8b,  v8.8b ,  v2.8b      //ref_main_idx - 1
+    add         x20, x0, #8
+    csel        x0, x20, x0,le
+    add         x20, x8, #4
+    csel        x8, x20, x8,gt
+    ld1         {v0.16b}, [x6]              //stores the 32 values reqd based on indices values (from least idx)
+
+    st1         {v24.8b},[x5], x3           //st (row 4)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
+
+    bgt         lbl323
+    adrp        x14,  :got:col_for_intra_luma
+    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
+lbl323:
+    csel        x8, x12, x8,le
+    dup         v27.8b,w0                   //row value inc or reset accordingly
+
+    sub         v4.8b,  v8.8b ,  v2.8b      //ref_main_idx (row 1)
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 0)
+    sub         v5.8b,  v9.8b ,  v2.8b      //ref_main_idx - 1 (row 1)
+
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 0)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    ld1         {v31.8b},[x14],#8
+    and         v6.8b,  v29.8b ,  v26.8b    //fract values in d1/ idx values in d0
+
+    st1         {v22.8b},[x5], x3           //(from previous loop)st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
+
+    sub         v8.8b,  v8.8b ,  v3.8b      //ref_main_idx (row 2)
+    tbl         v10.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
+    sub         v9.8b,  v9.8b ,  v3.8b      //ref_main_idx - 1 (row 2)
+
+    add         x20, x4, #8
+    csel        x11, x20, x11,le
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
+
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
+
+    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
+    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
+
+    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 3)
+    tbl         v14.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 2)
+    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx - 1 (row 3)
+
+    umull       v22.8h, v10.8b, v7.8b       //mul (row 1)
+    tbl         v15.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 2)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
+
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
+    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)
+
+    sub         v8.8b,  v8.8b ,  v3.8b      //ref_main_idx (row 4)
+    tbl         v10.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
+    sub         v9.8b,  v9.8b ,  v3.8b      //ref_main_idx - 1 (row 4)
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
+    tbl         v11.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
+
+    smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+    add         x5,x2,x3,lsl#2
+    add         x9, x0, x9
+
+    st1         {v24.8b},[x2], x3           //st (row 0)
+    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
+
+    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 5)
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 4)
+    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx - 1 (row 5)
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 3)
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 4)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 3)
+
+    st1         {v22.8b},[x2], x3           //st (row 1)
+    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
+
+    xtn         v10.8b,  v14.8h
+    sshr        v14.8h, v14.8h,#5
+
+    sub         v8.8b,  v8.8b ,  v3.8b      //ref_main_idx (row 6)
+    tbl         v21.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
+    sub         v9.8b,  v9.8b ,  v3.8b      //ref_main_idx - 1 (row 6)
+
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
+
+    st1         {v20.8b},[x2], x3           //st (row 2)
+    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
+
+    sub         x9, x9, #1
+    sqxtn       v11.8b,  v14.8h
+
+    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 7)
+    tbl         v14.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 6)
+    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx - 1 (row 7)
+
+    umull       v22.8h, v21.8b, v7.8b       //mul (row 5)
+    tbl         v15.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 6)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
+
+    add         v11.8b,  v27.8b ,  v11.8b   //ref_main_idx (add row)
+    dup         v26.8b,w9
+
+    st1         {v18.8b},[x2], x3           //st (row 3)
+    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
+
+    add         x2, x2, x3, lsl #2
+    sub         v11.8b,  v11.8b ,  v2.8b    //ref_main_idx -1 (sub 1)
+    add         x20, x7, x2
+    csel        x2, x20, x2,gt
+
+    sub         x20, x2, x4
+    csel        x2, x20, x2,le
+
+    subs        x10, x10, #8                //subtract 8 and go to end if 8x8
+
+    bne         kernel_8_16_32
+
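+//  epilogue: no new rows are started - only the last strip's row-7 gathers
+//  and the pending multiplies and stores of rows 4..7 are drained.
+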
+epil_8_16_32:
+    tbl         v10.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
+
+    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
+    tbl         v11.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
+    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
+
+    st1         {v24.8b},[x5], x3           //st (row 4)
+    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)
+
+    umull       v18.8h, v10.8b, v7.8b       //mul (row 7)
+    umlal       v18.8h, v11.8b, v6.8b       //mul (row 7)
+
+    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
+    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
+
+    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
+    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
+
+    st1         {v18.8b},[x5], x3           //st (row 7)
+
+    b           end_func
+
+sz_4_proc:
+    ld1         {v31.8b},[x14]
+    movi        v2.8b, #1                   //contains #1 for adding to get ref_main_idx - 1
+
+    movi        v3.8b, #2
+    adrp        x12, :got:idx_neg_idx_3_9   //load least idx table
+    ldr         x12, [x12, #:got_lo12:idx_neg_idx_3_9]
+
+    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
+    sub         x7, x5, #3
+
+    add         x12, x12, x7, lsl #4
+    mov         x8, x12
+
+    ldr         w9,  [x8]
+    sxtw        x9,w9
+
+    dup         v26.8b,w9                   //least idx added to final idx values
+    add         x6, x0, x4, lsl #1          //pu1_ref + 2nt
+
+    xtn         v6.8b,  v22.8h
+    sub         x6, x6, #9                  //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
+    sub         x6, x6, x9
+
+    ld1         {v0.16b}, [x6]              //stores the 32 values reqd based on indices values (from least idx)
+
+    movi        v29.8b, #31                 //contains #31 for vand operation
+
+    movi        v28.8b, #32
+
+    sshr        v22.8h, v22.8h,#5
+    sqxtn       v8.8b,  v22.8h
+
+    and         v6.8b,  v6.8b ,  v29.8b     //fract values in d1/ idx values in d0
+    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
+
+    movi        v27.8b, #7                  //row 0 to 7(row-1)
+    sub         v8.8b,  v8.8b ,  v2.8b      //ref_main_idx (add 1)
+    sub         v8.8b,  v26.8b ,  v8.8b     //ref_main_idx
+    add         v8.8b,  v8.8b ,  v27.8b     //to compensate the pu1_src idx incremented by 8
+    sub         v9.8b,  v8.8b ,  v2.8b      //ref_main_idx - 1
+
+    sub         v4.8b,  v8.8b ,  v2.8b      //row 1 ref_main_idx
+    sub         v5.8b,  v9.8b ,  v2.8b
+
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx (row 0)
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 0)
+
+
+    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
+    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx    (row 1)
+    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
+
+    sub         v8.8b,  v8.8b ,  v3.8b      //idx (row 2)
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
+    sub         v9.8b,  v9.8b ,  v3.8b      //idx+1 (row 2)
+
+    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
+    tbl         v12.8b, {v0.16b},v8.8b      //load from ref_main_idx    (row 2)
+    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
+
+    rshrn       v24.8b, v24.8h,#5           //round shift (row 0)
+
+    sub         v4.8b,  v4.8b ,  v3.8b      //idx (row 3)
+    tbl         v13.8b, {v0.16b},v9.8b      //load from ref_main_idx + 1 (row 2)
+    sub         v5.8b,  v5.8b ,  v3.8b      //idx+1 (row 3)
+
+    umull       v20.8h, v12.8b, v7.8b       //mul (row 2)
+    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
+    umlal       v20.8h, v13.8b, v6.8b       //mul (row 2)
+
+    st1         {v24.s}[0],[x2], x3         //st row 0
+    rshrn       v22.8b, v22.8h,#5           //round shift (row 1)
+
+    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
+
+    umull       v18.8h, v16.8b, v7.8b       //mul (row 3)
+    umlal       v18.8h, v17.8b, v6.8b       //mul (row 3)
+
+    st1         {v22.s}[0],[x2], x3         //st row 1
+    rshrn       v20.8b, v20.8h,#5           //round shift (row 2)
+
+    st1         {v20.s}[0],[x2], x3         //st row 2
+
+    rshrn       v18.8b, v18.8h,#5           //round shift (row 3)
+
+    st1         {v18.s}[0],[x2], x3         //st (row 3)
+
+end_func:
+    // ldmfd sp!,{x4-x12,x15}          //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_planar.s b/common/arm64/ihevc_intra_pred_luma_planar.s
new file mode 100644
index 0000000..d2f27a2
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_planar.s
@@ -0,0 +1,569 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_luma_planar.s
+//*
+//* @brief
+//*  contains function definitions for planar luma intra prediction.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//*  akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    luma intra prediction filter for planar input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] pi1_coeff
+//*  word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  intra prediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
+//                                  word32 src_strd,
+//                                  uword8* pu1_dst,
+//                                  word32 dst_strd,
+//                                  word32 nt,
+//                                  word32 mode,
+//                                  word32 pi1_coeff)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//x4 => nt
+//x5 => mode
+//x6 => pi1_coeff
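+
+//  the planar prediction computed below, in scalar form (sketch assembled
+//  from the comments in this file; src is pu1_ref, shift = log2(nt) + 1):
+//
+//      for(row = 0; row < nt; row++)
+//          for(col = 0; col < nt; col++)
+//              pu1_dst[row * dst_strd + col] = (uword8)
+//                  (((row + 1)      * src[nt - 1]             /* bottom-left */
+//                  + (col + 1)      * src[3 * nt + 1]         /* top-right   */
+//                  + (nt - 1 - row) * src[2 * nt + 1 + col]   /* top row     */
+//                  + (nt - 1 - col) * src[2 * nt - 1 - row]   /* left col    */
+//                  + nt) >> shift);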
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_planar_av8
+.extern gau1_ihevc_planar_factor
+.extern gau1_ihevc_planar_factor_1
+
+.type ihevc_intra_pred_luma_planar_av8, %function
+
+ihevc_intra_pred_luma_planar_av8:
+
+    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    adrp        x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
+    ldr         x11, [x11, #:got_lo12:gau1_ihevc_planar_factor]
+
+    clz         w5,w4
+    sub         x20, x5, #32
+    neg         x5, x20
+    dup         v14.8h,w5
+    neg         v14.8h, v14.8h              //shr value (so vneg)
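+//  32 - clz(nt) equals log2(nt) + 1 for the power-of-two block sizes, i.e.
+//  the final planar shift; v14 is kept negated so that sshl later acts as a
+//  right shift.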
+    dup         v2.8b,w4                    //nt
+    dup         v16.8h,w4                   //nt
+
+    sub         x6, x4, #1                  //nt-1
+    add         x6, x6, x0
+    ldr         w7,  [x6]
+    sxtw        x7,w7
+    dup         v0.8b,w7                    //src[nt-1]
+
+    add         x6, x4, x4,lsl #1           //3nt
+    add         x6, x6, #1                  //3nt + 1
+    add         x6, x6, x0
+    ldr         w7,  [x6]
+    sxtw        x7,w7
+    dup         v1.8b,w7                    //src[3nt+1]
+
+    add         x6, x4, x4                  //2nt
+    add         x14, x6, #1                 //2nt+1
+    sub         x6, x6, #1                  //2nt-1
+    add         x6, x6, x0                  //&src[2nt-1]
+    add         x14, x14, x0                //&src[2nt+1]
+
+    mov         x8, #1                      //row+1 (row is first 0)
+    sub         x9, x4, x8                  //nt-1-row (row is first 0)
+
+    dup         v5.8b,w8                    //row + 1
+    dup         v6.8b,w9                    //nt - 1 - row
+    mov         v7.8b, v5.8b                //mov #1 to d7 to be used for inc for row+1 and dec for nt-1-row
+
+    add         x12, x11, #1                //coeffs (to be reloaded after every row)
+    mov         x1, x4                      //nt (row counter) (dec after every row)
+    mov         x5, x2                      //dst (to be reloaded after every row and inc by dst_strd)
+    mov         x10, #8                     //increment for the coeffs
+    mov         x0, x14                     //&src[2nt+1] (to be reloaded after every row)
+
+    cmp         x4, #4
+    beq         tf_sz_4
+
+//@ ========== ***************** =====================
+prolog:
+tf_sz_8_16_32:
+
+    mov         x7, x4                      //column counter (set to no of cols)
+    lsr         x9, x4, #3                  //divide nt by 8
+    mul         x7, x7, x9                  //multiply width * height
+    adrp        x5, :got:gau1_ihevc_planar_factor_1 //loads table of coeffs
+    ldr         x5, [x5, #:got_lo12:gau1_ihevc_planar_factor_1]
+    sub         x6, x6, #7
+    mov         x8, x2
+    lsl         x9, x3, #3                  //8*stride
+    sub         x20, x9, #8                 //8*stride - 8
+    neg         x9, x20                     //x9 = 8 - 8*stride
+    mov         x10, x4                     //nt
+    sub         x10, x10, #8                //nt - 8
+
+col_loop_8_16_32:
+
+    ld1         {v8.8b},[x12]               //(1-8)load 8 coeffs [col+1]
+    dup         v12.8h,w4                   //(1)
+    ld1         {v4.8b},[x6]                //(1-8)src[2nt-1-row]
+    sub         v9.8b,  v2.8b ,  v8.8b      //(1-8)[nt-1-col]
+
+
+    umlal       v12.8h, v5.8b, v0.8b        //(1)(row+1)    *    src[nt-1]
+
+    ld1         {v3.8b},[x14]               //(1-8)load 8 src[2nt+1+col]
+    umlal       v12.8h, v8.8b, v1.8b        //(1)(col+1)    *    src[3nt+1]
+
+    dup         v20.8b, v4.8b[7]            //(1)
+    umlal       v12.8h, v6.8b, v3.8b        //(1)(nt-1-row)    *    src[2nt+1+col]
+
+    dup         v21.8b, v4.8b[6]            //(2)
+    umlal       v12.8h, v9.8b, v20.8b       //(1)(nt-1-col)    *    src[2nt-1-row]
+
+    dup         v30.8h,w4                   //(2)
+    add         v5.8b,  v5.8b ,  v7.8b      //(1)
+
+    sub         v6.8b,  v6.8b ,  v7.8b      //(1)
+
+    dup         v22.8b, v4.8b[5]            //(3)
+    umlal       v30.8h, v5.8b, v0.8b        //(2)
+
+    dup         v28.8h,w4                   //(3)
+    umlal       v30.8h, v8.8b, v1.8b        //(2)
+
+    umlal       v30.8h, v6.8b, v3.8b        //(2)
+    umlal       v30.8h, v9.8b, v21.8b       //(2)
+
+    sshl        v12.8h, v12.8h, v14.8h      //(1)shr
+
+    add         v5.8b,  v5.8b ,  v7.8b      //(2)
+    sub         v6.8b,  v6.8b ,  v7.8b      //(2)
+
+    xtn         v12.8b,  v12.8h             //(1)
+    umlal       v28.8h, v5.8b, v0.8b        //(3)
+
+    dup         v23.8b, v4.8b[4]            //(4)
+    umlal       v28.8h, v8.8b, v1.8b        //(3)
+
+    dup         v10.8h,w4                   //(4)
+    umlal       v28.8h, v6.8b, v3.8b        //(3)
+
+    st1         {v12.8b},[x2], x3           //(1)str 8 values
+    umlal       v28.8h, v9.8b, v22.8b       //(3)
+
+    sshl        v30.8h, v30.8h, v14.8h      //(2)shr
+
+    add         v5.8b,  v5.8b ,  v7.8b      //(3)
+    sub         v6.8b,  v6.8b ,  v7.8b      //(3)
+
+    xtn         v30.8b,  v30.8h             //(2)
+    umlal       v10.8h, v5.8b, v0.8b        //(4)
+
+    dup         v20.8b, v4.8b[3]            //(5)
+    umlal       v10.8h, v8.8b, v1.8b        //(4)
+
+    dup         v16.8h,w4                   //(5)
+    umlal       v10.8h, v6.8b, v3.8b        //(4)
+
+    st1         {v30.8b},[x2], x3           //(2)str 8 values
+    umlal       v10.8h, v9.8b, v23.8b       //(4)
+
+    sshl        v28.8h, v28.8h, v14.8h      //(3)shr
+
+    add         v5.8b,  v5.8b ,  v7.8b      //(4)
+    sub         v6.8b,  v6.8b ,  v7.8b      //(4)
+
+    xtn         v28.8b,  v28.8h             //(3)
+    umlal       v16.8h, v5.8b, v0.8b        //(5)
+
+    dup         v21.8b, v4.8b[2]            //(6)
+    umlal       v16.8h, v8.8b, v1.8b        //(5)
+
+    dup         v18.8h,w4                   //(6)
+    umlal       v16.8h, v6.8b, v3.8b        //(5)
+
+    st1         {v28.8b},[x2], x3           //(3)str 8 values
+    umlal       v16.8h, v9.8b, v20.8b       //(5)
+
+    sshl        v10.8h, v10.8h, v14.8h      //(4)shr
+    add         v5.8b,  v5.8b ,  v7.8b      //(5)
+    sub         v6.8b,  v6.8b ,  v7.8b      //(5)
+
+    xtn         v10.8b,  v10.8h             //(4)
+    umlal       v18.8h, v5.8b, v0.8b        //(6)
+
+    dup         v22.8b, v4.8b[1]            //(7)
+    umlal       v18.8h, v8.8b, v1.8b        //(6)
+
+    dup         v26.8h,w4                   //(7)
+    umlal       v18.8h, v6.8b, v3.8b        //(6)
+
+    st1         {v10.8b},[x2], x3           //(4)str 8 values
+    umlal       v18.8h, v9.8b, v21.8b       //(6)
+
+    sshl        v16.8h, v16.8h, v14.8h      //(5)shr
+
+    add         v5.8b,  v5.8b ,  v7.8b      //(6)
+    sub         v6.8b,  v6.8b ,  v7.8b      //(6)
+
+    xtn         v16.8b,  v16.8h             //(5)
+    umlal       v26.8h, v5.8b, v0.8b        //(7)
+
+    dup         v23.8b, v4.8b[0]            //(8)
+    umlal       v26.8h, v8.8b, v1.8b        //(7)
+
+    dup         v24.8h,w4                   //(8)
+    umlal       v26.8h, v6.8b, v3.8b        //(7)
+
+    st1         {v16.8b},[x2], x3           //(5)str 8 values
+    umlal       v26.8h, v9.8b, v22.8b       //(7)
+
+    sshl        v18.8h, v18.8h, v14.8h      //(6)shr
+
+    add         v5.8b,  v5.8b ,  v7.8b      //(7)
+    sub         v6.8b,  v6.8b ,  v7.8b      //(7)
+
+    xtn         v18.8b,  v18.8h             //(6)
+    umlal       v24.8h, v5.8b, v0.8b        //(8)
+
+
+    umlal       v24.8h, v8.8b, v1.8b        //(8)
+
+    umlal       v24.8h, v6.8b, v3.8b        //(8)
+
+    st1         {v18.8b},[x2], x3           //(6)str 8 values
+    umlal       v24.8h, v9.8b, v23.8b       //(8)
+
+    sshl        v26.8h, v26.8h, v14.8h      //(7)shr
+
+    subs        x7, x7, #8
+
+    beq         epilog
+
+    subs        x1, x1, #8                  //row counter
+    add         x20, x12, #8                //col inc
+    csel        x12, x20, x12,gt
+    add         x20, x14, #8                //also for col inc
+    csel        x14, x20, x14,gt
+    csel        x1, x4, x1,le               //nt reloaded (refresh the value)
+    add         x20, x11, #1                //x12 reset
+    csel        x12, x20, x12,le
+
+    csel        x14, x0, x14,le             //x14 reset
+    ld1         {v8.8b},[x12]               //(1n)(1-8)load 8 coeffs [col+1]
+
+    sub         x20, x6, #8                 //for next set of rows
+    csel        x6, x20, x6,le
+    ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]
+
+    add         x20, x5, #8
+    csel        x5, x20, x5,le
+    dup         v12.8h,w4                   //(1n)(1)
+
+    ld1         {v5.8b},[x5]
+
+    ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row]
+    sub         v9.8b,  v2.8b ,  v8.8b      //(1n)(1-8)[nt-1-col]
+
+    dup         v20.8b, v4.8b[7]            //(1n)(1)
+    sub         v6.8b,  v2.8b ,  v5.8b      //nt - 1 - row
+
+    beq         epilog
+
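+//  pipelined planar kernel: the stores of rows (7) and (8) from the previous
+//  pass are interleaved with the accumulations for the next eight rows; v5
+//  (row + 1) and v6 (nt - 1 - row) are stepped by the constant 1 in v7.
+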
+kernel_plnr:
+
+    cmp         x1, #0                      // (cond loop)
+    sshl        v24.8h, v24.8h, v14.8h      //(8)shr
+
+    xtn         v26.8b,  v26.8h             //(7)
+    umlal       v12.8h, v5.8b, v0.8b        //(1)(row+1)    *    src[nt-1]
+
+    xtn         v24.8b,  v24.8h             //(8)
+    umlal       v12.8h, v8.8b, v1.8b        //(1)(col+1)    *    src[3nt+1]
+
+    dup         v21.8b, v4.8b[6]            //(2)
+    umlal       v12.8h, v6.8b, v3.8b        //(1)(nt-1-row)    *    src[2nt+1+col]
+
+    dup         v30.8h,w4                   //(2)
+    umlal       v12.8h, v9.8b, v20.8b       //(1)(nt-1-col)    *    src[2nt-1-row]
+
+    st1         {v26.8b},[x2], x3           //(7)str 8 values
+    add         v5.8b,  v5.8b ,  v7.8b      //(1)
+
+    st1         {v24.8b},[x2], x3           //(8)str 8 values
+    sub         v6.8b,  v6.8b ,  v7.8b      //(1)
+
+    add         x20, x2, x9                 //since more cols to fill, dst + 8 - 8*strd (cond loop)
+    csel        x2, x20, x2,gt
+    umlal       v30.8h, v5.8b, v0.8b        //(2)
+
+    sub         x20, x2, x10                //else go to next set of rows, dst - (nt-8) (cond loop)
+    csel        x2, x20, x2,le
+    umlal       v30.8h, v8.8b, v1.8b        //(2)
+
+    dup         v22.8b, v4.8b[5]            //(3)
+    umlal       v30.8h, v6.8b, v3.8b        //(2)
+
+    dup         v28.8h,w4                   //(3)
+    umlal       v30.8h, v9.8b, v21.8b       //(2)
+
+    sshl        v12.8h, v12.8h, v14.8h      //(1)shr
+
+    add         v5.8b,  v5.8b ,  v7.8b      //(2)
+    csel        x1, x4, x1,le               //nt reloaded (refresh the value)    (cond loop)
+
+    sub         v6.8b,  v6.8b ,  v7.8b      //(2)
+    subs        x1, x1, #8                  //row counter (loop)
+
+    xtn         v12.8b,  v12.8h             //(1)
+    umlal       v28.8h, v5.8b, v0.8b        //(3)
+
+    dup         v23.8b, v4.8b[4]            //(4)
+    umlal       v28.8h, v8.8b, v1.8b        //(3)
+
+    dup         v10.8h,w4                   //(4)
+    umlal       v28.8h, v6.8b, v3.8b        //(3)
+
+    st1         {v12.8b},[x2], x3           //(1)str 8 values
+    umlal       v28.8h, v9.8b, v22.8b       //(3)
+
+    sshl        v30.8h, v30.8h, v14.8h      //(2)shr
+
+    add         v5.8b,  v5.8b ,  v7.8b      //(3)
+
+    sub         v6.8b,  v6.8b ,  v7.8b      //(3)
+
+    xtn         v30.8b,  v30.8h             //(2)
+    umlal       v10.8h, v5.8b, v0.8b        //(4)
+
+    dup         v20.8b, v4.8b[3]            //(5)
+    umlal       v10.8h, v8.8b, v1.8b        //(4)
+
+    dup         v16.8h,w4                   //(5)
+    umlal       v10.8h, v6.8b, v3.8b        //(4)
+
+    st1         {v30.8b},[x2], x3           //(2)str 8 values
+    umlal       v10.8h, v9.8b, v23.8b       //(4)
+
+    sshl        v28.8h, v28.8h, v14.8h      //(3)shr
+
+    add         v5.8b,  v5.8b ,  v7.8b      //(4)
+
+    sub         v6.8b,  v6.8b ,  v7.8b      //(4)
+
+    xtn         v28.8b,  v28.8h             //(3)
+    umlal       v16.8h, v5.8b, v0.8b        //(5)
+
+    dup         v21.8b, v4.8b[2]            //(6)
+    umlal       v16.8h, v8.8b, v1.8b        //(5)
+
+    dup         v18.8h,w4                   //(6)
+    umlal       v16.8h, v6.8b, v3.8b        //(5)
+
+    st1         {v28.8b},[x2], x3           //(3)str 8 values
+    umlal       v16.8h, v9.8b, v20.8b       //(5)
+
+    add         x20, x11, #1                //x12 reset (cond loop)
+    csel        x12, x20, x12,le
+    sshl        v10.8h, v10.8h, v14.8h      //(4)shr
+
+    add         x20, x12, #8                //col inc (cond loop)
+    csel        x12, x20, x12,gt
+    add         v5.8b,  v5.8b ,  v7.8b      //(5)
+
+    add         x20, x14, #8                //also for col inc (cond loop)
+    csel        x14, x20, x14,gt
+    sub         v6.8b,  v6.8b ,  v7.8b      //(5)
+
+    xtn         v10.8b,  v10.8h             //(4)
+    umlal       v18.8h, v5.8b, v0.8b        //(6)
+
+    dup         v22.8b, v4.8b[1]            //(7)
+    umlal       v18.8h, v8.8b, v1.8b        //(6)
+
+    dup         v26.8h,w4                   //(7)
+    umlal       v18.8h, v6.8b, v3.8b        //(6)
+
+    st1         {v10.8b},[x2], x3           //(4)str 8 values
+    umlal       v18.8h, v9.8b, v21.8b       //(6)
+
+    csel        x14, x0, x14,le             //x14 reset (cond loop)
+    sshl        v16.8h, v16.8h, v14.8h      //(5)shr
+
+    sub         x20, x6, #8                 //for next set of rows (cond loop)
+    csel        x6, x20, x6,le
+    add         v5.8b,  v5.8b ,  v7.8b      //(6)
+
+    add         x20, x5, #8                 // (cond loop)
+    csel        x5, x20, x5,le
+    sub         v6.8b,  v6.8b ,  v7.8b      //(6)
+
+    xtn         v16.8b,  v16.8h             //(5)
+    umlal       v26.8h, v5.8b, v0.8b        //(7)
+
+    dup         v23.8b, v4.8b[0]            //(8)
+    umlal       v26.8h, v8.8b, v1.8b        //(7)
+
+    dup         v24.8h,w4                   //(8)
+    umlal       v26.8h, v6.8b, v3.8b        //(7)
+
+    st1         {v16.8b},[x2], x3           //(5)str 8 values
+    umlal       v26.8h, v9.8b, v22.8b       //(7)
+
+    ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row]
+    sshl        v18.8h, v18.8h, v14.8h      //(6)shr
+
+    add         v5.8b,  v5.8b ,  v7.8b      //(7)
+
+    sub         v6.8b,  v6.8b ,  v7.8b      //(7)
+
+    xtn         v18.8b,  v18.8h             //(6)
+    umlal       v24.8h, v5.8b, v0.8b        //(8)
+
+    ld1         {v5.8b},[x5]                //(row+1 value)
+    umlal       v24.8h, v8.8b, v1.8b        //(8)
+
+    dup         v20.8b, v4.8b[7]            //(1n)(1)
+    umlal       v24.8h, v6.8b, v3.8b        //(8)
+
+    st1         {v18.8b},[x2], x3           //(6)str 8 values
+    umlal       v24.8h, v9.8b, v23.8b       //(8)
+
+    ld1         {v8.8b},[x12]               //(1n)(1-8)load 8 coeffs [col+1]
+    sub         v6.8b,  v2.8b ,  v5.8b      //(nt-1-row) value
+
+    subs        x7, x7, #8                  //col counter
+
+    ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]
+    sshl        v26.8h, v26.8h, v14.8h      //(7)shr
+
+    dup         v12.8h,w4                   //(1n)(1)
+    sub         v9.8b,  v2.8b ,  v8.8b      //(1n)(1-8)[nt-1-col]
+
+    bne         kernel_plnr
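+
+// The (1)..(8) tags above mark eight software-pipelined iterations of the
+// planar kernel: loads, multiply-accumulates, shifts and stores belonging to
+// different output rows are interleaved to hide NEON latencies. The epilog
+// below drains the in-flight rows (7) and (8).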
+
+epilog:
+
+    xtn         v26.8b,  v26.8h             //(7)
+    st1         {v26.8b},[x2], x3           //(7)str 8 values
+
+    sshl        v24.8h, v24.8h, v14.8h      //(8)shr
+    xtn         v24.8b,  v24.8h             //(8)
+    st1         {v24.8b},[x2], x3           //(8)str 8 values
+
+//@ ========== ***************** =====================
+
+    beq         end_loop
+
+tf_sz_4:
+    ld1         {v10.8b},[x14]              //load src[2nt+1+col]
+    ld1         {v8.8b},[x12], x10          //load 8 coeffs [col+1]
+loop_sz_4:
+    mov         x10, #4                     //reduce inc to #4 for 4x4
+    ldrb        w7,  [x6], #-1              //src[2nt-1-row] (dec to take into account row)
+    sxtw        x7,w7
+    dup         v4.8b,w7                    //src[2nt-1-row]
+
+    sub         v9.8b,  v2.8b ,  v8.8b      //[nt-1-col]
+
+    umull       v12.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
+    umlal       v12.8h, v6.8b, v10.8b       //(nt-1-row)    *    src[2nt+1+col]
+    umlal       v12.8h, v8.8b, v1.8b        //(col+1)    *    src[3nt+1]
+    umlal       v12.8h, v9.8b, v4.8b        //(nt-1-col)    *    src[2nt-1-row]
+//    vadd.i16    q6, q6, q8            @add (nt)
+//    vshl.s16     q6, q6, q7            @shr
+//    vmovn.i16     d12, q6
+    rshrn       v12.8b, v12.8h,#3
+    st1         {v12.s}[0],[x2], x3
+
+    add         v5.8b,  v5.8b ,  v7.8b      //row++ [(row+1)++]
+    sub         v6.8b,  v6.8b ,  v7.8b      //[nt-1-row]--
+    subs        x1, x1, #1
+
+    bne         loop_sz_4
+
+end_loop:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
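+
+// A minimal C sketch of the per-pixel planar sum that both the pipelined
+// kernel and loop_sz_4 evaluate (illustrative assumption only; variable
+// names follow the comments above and are not part of this file):
+//
+//   for(row = 0; row < nt; row++)
+//       for(col = 0; col < nt; col++)
+//           pu1_dst[row * dst_strd + col] = (uword8)(
+//               ((row + 1)      * pu1_ref[nt - 1]              /* bottom-left  */
+//              + (nt - 1 - row) * pu1_ref[2 * nt + 1 + col]    /* top row      */
+//              + (col + 1)      * pu1_ref[3 * nt + 1]          /* top-right    */
+//              + (nt - 1 - col) * pu1_ref[2 * nt - 1 - row]    /* left column  */
+//              + nt) >> (log2_nt + 1));
+//
+// For nt == 4 the "+ nt, >> 3" rounding is folded into the rshrn #3 above.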
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_vert.s b/common/arm64/ihevc_intra_pred_luma_vert.s
new file mode 100644
index 0000000..56a20a0
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_vert.s
@@ -0,0 +1,432 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_intra_pred_luma_vert.s
+//*
+//* @brief
+//*  contains function definitions for intra prediction vertical filtering.
+//* functions are coded using neon  intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//*  akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*    luma intra prediction filter for vertical input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//*  uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] nt
+//*  size of transform block
+//*
+//* @param[in] mode
+//*  type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_ver(uword8* pu1_ref,
+//                               word32 src_strd,
+//                               uword8* pu1_dst,
+//                               word32 dst_strd,
+//                               word32 nt,
+//                               word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+//    nt
+//    mode
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_ver_av8
+
+.type ihevc_intra_pred_luma_ver_av8, %function
+
+ihevc_intra_pred_luma_ver_av8:
+
+    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    lsl         x5, x4, #1                  //2nt
+
+    cmp         x4, #16
+    beq         blk_16
+    blt         blk_4_8
+
+    add         x5, x5, #1                  //2nt+1
+    add         x6, x0, x5                  //&src[2nt+1]
+
+copy_32:
+    add         x5, x2, x3
+    ld1         {v20.8b, v21.8b}, [x6],#16  //16 loads (col 0:15)
+    add         x8, x5, x3
+
+    add         x10, x8, x3
+    ld1         {v22.8b, v23.8b}, [x6]      //16 loads (col 16:31)
+    lsl         x11, x3, #2
+
+    add         x11, x11, #-16
+    st1         {v20.8b, v21.8b}, [x2],#16
+    st1         {v20.8b, v21.8b}, [x5],#16
+    st1         {v20.8b, v21.8b}, [x8],#16
+    st1         {v20.8b, v21.8b}, [x10],#16
+
+    st1         {v22.8b, v23.8b}, [x2], x11
+    st1         {v22.8b, v23.8b}, [x5], x11
+    st1         {v22.8b, v23.8b}, [x8], x11
+    st1         {v22.8b, v23.8b}, [x10], x11
+
+    subs        x4, x4, #8
+
+kernel_copy_32:
+    st1         {v20.8b, v21.8b}, [x2],#16
+    st1         {v20.8b, v21.8b}, [x5],#16
+    st1         {v20.8b, v21.8b}, [x8],#16
+    st1         {v20.8b, v21.8b}, [x10],#16
+
+    st1         {v22.8b, v23.8b}, [x2], x11
+    st1         {v22.8b, v23.8b}, [x5], x11
+    st1         {v22.8b, v23.8b}, [x8], x11
+    st1         {v22.8b, v23.8b}, [x10], x11
+
+    subs        x4, x4, #8
+
+    st1         {v20.8b, v21.8b}, [x2],#16
+    st1         {v20.8b, v21.8b}, [x5],#16
+    st1         {v20.8b, v21.8b}, [x8],#16
+    st1         {v20.8b, v21.8b}, [x10],#16
+
+    st1         {v22.8b, v23.8b}, [x2], x11
+    st1         {v22.8b, v23.8b}, [x5], x11
+    st1         {v22.8b, v23.8b}, [x8], x11
+    st1         {v22.8b, v23.8b}, [x10], x11
+
+    bne         kernel_copy_32
+
+    st1         {v20.8b, v21.8b}, [x2],#16
+    st1         {v20.8b, v21.8b}, [x5],#16
+    st1         {v20.8b, v21.8b}, [x8],#16
+    st1         {v20.8b, v21.8b}, [x10],#16
+
+    st1         {v22.8b, v23.8b}, [x2], x11
+    st1         {v22.8b, v23.8b}, [x5], x11
+    st1         {v22.8b, v23.8b}, [x8], x11
+    st1         {v22.8b, v23.8b}, [x10], x11
+
+    b           end_func
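+
+// For nt == 32 no gradient filtering is applied; the path above simply
+// replicates the 32 top reference samples into every output row. A C sketch
+// (illustrative assumption, not part of the original source):
+//
+//   for(row = 0; row < 32; row++)
+//       memcpy(pu1_dst + row * dst_strd, pu1_ref + 2 * 32 + 1, 32);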
+
+blk_16:
+    add         x6, x0, x5                  //&src[2nt]
+
+    ldrb        w11, [x6], #1               //src[2nt]
+    sxtw        x11,w11
+
+    dup         v22.16b,w11                 //src[2nt]
+    ldrb        w12, [x6]                   //src[2nt+1]
+    sxtw        x12,w12
+
+    ld1         {v16.8b, v17.8b}, [x6]      //ld for repl to cols src[2nt+1+col(0:15)] (0 ignored for stores)
+    add         x6, x6, #-17                //subtract 17 to take it to src[2nt-1-row(15)]
+
+    dup         v24.16b,w12                 //src[2nt+1]
+    dup         v30.8h,w12
+    lsl         x5, x3, #3                  //8*stride
+
+    ld1         {v26.16b}, [x6],#16         //load src[2nt-1-row](rows 0:15)
+    add         x5, x2, x5                  //x5 -> &dst[8*dst_strd] (second half of rows)
+
+    movi        d18, #0x00000000000000ff
+    uhsub       v26.16b,  v26.16b ,  v22.16b //(src[2nt-1-row] - src[2nt])>>1
+    //vsubl.u8    q0, d26, d22
+    //vsubl.u8    q14, d27, d22
+
+    //vshr.s16    q0, q0, #1
+    //vshr.s16    q14, q14, #1
+
+    mov         v19.d[0],v17.d[0]
+    //vaddl.s8    q0, d24, d26
+    sxtl        v0.8h, v26.8b
+    sxtl2       v28.8h, v26.16b
+    sqadd       v0.8h,  v0.8h ,  v30.8h
+    sqadd       v28.8h,  v28.8h ,  v30.8h
+
+    movi        d10, #0x00000000000000ff
+    //vaddl.s8    q1, d25, d27
+
+    sqxtun      v24.8b, v28.8h
+    sqxtun2     v24.16b, v0.8h
+    //vmovn.u16    d25, q0
+    //vmovn.u16    d24, q1
+
+    rev64       v24.16b,  v24.16b
+    mov         v25.d[0], v24.d[1]
+
+    mov         v11.d[0],v17.d[0]
+
+    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
+    bsl         v10.8b,  v25.8b ,  v16.8b
+
+    movi        d8, #0x00000000000000ff
+    mov         v9.d[0],v17.d[0]
+
+    movi        d6, #0x00000000000000ff
+    mov         v7.d[0],v17.d[0]
+
+    st1         {v18.8b, v19.8b}, [x2], x3
+    sshr        d24, d24,#8
+
+    st1         {v10.8b, v11.8b}, [x5], x3
+    sshr        d25, d25,#8
+
+
+    bsl         v8.8b,  v24.8b ,  v16.8b
+    bsl         v6.8b,  v25.8b ,  v16.8b
+
+    st1         {v8.8b, v9.8b}, [x2], x3
+    sshr        d24, d24,#8
+
+    st1         {v6.8b, v7.8b}, [x5], x3
+    sshr        d25, d25,#8
+
+    subs        x4, x4,#8
+
+    movi        d18, #0x00000000000000ff
+    //vmov.i64    d19, d17
+
+    movi        d10, #0x00000000000000ff
+    //vmov.i64    d11, d17
+
+
+loop_16:
+
+
+    movi        d8, #0x00000000000000ff
+
+    movi        d6, #0x00000000000000ff
+
+    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
+    bsl         v10.8b,  v25.8b ,  v16.8b
+
+    st1         {v18.8b, v19.8b}, [x2], x3
+    sshr        d24, d24,#8
+
+    st1         {v10.8b, v11.8b}, [x5], x3
+    sshr        d25, d25,#8
+
+    movi        d18, #0x00000000000000ff
+
+    movi        d10, #0x00000000000000ff
+
+    bsl         v8.8b,  v24.8b ,  v16.8b
+    bsl         v6.8b,  v25.8b ,  v16.8b
+
+    st1         {v8.8b, v9.8b}, [x2], x3
+    sshr        d24, d24,#8
+
+    st1         {v6.8b, v7.8b}, [x5], x3
+    sshr        d25, d25,#8
+
+    subs        x4, x4, #4
+
+    bne         loop_16
+
+    movi        d8, #0x00000000000000ff
+
+    movi        d6, #0x00000000000000ff
+
+    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
+    bsl         v10.8b,  v25.8b ,  v16.8b
+
+    st1         {v18.8b, v19.8b}, [x2], x3
+    sshr        d24, d24,#8
+
+    st1         {v10.8b, v11.8b}, [x5], x3
+    sshr        d25, d25,#8
+
+    bsl         v8.8b,  v24.8b ,  v16.8b
+    bsl         v6.8b,  v25.8b ,  v16.8b
+
+    st1         {v8.8b, v9.8b}, [x2], x3
+
+    st1         {v6.8b, v7.8b}, [x5], x3
+
+    b           end_func
+
+
+blk_4_8:
+    movi        d11, #0x00000000000000ff
+    add         x6, x0, x5                  //&src[2nt]
+
+    movi        d10, #0x00000000000000ff
+    ldrb        w11, [x6], #1               //src[2nt]
+    sxtw        x11,w11
+
+    dup         v22.8b,w11                  //src[2nt]
+    ldrb        w12, [x6]                   //src[2nt+1]
+    sxtw        x12,w12
+
+    ld1         {v16.8b},[x6]               //ld for repl to cols src[2nt+1+col(0:3 or 0:7)](0 ignored for st)
+    add         x6, x6, #-9                 //subtract 9 to take it to src[2nt-1-row(7)]
+
+    dup         v24.8b,w12                  //src[2nt+1]
+    dup         v30.8h,w12
+
+    ld1         {v26.8b},[x6],#8            //load src[2nt-1-row](rows 0:7)
+
+    movi        d18, #0x00000000000000ff
+    uhsub       v26.8b,  v26.8b ,  v22.8b   //(src[2nt-1-row] - src[2nt])>>1
+    //vsubl.u8    q13, d26, d22
+
+    //vshr.s16    q13, q13, #1
+
+    movi        d19, #0x00000000000000ff
+    sxtl        v26.8h, v26.8b
+    //vaddl.s8    q0, d24, d26
+    sqadd       v0.8h,  v26.8h ,  v30.8h
+
+    sqxtun      v24.8b, v0.8h
+    //vmovn.s16    d24, q0
+
+    rev64       v24.8b,  v24.8b
+
+    cmp         x4, #4
+    beq         blk_4
+
+    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
+
+    st1         {v18.8b},[x2], x3
+    sshr        d24, d24,#8
+
+    movi        d18, #0x00000000000000ff
+
+    bsl         v19.8b,  v24.8b ,  v16.8b
+
+    st1         {v19.8b},[x2], x3
+    sshr        d24, d24,#8
+
+    movi        d19, #0x00000000000000ff
+
+    bsl         v10.8b,  v24.8b ,  v16.8b
+
+    st1         {v10.8b},[x2], x3
+    sshr        d24, d24,#8
+
+    movi        d10, #0x00000000000000ff
+
+    bsl         v11.8b,  v24.8b ,  v16.8b
+
+    st1         {v11.8b},[x2], x3
+    sshr        d24, d24,#8
+
+    movi        d11, #0x00000000000000ff
+
+    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
+
+    st1         {v18.8b},[x2], x3
+    sshr        d24, d24,#8
+
+    bsl         v19.8b,  v24.8b ,  v16.8b
+
+    st1         {v19.8b},[x2], x3
+    sshr        d24, d24,#8
+
+    bsl         v10.8b,  v24.8b ,  v16.8b
+
+    st1         {v10.8b},[x2], x3
+    sshr        d24, d24,#8
+
+    bsl         v11.8b,  v24.8b ,  v16.8b
+
+    st1         {v11.8b},[x2], x3
+    sshr        d24, d24,#8
+
+    b           end_func
+
+
+blk_4:
+    bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
+
+    st1         {v18.s}[0],[x2], x3
+    sshr        d24, d24,#8
+
+    bsl         v19.8b,  v24.8b ,  v16.8b
+
+    st1         {v19.s}[0],[x2], x3
+    sshr        d24, d24,#8
+
+    bsl         v10.8b,  v24.8b ,  v16.8b
+
+    st1         {v10.s}[0],[x2], x3
+    sshr        d24, d24,#8
+
+    bsl         v11.8b,  v24.8b ,  v16.8b
+    st1         {v11.s}[0],[x2], x3
+
+
+end_func:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
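+
+// For nt <= 16 the first output column is gradient-filtered before the
+// vertical copy (uhsub + sqadd + sqxtun above). A C sketch of what blk_16
+// and blk_4_8 compute per row (illustrative assumption; clip_u8() is a
+// hypothetical clamp to 0..255):
+//
+//   for(row = 0; row < nt; row++)
+//   {
+//       word16 s = pu1_ref[2 * nt + 1]
+//                + ((pu1_ref[2 * nt - 1 - row] - pu1_ref[2 * nt]) >> 1);
+//       pu1_dst[row * dst_strd] = clip_u8(s);
+//       /* columns 1..nt-1 are a plain copy of pu1_ref[2 * nt + 1 + col] */
+//   }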
+
+
+
+
+
diff --git a/common/arm64/ihevc_itrans_recon_16x16.s b/common/arm64/ihevc_itrans_recon_16x16.s
new file mode 100644
index 0000000..90df840
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_16x16.s
@@ -0,0 +1,1240 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// *  ihevc_itrans_recon_16x16.s
+// *
+// * @brief
+// *  contains function definitions for single stage  inverse transform
+// *
+// * @author
+// * anand s
+// *
+// * @par list of functions:
+// *  - ihevc_itrans_recon_16x16()
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+//*/
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// *  this function performs inverse transform  and reconstruction for 16x16
+// * input block
+// *
+// * @par description:
+// *  performs inverse transform and adds the prediction  data and clips output
+// * to 8 bit
+// *
+// * @param[in] pi2_src
+// *  input 16x16 coefficients
+// *
+// * @param[in] pi2_tmp
+// *  temporary 16x16 buffer for storing inverse
+// *  transform 1st stage output
+// *
+// * @param[in] pu1_pred
+// *  prediction 16x16 block
+// *
+// * @param[out] pu1_dst
+// *  output 16x16 block
+// *
+// * @param[in] src_strd
+// *  input stride
+// *
+// * @param[in] pred_strd
+// *  prediction stride
+// *
+// * @param[in] dst_strd
+// *  output stride
+// *
+// * @param[in] shift
+// *  output shift
+// *
+// * @param[in] x12
+// *  zero columns in pi2_src
+// *
+// * @returns  void
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+// */
+
+//void ihevc_itrans_recon_16x16(word16 *pi2_src,
+//                            word16 *pi2_tmp,
+//                            uword8 *pu1_pred,
+//                            uword8 *pu1_dst,
+//                            word32 src_strd,
+//                            word32 pred_strd,
+//                            word32 dst_strd,
+//                            word32 x12,
+//                            word32 x11)
+
+//**************variables vs registers*************************
+//    x0 => *pi2_src
+//    x1 => *pi2_tmp
+//    x2 => *pu1_pred
+//    x3 => *pu1_dst
+//    src_strd
+//    pred_strd
+//    dst_strd
+//    x12
+//    x11
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+
+
+
+.set shift_stage1_idct ,   7
+.set shift_stage2_idct ,   12
+//#define zero_cols         x12
+//#define zero_rows         x11
+.globl ihevc_itrans_recon_16x16_av8
+
+.extern g_ai2_ihevc_trans_16_transpose
+
+.type ihevc_itrans_recon_16x16_av8, %function
+
+ihevc_itrans_recon_16x16_av8:
+
+    ldr         w11, [sp]
+    // stmfd sp!,{x4-x12,x14}
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    stp         x5, x6,[sp,#-16]!
+//    add             sp,sp,#40
+
+
+
+//    ldr            x8,[sp,#4]     @ prediction stride
+//    ldr            x7,[sp,#8]     @ destination stride
+    mov         x6, x4 // src stride
+    mov         x12, x7
+
+
+
+    adrp        x14, :got:g_ai2_ihevc_trans_16_transpose
+    ldr         x14, [x14, #:got_lo12:g_ai2_ihevc_trans_16_transpose]
+    ld1         {v0.4h, v1.4h, v2.4h, v3.4h},[x14] ////d0,d1 are used for storing the constant data
+    mov         x7,#0xffff
+    and         x12,x12,x7
+    and         x11,x11,x7
+    lsl         x6, x6, #1                  // x sizeof(word16)
+    add         x9,x0,x6, lsl #1            // 2 rows
+
+    add         x10,x6,x6, lsl #1           // 3 rows
+    add         x5,x6,x6,lsl #2
+    mov         x7,#0xfff0
+
+    cmp         x12,x7
+    bge         zero_12cols_decision
+
+    mov         x19,#0xff00
+    cmp         x12,x19
+    bge         zero_8cols_decision
+
+
+
+
+    mov         x14,#4
+    cmp         x11,x7
+    sub         x20,x6,#0
+    neg         x20, x20
+    csel        x10,x20,x10,ge
+
+    mov         x19,#0xff00
+    cmp         x11,x19
+    csel        x8, x5, x8,ge
+    sub         x20,x8,#0
+    neg         x20, x20
+    csel        x8,x20,x8,ge
+    csel        x8, x10, x8,lt
+    add         x5,x5,x6,lsl #3
+    sub         x20,x5,#0
+    neg         x5, x20
+
+    b           first_stage_top_four_bottom_four
+
+zero_12cols_decision:
+    mov         x14,#1
+    mov         x19,#0xff00
+    cmp         x11,x19
+    csel        x8, x5, x8,ge
+    csel        x8, x10, x8,lt
+    add         x5,x5,x6,lsl #3
+    sub         x20,x5,#0
+    neg         x5, x20
+
+    b           first_stage_top_four_bottom_four
+
+zero_8cols_decision:
+    mov         x14,#2
+    mov         x8,x5
+    sub         x20,x8,#0
+    neg         x8, x20
+    mov         x19,#0xff00
+    cmp         x11,x19
+    csel        x8, x10, x8,lt
+    add         x5,x5,x6,lsl #3
+    sub         x20,x5,#0
+    neg         x5, x20
+    cmp         x11,x7
+    sub         x20,x6,#0
+    neg         x20, x20
+    csel        x10,x20,x10,ge
+
+
+    b           first_stage_top_four_bottom_four
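+
+// zero_cols (x12) and zero_rows (x11) carry one bit per all-zero column/row
+// of pi2_src; the ladder above converts the column mask into the number of
+// 4-column first-stage passes (x14) and picks the row strides. Roughly, in
+// C (illustrative assumption):
+//
+//   if(zero_cols >= 0xfff0)      num_passes = 1;  /* only cols 0-3 non-zero */
+//   else if(zero_cols >= 0xff00) num_passes = 2;  /* only cols 0-7 non-zero */
+//   else                         num_passes = 4;  /* all 16 columns         */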
+
+
+//d0[0]=    64        d2[0]=64
+//d0[1]= 90        d2[1]=57
+//d0[2]= 89        d2[2]=50
+//d0[3]= 87        d2[3]=43
+//d1[0]= 83         d3[0]=36
+//d1[1]= 80        d3[1]=25
+//d1[2]= 75        d3[2]=18
+//d1[3]= 70        d3[3]=9
+
+
+
+first_stage:
+    add         x0,x0,#8
+    add         x9,x9,#8
+
+first_stage_top_four_bottom_four:
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v11.4h},[x9],x6
+    ld1         {v6.4h},[x0],x10
+    ld1         {v7.4h},[x9],x10
+    cmp         x11,x7
+    bge         skip_load4rows
+
+    ld1         {v4.4h},[x0],x6
+    ld1         {v5.4h},[x9],x6
+    ld1         {v8.4h},[x0],x8
+    ld1         {v9.4h},[x9],x8
+
+// registers used: q0,q1,q3,q5,q2,q4
+
+// d10 = x0
+// d6  = x1
+// d11 = x2
+// d7  = x3
+
+skip_load4rows:
+    smull       v24.4s, v6.4h, v0.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v6.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v7.4h, v2.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v7.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v7.4h, v2.4h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+    smull       v12.4s, v10.4h, v0.4h[0]
+    smlal       v12.4s, v11.4h, v0.4h[2]
+    smull       v14.4s, v10.4h, v0.4h[0]
+    smlal       v14.4s, v11.4h, v1.4h[2]
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v2.4h[2]
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v3.4h[2]
+
+    bge         skip_last12rows_kernel1
+
+
+    smlal       v24.4s, v8.4h, v1.4h[1]
+    smlal       v26.4s, v8.4h, v3.4h[3]
+    smlsl       v28.4s, v8.4h, v1.4h[3]
+    smlsl       v30.4s, v8.4h, v0.4h[3]
+
+
+    smlal       v24.4s, v9.4h, v1.4h[3]
+    smlsl       v26.4s, v9.4h, v2.4h[3]
+    smlsl       v28.4s, v9.4h, v0.4h[3]
+    smlal       v30.4s, v9.4h, v3.4h[3]
+
+
+
+
+
+    smlal       v12.4s, v4.4h, v1.4h[0]
+    smlal       v12.4s, v5.4h, v1.4h[2]
+    smlal       v14.4s, v4.4h, v3.4h[0]
+    smlsl       v14.4s, v5.4h, v3.4h[2]
+    smlsl       v16.4s, v4.4h, v3.4h[0]
+    smlsl       v16.4s, v5.4h, v0.4h[2]
+    smlsl       v18.4s, v4.4h, v1.4h[0]
+    smlsl       v18.4s, v5.4h, v2.4h[2]
+
+//d0[0]=    64        d2[0]=64
+//d0[1]= 90        d2[1]=57
+//d0[2]= 89        d2[2]=50
+//d0[3]= 87        d2[3]=43
+//d1[0]= 83         d3[0]=36
+//d1[1]= 80        d3[1]=25
+//d1[2]= 75        d3[2]=18
+//d1[3]= 70        d3[3]=9
+    mov         x19,#0xff00
+    cmp         x11,x19
+    bge         skip_last12rows_kernel1
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v11.4h},[x9],x6
+    ld1         {v6.4h},[x0],x10
+    ld1         {v7.4h},[x9],x10
+    ld1         {v4.4h},[x0],x6
+    ld1         {v5.4h},[x9],x6
+    ld1         {v8.4h},[x0],x5
+    ld1         {v9.4h},[x9],x5
+
+
+
+
+    smlal       v24.4s, v6.4h, v2.4h[1]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v6.4h, v1.4h[1]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v6.4h, v3.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v6.4h, v0.4h[1]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v7.4h, v2.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v7.4h, v0.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v7.4h, v2.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v7.4h, v3.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+    smlal       v24.4s, v8.4h, v3.4h[1]
+    smlsl       v26.4s, v8.4h, v1.4h[3]
+    smlal       v28.4s, v8.4h, v0.4h[1]
+    smlsl       v30.4s, v8.4h, v1.4h[1]
+
+
+    smlal       v24.4s, v9.4h, v3.4h[3]
+    smlsl       v26.4s, v9.4h, v3.4h[1]
+    smlal       v28.4s, v9.4h, v2.4h[3]
+    smlsl       v30.4s, v9.4h, v2.4h[1]
+
+
+
+
+
+    smlal       v12.4s, v10.4h, v0.4h[0]
+    smlal       v12.4s, v11.4h, v2.4h[2]
+    smlal       v12.4s, v4.4h, v3.4h[0]
+    smlal       v12.4s, v5.4h, v3.4h[2]
+
+
+
+
+    smlsl       v14.4s, v10.4h, v0.4h[0]
+    smlsl       v14.4s, v11.4h, v0.4h[2]
+    smlsl       v14.4s, v4.4h, v1.4h[0]
+    smlsl       v14.4s, v5.4h, v2.4h[2]
+
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v3.4h[2]
+    smlal       v16.4s, v4.4h, v1.4h[0]
+    smlal       v16.4s, v5.4h, v1.4h[2]
+
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v1.4h[2]
+    smlsl       v18.4s, v4.4h, v3.4h[0]
+    smlsl       v18.4s, v5.4h, v0.4h[2]
+
+skip_last12rows_kernel1:
+    add         v20.4s,  v12.4s ,  v24.4s
+    sub         v22.4s,  v12.4s ,  v24.4s
+
+    add         v12.4s,  v14.4s ,  v26.4s
+    sub         v24.4s,  v14.4s ,  v26.4s
+
+    add         v14.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+
+
+
+
+
+    sqrshrn     v30.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v19.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+    st1         {v30.4h, v31.4h},[x1],#16
+    st1         {v18.4h, v19.4h},[x1],#16
+    sub         x1,x1,#32
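+
+// Each first-stage pass ends with a butterfly of the even-coefficient
+// accumulators (a0..a3, in v12/v14/v16/v18) against the odd-coefficient
+// accumulators (b0..b3, in v24/v26/v28/v30), followed by a rounding shift.
+// Sketch of one output pair (illustrative assumption):
+//
+//   x0 = sat16((a0 + b0 + (1 << 6)) >> 7);   /* sqrshrn, shift_stage1_idct */
+//   x7 = sat16((a0 - b0 + (1 << 6)) >> 7);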
+
+    bge         skip_stage1_kernel_load
+
+first_stage_middle_eight:
+
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v11.4h},[x9],x6
+    ld1         {v6.4h},[x0],x10
+    ld1         {v7.4h},[x9],x10
+    ld1         {v4.4h},[x0],x6
+    ld1         {v5.4h},[x9],x6
+    ld1         {v8.4h},[x0],x8
+    ld1         {v9.4h},[x9],x8
+
+
+skip_stage1_kernel_load:
+    smull       v24.4s, v6.4h, v2.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v6.4h, v2.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v6.4h, v3.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v6.4h, v3.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v7.4h, v1.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v7.4h, v0.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v7.4h, v1.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v7.4h, v3.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+    smull       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v3.4h[2]
+    smull       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v2.4h[2]
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlsl       v16.4s, v11.4h, v1.4h[2]
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlsl       v18.4s, v11.4h, v0.4h[2]
+
+
+    cmp         x11,x7
+    bge         skip_last12rows_kernel2
+
+    smlsl       v24.4s, v8.4h, v3.4h[1]
+    smlal       v26.4s, v8.4h, v2.4h[1]
+    smlal       v28.4s, v8.4h, v0.4h[1]
+    smlal       v30.4s, v8.4h, v2.4h[3]
+
+
+    smlal       v24.4s, v9.4h, v0.4h[1]
+    smlal       v26.4s, v9.4h, v3.4h[1]
+    smlsl       v28.4s, v9.4h, v1.4h[1]
+    smlsl       v30.4s, v9.4h, v2.4h[1]
+
+
+
+    smlsl       v22.4s, v4.4h, v1.4h[0]
+    smlal       v22.4s, v5.4h, v2.4h[2]
+    smlsl       v20.4s, v4.4h, v3.4h[0]
+    smlal       v20.4s, v5.4h, v0.4h[2]
+    smlal       v16.4s, v4.4h, v3.4h[0]
+    smlal       v16.4s, v5.4h, v3.4h[2]
+    smlal       v18.4s, v4.4h, v1.4h[0]
+    smlsl       v18.4s, v5.4h, v1.4h[2]
+
+//d0[0]=    64        d2[0]=64
+//d0[1]= 90        d2[1]=57
+//d0[2]= 89        d2[2]=50
+//d0[3]= 87        d2[3]=43
+//d1[0]= 83         d3[0]=36
+//d1[1]= 80        d3[1]=25
+//d1[2]= 75        d3[2]=18
+//d1[3]= 70        d3[3]=9
+    mov         x19,#0xff00
+    cmp         x11,x19
+    bge         skip_last12rows_kernel2
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v11.4h},[x9],x6
+    ld1         {v6.4h},[x0],x10
+    ld1         {v7.4h},[x9],x10
+    ld1         {v4.4h},[x0],x6
+    ld1         {v5.4h},[x9],x6
+    ld1         {v8.4h},[x0],x5
+    ld1         {v9.4h},[x9],x5
+
+
+    smlsl       v24.4s, v6.4h, v3.4h[3]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v6.4h, v2.4h[3]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v7.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v7.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v7.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+    smlal       v24.4s, v8.4h, v2.4h[3]
+    smlal       v26.4s, v8.4h, v3.4h[3]
+    smlsl       v28.4s, v8.4h, v2.4h[1]
+    smlal       v30.4s, v8.4h, v0.4h[3]
+
+
+    smlal       v24.4s, v9.4h, v1.4h[3]
+    smlsl       v26.4s, v9.4h, v1.4h[1]
+    smlal       v28.4s, v9.4h, v0.4h[3]
+    smlsl       v30.4s, v9.4h, v0.4h[1]
+
+
+
+
+    smlal       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v1.4h[2]
+    smlsl       v22.4s, v4.4h, v3.4h[0]
+    smlal       v22.4s, v5.4h, v0.4h[2]
+
+
+
+    smlsl       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v3.4h[2]
+    smlal       v20.4s, v4.4h, v1.4h[0]
+    smlsl       v20.4s, v5.4h, v1.4h[2]
+
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v0.4h[2]
+    smlsl       v16.4s, v4.4h, v1.4h[0]
+    smlal       v16.4s, v5.4h, v2.4h[2]
+
+
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlsl       v18.4s, v11.4h, v2.4h[2]
+    smlal       v18.4s, v4.4h, v3.4h[0]
+    smlsl       v18.4s, v5.4h, v3.4h[2]
+
+skip_last12rows_kernel2:
+
+    add         v4.4s,  v22.4s ,  v24.4s
+    sub         v22.4s,  v22.4s ,  v24.4s
+
+    add         v6.4s,  v20.4s ,  v26.4s
+    sub         v24.4s,  v20.4s ,  v26.4s
+
+    add         v10.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+    sqrshrn     v18.4h, v4.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v31.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v30.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v20.4h, v6.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v23.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v21.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v22.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+    // registers used:    {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+
+
+
+
+
+    ld1         {v4.4h, v5.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],#16
+    sub         x1,x1,#32
+
+//d4=x0
+//d12=x1
+//d5=x2
+//d13=x3
+
+//d18=x4
+//d20=x5
+//d19=x6
+//d21=x7
+
+//d22=x8
+//d30=x9
+//d23=x10
+//d31=x11
+
+//d14=x12
+//d8=x13
+//d15=x14
+//d9=x15
+
+    umov        x15,v26.d[0]
+    umov        x16,v27.d[0]
+    umov        x19,v28.d[0]
+    umov        x20,v29.d[0]
+
+    trn1        v26.4h, v4.4h, v12.4h
+    trn2        v27.4h, v4.4h, v12.4h
+    trn1        v28.4h, v5.4h, v13.4h
+    trn2        v29.4h, v5.4h, v13.4h
+
+    trn1        v4.2s, v26.2s, v28.2s
+    trn2        v5.2s, v26.2s, v28.2s
+    trn1        v12.2s, v27.2s, v29.2s
+    trn2        v13.2s, v27.2s, v29.2s
+
+    trn1        v26.4h, v18.4h, v20.4h
+    trn2        v27.4h, v18.4h, v20.4h
+    trn1        v28.4h, v19.4h, v21.4h
+    trn2        v29.4h, v19.4h, v21.4h
+
+    trn1        v18.2s, v26.2s, v28.2s
+    trn2        v19.2s, v26.2s, v28.2s
+    trn1        v20.2s, v27.2s, v29.2s
+    trn2        v21.2s, v27.2s, v29.2s
+
+    trn1        v26.4h, v22.4h, v30.4h
+    trn2        v27.4h, v22.4h, v30.4h
+    trn1        v28.4h, v23.4h, v31.4h
+    trn2        v29.4h, v23.4h, v31.4h
+
+    trn1        v22.2s, v26.2s, v28.2s
+    trn2        v23.2s, v26.2s, v28.2s
+    trn1        v30.2s, v27.2s, v29.2s
+    trn2        v31.2s, v27.2s, v29.2s
+
+    trn1        v26.4h, v14.4h, v8.4h
+    trn2        v27.4h, v14.4h, v8.4h
+    trn1        v28.4h, v15.4h, v9.4h
+    trn2        v29.4h, v15.4h, v9.4h
+
+    trn1        v14.2s, v26.2s, v28.2s
+    trn2        v15.2s, v26.2s, v28.2s
+    trn1        v8.2s, v27.2s, v29.2s
+    trn2        v9.2s, v27.2s, v29.2s
+
+    mov         v26.d[0],x15
+    mov         v27.d[0],x16
+    mov         v28.d[0],x19
+    mov         v29.d[0],x20
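+
+// The trn1/trn2 pairs above implement a 4x4 transpose of 16-bit lanes:
+// trn on .4h interleaves halfwords of two rows, trn on .2s then swaps the
+// 32-bit halves. For four rows r0..r3 (sketch, illustrative assumption):
+//
+//   t0 = trn1.4h(r0, r1);  t1 = trn2.4h(r0, r1);
+//   t2 = trn1.4h(r2, r3);  t3 = trn2.4h(r2, r3);
+//   c0 = trn1.2s(t0, t2);  c2 = trn2.2s(t0, t2);
+//   c1 = trn1.2s(t1, t3);  c3 = trn2.2s(t1, t3);   /* c0..c3 = columns */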
+
+// d4 =x0 1- 4 values
+// d5 =x2 1- 4 values
+// d12=x1 1- 4 values
+// d13=x3 1- 4 values
+
+// d18 =x0 5- 8 values
+// d19 =x2 5- 8 values
+// d20=x1 5- 8 values
+// d21=x3 5- 8 values
+
+// d22 =x0 9- 12 values
+// d23 =x2 9- 12 values
+// d30=x1 9- 12 values
+// d31=x3 9- 12 values
+
+// d14 =x0 13-16 values
+// d15 =x2 13- 16 values
+// d8=x1 13- 16 values
+// d9=x3 13- 16 values
+
+
+    st1         { v4.4h, v5.4h},[x1],#16
+    st1         { v12.4h, v13.4h},[x1],#16
+
+    st1         { v18.4h, v19.4h},[x1],#16
+    st1         { v20.4h, v21.4h},[x1],#16
+    st1         { v22.4h, v23.4h},[x1],#16
+    st1         { v30.4h, v31.4h},[x1],#16
+    st1         { v14.4h, v15.4h},[x1],#16
+    st1         { v8.4h, v9.4h},[x1],#16
+
+
+    subs        x14,x14,#1
+    bne         first_stage
+
+
+
+
+
+
+
+
+
+
+    mov         x6,x7
+
+    ldp         x8, x7,[sp],#16
+
+    mov         x10,#16
+
+    cmp         x12,x6
+    sub         x20,x1,#128
+    csel        x1, x20, x1,ge
+    bge         label1
+
+    mov         x19,#0xff00
+    cmp         x12,x19
+    sub         x20,x1,#256
+    csel        x1, x20, x1,ge
+    bge         label_2
+
+    sub         x1,x1,#512
+    sub         x20,x10,#0
+    neg         x10, x20
+
+label_2:
+    add         x9,x1,#128
+    add         x11,x9,#128
+    add         x0,x11,#128
+
+
+
+label1:
+//    mov   x6,x1
+
+
+    mov         x14,#4
+    add         x4,x2,x8, lsl #1            // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
+    add         x5,x8,x8, lsl #1            //
+//    add x0,x3,x7, lsl #1    @ x0 points to 3rd row of dest data
+//    add x10,x7,x7, lsl #1    @
+
+
+
+
+second_stage:
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v6.4h, v7.4h},[x1],x10
+    cmp         x12,x6
+    bge         second_stage_process
+    ld1         {v4.4h, v5.4h},[x9],#16
+    ld1         {v8.4h, v9.4h},[x9],x10
+
+second_stage_process:
+
+
+    smull       v24.4s, v6.4h, v0.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v6.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v7.4h, v2.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v7.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v7.4h, v2.4h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+    smull       v12.4s, v10.4h, v0.4h[0]
+    smlal       v12.4s, v11.4h, v0.4h[2]
+    smull       v14.4s, v10.4h, v0.4h[0]
+    smlal       v14.4s, v11.4h, v1.4h[2]
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v2.4h[2]
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v3.4h[2]
+
+    bge         skip_last8rows_stage2_kernel1
+
+    smlal       v24.4s, v8.4h, v1.4h[1]
+    smlal       v26.4s, v8.4h, v3.4h[3]
+    smlsl       v28.4s, v8.4h, v1.4h[3]
+    smlsl       v30.4s, v8.4h, v0.4h[3]
+
+
+    smlal       v24.4s, v9.4h, v1.4h[3]
+    smlsl       v26.4s, v9.4h, v2.4h[3]
+    smlsl       v28.4s, v9.4h, v0.4h[3]
+    smlal       v30.4s, v9.4h, v3.4h[3]
+
+
+    smlal       v12.4s, v4.4h, v1.4h[0]
+    smlal       v12.4s, v5.4h, v1.4h[2]
+    smlal       v14.4s, v4.4h, v3.4h[0]
+    smlsl       v14.4s, v5.4h, v3.4h[2]
+    smlsl       v16.4s, v4.4h, v3.4h[0]
+    smlsl       v16.4s, v5.4h, v0.4h[2]
+    smlsl       v18.4s, v4.4h, v1.4h[0]
+    smlsl       v18.4s, v5.4h, v2.4h[2]
+
+    mov         x19,#0xff00
+    cmp         x12,x19
+    bge         skip_last8rows_stage2_kernel1
+
+
+    ld1         {v10.4h, v11.4h},[x11],#16
+    ld1         {v6.4h, v7.4h},[x11],x10
+    ld1         {v4.4h, v5.4h},[x0],#16
+    ld1         {v8.4h, v9.4h},[x0],x10
+
+
+
+
+
+    smlal       v24.4s, v6.4h, v2.4h[1]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v6.4h, v1.4h[1]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v6.4h, v3.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v6.4h, v0.4h[1]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v7.4h, v2.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v7.4h, v0.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v7.4h, v2.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v7.4h, v3.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+    smlal       v24.4s, v8.4h, v3.4h[1]
+    smlsl       v26.4s, v8.4h, v1.4h[3]
+    smlal       v28.4s, v8.4h, v0.4h[1]
+    smlsl       v30.4s, v8.4h, v1.4h[1]
+
+
+    smlal       v24.4s, v9.4h, v3.4h[3]
+    smlsl       v26.4s, v9.4h, v3.4h[1]
+    smlal       v28.4s, v9.4h, v2.4h[3]
+    smlsl       v30.4s, v9.4h, v2.4h[1]
+
+
+
+
+
+    smlal       v12.4s, v10.4h, v0.4h[0]
+    smlal       v12.4s, v11.4h, v2.4h[2]
+    smlal       v12.4s, v4.4h, v3.4h[0]
+    smlal       v12.4s, v5.4h, v3.4h[2]
+
+
+
+
+    smlsl       v14.4s, v10.4h, v0.4h[0]
+    smlsl       v14.4s, v11.4h, v0.4h[2]
+    smlsl       v14.4s, v4.4h, v1.4h[0]
+    smlsl       v14.4s, v5.4h, v2.4h[2]
+
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v3.4h[2]
+    smlal       v16.4s, v4.4h, v1.4h[0]
+    smlal       v16.4s, v5.4h, v1.4h[2]
+
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v1.4h[2]
+    smlsl       v18.4s, v4.4h, v3.4h[0]
+    smlsl       v18.4s, v5.4h, v0.4h[2]
+
+
+
+
+
+
+skip_last8rows_stage2_kernel1:
+
+
+
+    add         v20.4s,  v12.4s ,  v24.4s
+    sub         v22.4s,  v12.4s ,  v24.4s
+
+    add         v12.4s,  v14.4s ,  v26.4s
+    sub         v24.4s,  v14.4s ,  v26.4s
+
+    add         v14.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+
+
+
+
+
+    sqrshrn     v30.4h, v20.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v19.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+    bge         skip_stage2_kernel_load
+
+    //q2,q4,q6,q7 is used
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v6.4h, v7.4h},[x1],#16
+    ld1         {v4.4h, v5.4h},[x9],#16
+    ld1         {v8.4h, v9.4h},[x9],#16
+skip_stage2_kernel_load:
+    sub         x1,x1,#32
+    st1         {v30.4h, v31.4h},[x1],#16
+    st1         {v18.4h, v19.4h},[x1],#16
+    sub         x1,x1,#32
+
+    smull       v24.4s, v6.4h, v2.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v6.4h, v2.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v6.4h, v3.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v6.4h, v3.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v7.4h, v1.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v7.4h, v0.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v7.4h, v1.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v7.4h, v3.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+    smull       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v3.4h[2]
+    smull       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v2.4h[2]
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlsl       v16.4s, v11.4h, v1.4h[2]
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlsl       v18.4s, v11.4h, v0.4h[2]
+
+
+
+    cmp         x12,x6
+    bge         skip_last8rows_stage2_kernel2
+
+
+    smlsl       v24.4s, v8.4h, v3.4h[1]
+    smlal       v26.4s, v8.4h, v2.4h[1]
+    smlal       v28.4s, v8.4h, v0.4h[1]
+    smlal       v30.4s, v8.4h, v2.4h[3]
+
+
+    smlal       v24.4s, v9.4h, v0.4h[1]
+    smlal       v26.4s, v9.4h, v3.4h[1]
+    smlsl       v28.4s, v9.4h, v1.4h[1]
+    smlsl       v30.4s, v9.4h, v2.4h[1]
+
+
+
+    smlsl       v22.4s, v4.4h, v1.4h[0]
+    smlal       v22.4s, v5.4h, v2.4h[2]
+    smlsl       v20.4s, v4.4h, v3.4h[0]
+    smlal       v20.4s, v5.4h, v0.4h[2]
+    smlal       v16.4s, v4.4h, v3.4h[0]
+    smlal       v16.4s, v5.4h, v3.4h[2]
+    smlal       v18.4s, v4.4h, v1.4h[0]
+    smlsl       v18.4s, v5.4h, v1.4h[2]
+    mov         x19,#0xff00
+    cmp         x12,x19
+    bge         skip_last8rows_stage2_kernel2
+
+    ld1         {v10.4h, v11.4h},[x11],#16
+    ld1         {v6.4h, v7.4h},[x11],#16
+    ld1         {v4.4h, v5.4h},[x0],#16
+    ld1         {v8.4h, v9.4h},[x0],#16
+
+    smlsl       v24.4s, v6.4h, v3.4h[3]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v6.4h, v2.4h[3]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v7.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v7.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v7.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+    smlal       v24.4s, v8.4h, v2.4h[3]
+    smlal       v26.4s, v8.4h, v3.4h[3]
+    smlsl       v28.4s, v8.4h, v2.4h[1]
+    smlal       v30.4s, v8.4h, v0.4h[3]
+
+
+    smlal       v24.4s, v9.4h, v1.4h[3]
+    smlsl       v26.4s, v9.4h, v1.4h[1]
+    smlal       v28.4s, v9.4h, v0.4h[3]
+    smlsl       v30.4s, v9.4h, v0.4h[1]
+
+
+
+
+    smlal       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v1.4h[2]
+    smlsl       v22.4s, v4.4h, v3.4h[0]
+    smlal       v22.4s, v5.4h, v0.4h[2]
+
+
+
+    smlsl       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v3.4h[2]
+    smlal       v20.4s, v4.4h, v1.4h[0]
+    smlsl       v20.4s, v5.4h, v1.4h[2]
+
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v0.4h[2]
+    smlsl       v16.4s, v4.4h, v1.4h[0]
+    smlal       v16.4s, v5.4h, v2.4h[2]
+
+
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlsl       v18.4s, v11.4h, v2.4h[2]
+    smlal       v18.4s, v4.4h, v3.4h[0]
+    smlsl       v18.4s, v5.4h, v3.4h[2]
+
+
+skip_last8rows_stage2_kernel2:
+
+
+
+    add         v4.4s,  v22.4s ,  v24.4s
+    sub         v22.4s,  v22.4s ,  v24.4s
+
+    add         v6.4s,  v20.4s ,  v26.4s
+    sub         v24.4s,  v20.4s ,  v26.4s
+
+    add         v10.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+    sqrshrn     v18.4h, v4.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v31.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v30.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v20.4h, v6.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v23.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v21.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn     v22.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+    ld1         {v4.4h, v5.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],#16
+
+
+
+    // registers used:    {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+//d4=x0
+//d12=x1
+//d5=x2
+//d13=x3
+
+//d18=x4
+//d20=x5
+//d19=x6
+//d21=x7
+
+//d22=x8
+//d30=x9
+//d23=x10
+//d31=x11
+
+//d14=x12
+//d8=x13
+//d15=x14
+//d9=x15
+
+    umov        x15,v26.d[0]
+    umov        x16,v27.d[0]
+    umov        x19,v28.d[0]
+    umov        x20,v29.d[0]
+
+    trn1        v26.4h, v4.4h, v12.4h
+    trn2        v27.4h, v4.4h, v12.4h
+    trn1        v28.4h, v5.4h, v13.4h
+    trn2        v29.4h, v5.4h, v13.4h
+
+    trn1        v4.2s, v26.2s, v28.2s
+    trn2        v5.2s, v26.2s, v28.2s
+    trn1        v12.2s, v27.2s, v29.2s
+    trn2        v13.2s, v27.2s, v29.2s
+
+    trn1        v26.4h, v18.4h, v20.4h
+    trn2        v27.4h, v18.4h, v20.4h
+    trn1        v28.4h, v19.4h, v21.4h
+    trn2        v29.4h, v19.4h, v21.4h
+
+    trn1        v18.2s, v26.2s, v28.2s
+    trn2        v19.2s, v26.2s, v28.2s
+    trn1        v20.2s, v27.2s, v29.2s
+    trn2        v21.2s, v27.2s, v29.2s
+
+    trn1        v26.4h, v22.4h, v30.4h
+    trn2        v27.4h, v22.4h, v30.4h
+    trn1        v28.4h, v23.4h, v31.4h
+    trn2        v29.4h, v23.4h, v31.4h
+
+    trn1        v22.2s, v26.2s, v28.2s
+    trn2        v23.2s, v26.2s, v28.2s
+    trn1        v30.2s, v27.2s, v29.2s
+    trn2        v31.2s, v27.2s, v29.2s
+
+    trn1        v26.4h, v14.4h, v8.4h
+    trn2        v27.4h, v14.4h, v8.4h
+    trn1        v28.4h, v15.4h, v9.4h
+    trn2        v29.4h, v15.4h, v9.4h
+
+    trn1        v14.2s, v26.2s, v28.2s
+    trn2        v15.2s, v26.2s, v28.2s
+    trn1        v8.2s, v27.2s, v29.2s
+    trn2        v9.2s, v27.2s, v29.2s
+
+    mov         v26.d[0],x15
+    mov         v27.d[0],x16
+    mov         v28.d[0],x19
+    mov         v29.d[0],x20
+
+// d4 =x0 1- 4 values
+// d5 =x2 1- 4 values
+// d12=x1 1- 4 values
+// d13=x3 1- 4 values
+
+// d18 =x0 5- 8 values
+// d19 =x2 5- 8 values
+// d20=x1 5- 8 values
+// d21=x3 5- 8 values
+
+// d22 =x0 9- 12 values
+// d23 =x2 9- 12 values
+// d30=x1 9- 12 values
+// d31=x3 9- 12 values
+
+// d14 =x0 13-16 values
+// d15 =x2 13- 16 values
+// d8=x1 13- 16 values
+// d9=x3 13- 16 values
+
+    // swapping v5 and v15
+    mov         v5.d[1],v5.d[0]
+    mov         v5.d[0],v18.d[0]
+    mov         v18.d[0],v5.d[1]
+    // swapping v23 and v14
+    mov         v23.d[1],v23.d[0]
+    mov         v23.d[0],v14.d[0]
+    mov         v14.d[0],v23.d[1]
+    // swapping v13 and v20
+    mov         v13.d[1],v13.d[0]
+    mov         v13.d[0],v20.d[0]
+    mov         v20.d[0],v13.d[1]
+    // swapping v31 and v8
+    mov         v31.d[1],v31.d[0]
+    mov         v31.d[0],v8.d[0]
+    mov         v8.d[0],v31.d[1]
+
+// q2: x0 1-8 values
+// q11: x0 9-16 values
+// q9 : x2 1-8 values
+// q7 : x2 9-16 values
+// q6 : x1 1- 8 values
+// q10: x3 1-8 values
+// q15: x1 9-16 values
+// q4:  x3 9-16 values
+
+
+//    registers free: q8,q14,q12,q13
+
+
+    ld1         {v16.8b, v17.8b},[x2],x8
+    ld1         {v28.8b, v29.8b},[x2],x5
+    ld1         {v24.8b, v25.8b},[x4],x8
+    ld1         {v26.8b, v27.8b},[x4],x5
+
+    mov         v4.d[1] ,v5.d[0]
+    mov         v22.d[1] ,v23.d[0]
+    mov         v12.d[1] ,v13.d[0]
+    mov         v30.d[1] ,v31.d[0]
+    mov         v18.d[1] ,v19.d[0]
+    mov         v14.d[1] ,v15.d[0]
+    mov         v20.d[1] ,v21.d[0]
+    mov         v8.d[1] ,v9.d[0]
+
+    uaddw       v4.8h,  v4.8h ,  v16.8b
+    uaddw       v22.8h,  v22.8h ,  v17.8b
+    uaddw       v12.8h,  v12.8h ,  v28.8b
+    uaddw       v30.8h,  v30.8h ,  v29.8b
+    uaddw       v18.8h,  v18.8h ,  v24.8b
+    uaddw       v14.8h,  v14.8h ,  v25.8b
+    uaddw       v20.8h,  v20.8h ,  v26.8b
+    uaddw       v8.8h,  v8.8h ,  v27.8b
+
+
+    sqxtun      v16.8b, v4.8h
+    sqxtun      v17.8b, v22.8h
+    sqxtun      v28.8b, v12.8h
+    sqxtun      v29.8b, v30.8h
+    sqxtun      v24.8b, v18.8h
+    sqxtun      v25.8b, v14.8h
+    sqxtun      v26.8b, v20.8h
+    sqxtun      v27.8b, v8.8h
+
+
+
+    st1         {v16.8b, v17.8b},[x3],x7
+    st1         {v28.8b, v29.8b},[x3],x7
+    st1         {v24.8b, v25.8b},[x3],x7
+    st1         {v26.8b, v27.8b},[x3],x7
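+
+// Reconstruction: the stage-2 residual (16-bit lanes) is widened onto the
+// prediction bytes and saturated back to 8 bit (uaddw + sqxtun above). Per
+// pixel, in C (illustrative assumption; clip_u8() is a hypothetical clamp):
+//
+//   pu1_dst[i] = clip_u8(pi2_residual[i] + pu1_pred[i]);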
+
+    subs        x14,x14,#1
+
+
+
+    bne         second_stage
+
+
+//    sub         sp,sp,#40
+    // ldmfd sp!,{x4-x12,pc}
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
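+
+// Overall flow (sketch): stage 1 applies the 16-point inverse transform to
+// pi2_src with rounding (x + (1 << 6)) >> 7 into pi2_tmp; stage 2 applies it
+// again to the transposed pi2_tmp with rounding (x + (1 << 11)) >> 12, then
+// adds the prediction and clips to 8 bit.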
+
+
+
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_itrans_recon_32x32.s b/common/arm64/ihevc_itrans_recon_32x32.s
new file mode 100644
index 0000000..6f40747
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_32x32.s
@@ -0,0 +1,3053 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// *  ihevc_itrans_recon_32x32.s
+// *
+// * @brief
+// *  contains function definitions for single stage  inverse transform
+// *
+// * @author
+// * anand s
+// *
+// * @par list of functions:
+// *  - ihevc_itrans_recon_32x32()
+// *
+// * @remarks
+// *  the input buffer is being corrupted
+// *
+// *******************************************************************************
+//*/
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// *  this function performs inverse transform  and reconstruction for 32x32
+// * input block
+// *
+// * @par description:
+// *  performs inverse transform and adds the prediction  data and clips output
+// * to 8 bit
+// *
+// * @param[in] pi2_src
+// *  input 32x32 coefficients
+// *
+// * @param[in] pi2_tmp
+// *  temporary 32x32 buffer for storing inverse
+// *  transform 1st stage output
+// *
+// * @param[in] pu1_pred
+// *  prediction 32x32 block
+// *
+// * @param[out] pu1_dst
+// *  output 32x32 block
+// *
+// * @param[in] src_strd
+// *  input stride
+// *
+// * @param[in] pred_strd
+// *  prediction stride
+// *
+// * @param[in] dst_strd
+// *  output stride
+// *
+// * @param[in] shift
+// *  output shift
+// *
+// * @param[in] x12
+// *  zero columns in pi2_src
+// *
+// * @returns  void
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+// */
+
+//void ihevc_itrans_recon_32x32(word16 *pi2_src,
+//                            word16 *pi2_tmp,
+//                            uword8 *pu1_pred,
+//                            uword8 *pu1_dst,
+//                            word32 src_strd,
+//                            word32 pred_strd,
+//                            word32 dst_strd,
+//                            word32 x12,
+//                            word32 x11)
+
+//**************variables vs registers*************************
+//    x0 => *pi2_src
+//    x1 => *pi2_tmp
+//    x2 => *pu1_pred
+//    x3 => *pu1_dst
+//    src_strd
+//    pred_strd
+//    dst_strd
+//    x12
+//    x11
+
+
+//d0[0]=    64        d2[0]=83
+//d0[1]= 90        d2[1]=82
+//d0[2]= 90        d2[2]=80
+//d0[3]= 90        d2[3]=78
+//d1[0]= 89         d3[0]=75
+//d1[1]= 88        d3[1]=73
+//d1[2]= 87        d3[2]=70
+//d1[3]= 85        d3[3]=67
+
+//d4[0]=    64        d6[0]=36
+//d4[1]= 61        d6[1]=31
+//d4[2]= 57        d6[2]=25
+//d4[3]= 54        d6[3]=22
+//d5[0]= 50         d7[0]=18
+//d5[1]= 46        d7[1]=13
+//d5[2]= 43        d7[2]=9
+//d5[3]= 38        d7[3]=4
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+
+.set shift_stage1_idct ,   7
+.set shift_stage2_idct ,   12
+
+//#define zero_cols      x12
+//#define zero_rows     x11
+
+.globl ihevc_itrans_recon_32x32_av8
+
+.extern g_ai2_ihevc_trans_32_transpose
+
+x5_addr: .word 0xfffff000
+x9_addr: .word 0xffff0000
+
+.type ihevc_itrans_recon_32x32_av8, %function
+
+ihevc_itrans_recon_32x32_av8:
+
+    ldr         w11, [sp]
+
+// stmfd sp!,{x0-x12,x14}
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    stp         x0, x1,[sp,#-16]!
+    stp         x5, x6,[sp,#-16]!
+
+//ldr            x8,[sp,#56]     @ prediction stride
+//ldr            x7,[sp,#64]     @ destination stride
+    mov         x6, x4 // src stride
+    mov         x12, x7
+    lsl         x6, x6, #1                  // x sizeof(word16)
+    add         x10,x6,x6, lsl #1           // 3 rows
+
+
+    mov         x8,x0
+
+    adrp        x14, :got:g_ai2_ihevc_trans_32_transpose
+    ldr         x14, [x14, #:got_lo12:g_ai2_ihevc_trans_32_transpose]
+
+    ld1         {v0.4h, v1.4h, v2.4h, v3.4h},[x14],#32
+    ld1         {v4.4h, v5.4h, v6.4h, v7.4h},[x14],#32
+
+//registers which are free
+//  x10,x9,x11,x12
+    mov         x9,#0xffffff00
+    mov         x10,#0xfffffff0
+    ldr         w5, x5_addr
+    ldr         w7, x9_addr
+    cmp         x12,x10
+    mov         x20,#1
+    csel        x14, x20, x14,hs
+    bhs         stage1
+
+
+    cmp         x12,x9
+    mov         x20,#2
+    csel        x14, x20, x14,hs
+    bhs         stage1
+
+    cmp         x12,x5
+    mov         x20,#3
+    csel        x14, x20, x14,hs
+    bhs         stage1
+
+    cmp         x12,x7
+    mov         x20,#4
+    csel        x14, x20, x14,hs
+    bhs         stage1
+
+    mov         x14,#8
+    b           stage1
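+
+// As in the 16x16 kernel, x14 counts the 4-column first-stage passes; the
+// ladder above maps the zero_cols mask to 1, 2, 3, 4 or 8 passes. Roughly,
+// in C (illustrative assumption):
+//
+//   if(zero_cols >= 0xfffffff0)      num_passes = 1;  /* cols 0-3 only  */
+//   else if(zero_cols >= 0xffffff00) num_passes = 2;  /* cols 0-7 only  */
+//   else if(zero_cols >= 0xfffff000) num_passes = 3;  /* cols 0-11 only */
+//   else if(zero_cols >= 0xffff0000) num_passes = 4;  /* cols 0-15 only */
+//   else                             num_passes = 8;  /* all 32 columns */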
+//.ltorg
+
+
+dct_stage1:
+    add         x8,x8,#8
+    mov         x0,x8
+
+stage1:
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+    smull       v24.4s, v8.4h, v0.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v8.4h, v0.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v8.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v8.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v9.4h, v2.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v5.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smull       v20.4s, v10.4h, v0.4h[0]
+    smlal       v20.4s, v11.4h, v0.4h[2]
+
+
+    smull       v22.4s, v10.4h, v0.4h[0]
+    smlal       v22.4s, v11.4h, v1.4h[2]
+
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v2.4h[2]
+
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v3.4h[2]
+    cmp         x11,x10
+    bhs         shift1
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+
+
+
+
+    smlal       v24.4s, v14.4h, v1.4h[1]
+    smlal       v26.4s, v14.4h, v3.4h[3]
+    smlal       v28.4s, v14.4h, v6.4h[1]
+    smlsl       v30.4s, v14.4h, v7.4h[1]
+
+
+    smlal       v24.4s, v15.4h, v1.4h[3]
+    smlal       v26.4s, v15.4h, v5.4h[1]
+    smlsl       v28.4s, v15.4h, v7.4h[1]
+    smlsl       v30.4s, v15.4h, v3.4h[3]
+
+
+    smlal       v20.4s, v12.4h, v1.4h[0]
+    smlal       v20.4s, v13.4h, v1.4h[2]
+    smlal       v22.4s, v12.4h, v3.4h[0]
+    smlal       v22.4s, v13.4h, v4.4h[2]
+    smlal       v16.4s, v12.4h, v5.4h[0]
+    smlal       v16.4s, v13.4h, v7.4h[2]
+    smlal       v18.4s, v12.4h, v7.4h[0]
+    smlsl       v18.4s, v13.4h, v5.4h[2]
+
+    cmp         x11,x9
+    bhs         shift1
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+    smlal       v24.4s, v8.4h, v2.4h[1]     //// y1 * cos1(part of b0)
+    smlal       v26.4s, v8.4h, v6.4h[3]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v4.4h[3]     //// y1 * sin3(part of b2)
+    smlsl       v30.4s, v8.4h, v0.4h[1]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v2.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v7.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v2.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v3.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v2.4h[0]
+    smlal       v20.4s, v11.4h, v2.4h[2]
+
+
+    smlal       v22.4s, v10.4h, v6.4h[0]
+    smlal       v22.4s, v11.4h, v7.4h[2]
+
+    smlsl       v16.4s, v10.4h, v6.4h[0]
+    smlsl       v16.4s, v11.4h, v3.4h[2]
+
+    smlsl       v18.4s, v10.4h, v2.4h[0]
+    smlsl       v18.4s, v11.4h, v1.4h[2]
+
+    cmp         x11,x5
+    bhs         shift1
+
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+
+
+
+
+
+
+    smlal       v24.4s, v14.4h, v3.4h[1]
+    smlsl       v26.4s, v14.4h, v6.4h[1]
+    smlsl       v28.4s, v14.4h, v0.4h[1]
+    smlsl       v30.4s, v14.4h, v6.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v3.4h[3]
+    smlsl       v26.4s, v15.4h, v4.4h[3]
+    smlsl       v28.4s, v15.4h, v2.4h[3]
+    smlal       v30.4s, v15.4h, v5.4h[3]
+
+
+    smlal       v20.4s, v12.4h, v3.4h[0]
+    smlal       v20.4s, v13.4h, v3.4h[2]
+    smlsl       v22.4s, v12.4h, v7.4h[0]
+    smlsl       v22.4s, v13.4h, v5.4h[2]
+    smlsl       v16.4s, v12.4h, v1.4h[0]
+    smlsl       v16.4s, v13.4h, v1.4h[2]
+    smlsl       v18.4s, v12.4h, v5.4h[0]
+    smlal       v18.4s, v13.4h, v7.4h[2]
+
+    cmp         x11,x7
+    bhs         shift1
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+
+    smlal       v24.4s, v8.4h, v4.4h[1]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v3.4h[1]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v5.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v2.4h[1]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v4.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v7.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v0.4h[0]
+    smlal       v20.4s, v11.4h, v4.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v2.4h[2]
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlsl       v16.4s, v11.4h, v6.4h[2]
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v0.4h[2]
+
+
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+
+    smlal       v24.4s, v14.4h, v5.4h[1]
+    smlsl       v26.4s, v14.4h, v0.4h[2]
+    smlal       v28.4s, v14.4h, v5.4h[3]
+    smlal       v30.4s, v14.4h, v4.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v5.4h[3]
+    smlsl       v26.4s, v15.4h, v1.4h[1]
+    smlal       v28.4s, v15.4h, v3.4h[1]
+    smlsl       v30.4s, v15.4h, v7.4h[3]
+
+
+    smlal       v20.4s, v12.4h, v5.4h[0]
+    smlal       v20.4s, v13.4h, v5.4h[2]
+    smlsl       v22.4s, v12.4h, v1.4h[0]
+    smlsl       v22.4s, v13.4h, v0.4h[2]
+    smlal       v16.4s, v12.4h, v7.4h[0]
+    smlal       v16.4s, v13.4h, v4.4h[2]
+    smlal       v18.4s, v12.4h, v3.4h[0]
+    smlal       v18.4s, v13.4h, v6.4h[2]
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+
+
+
+
+
+    smlal       v24.4s, v8.4h, v6.4h[1]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v2.4h[3]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v0.4h[1]     //// y1 * sin3(part of b2)
+    smlsl       v30.4s, v8.4h, v4.4h[1]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v6.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v4.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v1.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v0.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v6.4h[0]
+    smlal       v20.4s, v11.4h, v6.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v2.4h[0]
+    smlsl       v22.4s, v11.4h, v3.4h[2]
+
+    smlal       v16.4s, v10.4h, v2.4h[0]
+    smlal       v16.4s, v11.4h, v0.4h[2]
+
+    smlsl       v18.4s, v10.4h, v6.4h[0]
+    smlsl       v18.4s, v11.4h, v2.4h[2]
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+    smlal       v24.4s, v14.4h, v7.4h[1]
+    smlsl       v26.4s, v14.4h, v5.4h[3]
+    smlal       v28.4s, v14.4h, v4.4h[1]
+    smlsl       v30.4s, v14.4h, v2.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v7.4h[3]
+    smlsl       v26.4s, v15.4h, v7.4h[1]
+    smlal       v28.4s, v15.4h, v6.4h[3]
+    smlsl       v30.4s, v15.4h, v6.4h[1]
+
+
+    smlal       v20.4s, v12.4h, v7.4h[0]
+    smlal       v20.4s, v13.4h, v7.4h[2]
+    smlsl       v22.4s, v12.4h, v5.4h[0]
+    smlsl       v22.4s, v13.4h, v6.4h[2]
+    smlal       v16.4s, v12.4h, v3.4h[0]
+    smlal       v16.4s, v13.4h, v5.4h[2]
+    smlsl       v18.4s, v12.4h, v1.4h[0]
+    smlsl       v18.4s, v13.4h, v4.4h[2]
+
+
+
+shift1:
+    add         v8.4s,  v20.4s ,  v24.4s
+    sub         v10.4s,  v20.4s ,  v24.4s
+
+    add         v12.4s,  v22.4s ,  v26.4s
+    sub         v24.4s,  v22.4s ,  v26.4s
+
+    add         v14.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
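+    // butterfly + narrow: per lane, x = sat16((a + b + 64) >> 7) on the sums
+    // and sat16((a - b + 64) >> 7) on the differences; sqrshrn folds the
+    // rounding add of (1 << (shift_stage1_idct - 1)) into the shift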
+
+    // save v24-v27 to GPRs; they are reused as transpose temporaries below
+
+    umov        x15,v24.d[0]
+    umov        x16,v25.d[0]
+    umov        x19,v26.d[0]
+    umov        x20,v27.d[0]
+
+    trn1        v24.4h, v30.4h, v12.4h
+    trn2        v25.4h, v30.4h, v12.4h
+    trn1        v26.4h, v31.4h, v13.4h
+    trn2        v27.4h, v31.4h, v13.4h
+
+    trn1        v30.2s, v24.2s, v26.2s
+    trn2        v31.2s, v24.2s, v26.2s
+    trn1        v12.2s, v25.2s, v27.2s
+    trn2        v13.2s, v25.2s, v27.2s
+
+    trn1        v24.4h, v14.4h, v18.4h
+    trn2        v25.4h, v14.4h, v18.4h
+    trn1        v26.4h, v15.4h, v19.4h
+    trn2        v27.4h, v15.4h, v19.4h
+
+    trn1        v14.2s, v24.2s, v26.2s
+    trn2        v15.2s, v24.2s, v26.2s
+    trn1        v18.2s, v25.2s, v27.2s
+    trn2        v19.2s, v25.2s, v27.2s
+
+    mov         v24.d[0],x15
+    mov         v25.d[0],x16
+    mov         v26.d[0],x19
+    mov         v27.d[0],x20
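+
+    // the trn1/trn2 pairs above form a standard two-level 4x4 transpose of
+    // 16-bit lanes (interleave halfwords, then words); v24-v27 were saved
+    // to x15/x16/x19/x20 beforehand because they serve as temporaries here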
+
+// d30 = x0, values 1-4       d14 = x0, values 28-31
+// d31 = x2, values 1-4       d15 = x2, values 28-31
+// d12 = x1, values 1-4       d18 = x1, values 28-31
+// d13 = x3, values 1-4       d19 = x3, values 28-31
+
+
+
+    st1         { v30.4h, v31.4h},[x1],#16
+    st1         { v12.4h, v13.4h},[x1],#16
+    add         x1,x1,#192
+    st1         { v14.4h, v15.4h},[x1],#16
+    st1         { v18.4h, v19.4h},[x1],#16
+    sub         x1,x1,#224
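+
+    // values 1-4 of x0..x3 go to x1, then after a 192-byte skip the
+    // 28-31 values; the closing sub rewinds x1 so the net advance per
+    // 4-column pass through the temp buffer is 32 bytes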
+
+    mov         x0,x8
+
+
+
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+
+
+    smull       v24.4s, v8.4h, v2.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v8.4h, v2.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v8.4h, v3.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v8.4h, v3.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v6.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v7.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v6.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v4.4h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smull       v20.4s, v10.4h, v0.4h[0]
+    smlal       v20.4s, v11.4h, v4.4h[2]
+
+
+    smull       v22.4s, v10.4h, v0.4h[0]
+    smlal       v22.4s, v11.4h, v5.4h[2]
+
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v6.4h[2]
+
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v7.4h[2]
+    cmp         x11,x10
+    bhs         shift2
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+    smlsl       v24.4s, v14.4h, v4.4h[3]
+    smlsl       v26.4s, v14.4h, v2.4h[1]
+    smlsl       v28.4s, v14.4h, v0.4h[1]
+    smlsl       v30.4s, v14.4h, v2.4h[3]
+
+
+    smlsl       v24.4s, v15.4h, v0.4h[3]
+    smlsl       v26.4s, v15.4h, v3.4h[1]
+    smlsl       v28.4s, v15.4h, v6.4h[3]
+    smlal       v30.4s, v15.4h, v5.4h[3]
+
+
+    smlsl       v20.4s, v12.4h, v7.4h[0]
+    smlsl       v20.4s, v13.4h, v2.4h[2]
+    smlsl       v22.4s, v12.4h, v5.4h[0]
+    smlsl       v22.4s, v13.4h, v0.4h[2]
+    smlsl       v16.4s, v12.4h, v3.4h[0]
+    smlsl       v16.4s, v13.4h, v3.4h[2]
+    smlsl       v18.4s, v12.4h, v1.4h[0]
+    smlsl       v18.4s, v13.4h, v6.4h[2]
+
+    cmp         x11,x9
+    bhs         shift2
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+
+
+
+
+
+    smlsl       v24.4s, v8.4h, v4.4h[1]     //// y1 * cos1(part of b0)
+    smlal       v26.4s, v8.4h, v7.4h[1]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v2.4h[3]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v7.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v9.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v6.4h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlsl       v20.4s, v10.4h, v2.4h[0]
+    smlsl       v20.4s, v11.4h, v6.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v6.4h[0]
+    smlal       v22.4s, v11.4h, v4.4h[2]
+
+    smlal       v16.4s, v10.4h, v6.4h[0]
+    smlal       v16.4s, v11.4h, v0.4h[2]
+
+    smlal       v18.4s, v10.4h, v2.4h[0]
+    smlal       v18.4s, v11.4h, v5.4h[2]
+
+    cmp         x11,x5
+    bhs         shift2
+
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+
+
+    smlal       v24.4s, v14.4h, v2.4h[3]
+    smlal       v26.4s, v14.4h, v3.4h[3]
+    smlsl       v28.4s, v14.4h, v5.4h[3]
+    smlsl       v30.4s, v14.4h, v0.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v1.4h[3]
+    smlsl       v26.4s, v15.4h, v6.4h[3]
+    smlsl       v28.4s, v15.4h, v0.4h[3]
+    smlal       v30.4s, v15.4h, v7.4h[3]
+
+
+    smlal       v20.4s, v12.4h, v5.4h[0]
+    smlal       v20.4s, v13.4h, v0.4h[2]
+    smlal       v22.4s, v12.4h, v1.4h[0]
+    smlal       v22.4s, v13.4h, v6.4h[2]
+    smlal       v16.4s, v12.4h, v7.4h[0]
+    smlsl       v16.4s, v13.4h, v2.4h[2]
+    smlsl       v18.4s, v12.4h, v3.4h[0]
+    smlsl       v18.4s, v13.4h, v4.4h[2]
+
+
+    cmp         x11,x7
+    bhs         shift2
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+
+
+
+
+
+    smlal       v24.4s, v8.4h, v6.4h[1]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v1.4h[1]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v7.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v0.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v5.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v4.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v2.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v7.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v7.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v1.4h[2]
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v5.4h[2]
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v3.4h[2]
+
+
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+    smlsl       v24.4s, v14.4h, v0.4h[1]
+    smlal       v26.4s, v14.4h, v6.4h[1]
+    smlal       v28.4s, v14.4h, v4.4h[1]
+    smlsl       v30.4s, v14.4h, v1.4h[1]
+
+
+    smlsl       v24.4s, v15.4h, v3.4h[3]
+    smlal       v26.4s, v15.4h, v0.4h[1]
+    smlsl       v28.4s, v15.4h, v5.4h[1]
+    smlsl       v30.4s, v15.4h, v6.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v3.4h[0]
+    smlsl       v20.4s, v13.4h, v1.4h[2]
+    smlsl       v22.4s, v12.4h, v7.4h[0]
+    smlal       v22.4s, v13.4h, v3.4h[2]
+    smlal       v16.4s, v12.4h, v1.4h[0]
+    smlal       v16.4s, v13.4h, v7.4h[2]
+    smlsl       v18.4s, v12.4h, v5.4h[0]
+    smlsl       v18.4s, v13.4h, v2.4h[2]
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+
+
+    smlal       v24.4s, v8.4h, v7.4h[3]     //// y1 * cos1(part of b0)
+    smlal       v26.4s, v8.4h, v4.4h[3]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v2.4h[1]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v3.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v5.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v7.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v5.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlsl       v20.4s, v10.4h, v6.4h[0]
+    smlal       v20.4s, v11.4h, v5.4h[2]
+
+
+    smlal       v22.4s, v10.4h, v2.4h[0]
+    smlal       v22.4s, v11.4h, v7.4h[2]
+
+    smlsl       v16.4s, v10.4h, v2.4h[0]
+    smlsl       v16.4s, v11.4h, v4.4h[2]
+
+    smlal       v18.4s, v10.4h, v6.4h[0]
+    smlal       v18.4s, v11.4h, v1.4h[2]
+
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+
+
+    smlal       v24.4s, v14.4h, v1.4h[1]
+    smlsl       v26.4s, v14.4h, v0.4h[3]
+    smlal       v28.4s, v14.4h, v1.4h[3]
+    smlsl       v30.4s, v14.4h, v3.4h[1]
+
+
+    smlal       v24.4s, v15.4h, v5.4h[3]
+    smlsl       v26.4s, v15.4h, v5.4h[1]
+    smlal       v28.4s, v15.4h, v4.4h[3]
+    smlsl       v30.4s, v15.4h, v4.4h[1]
+
+
+    smlal       v20.4s, v12.4h, v1.4h[0]
+    smlal       v20.4s, v13.4h, v3.4h[2]
+    smlsl       v22.4s, v12.4h, v3.4h[0]
+    smlsl       v22.4s, v13.4h, v2.4h[2]
+    smlal       v16.4s, v12.4h, v5.4h[0]
+    smlal       v16.4s, v13.4h, v1.4h[2]
+    smlsl       v18.4s, v12.4h, v7.4h[0]
+    smlsl       v18.4s, v13.4h, v0.4h[2]
+
+shift2:
+    add         v8.4s,  v20.4s ,  v24.4s
+    sub         v10.4s,  v20.4s ,  v24.4s
+
+    add         v12.4s,  v22.4s ,  v26.4s
+    sub         v24.4s,  v22.4s ,  v26.4s
+
+    add         v14.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+    umov        x15,v24.d[0]
+    umov        x16,v25.d[0]
+    umov        x19,v26.d[0]
+    umov        x20,v27.d[0]
+
+    trn1        v24.4h, v30.4h, v12.4h
+    trn2        v25.4h, v30.4h, v12.4h
+    trn1        v26.4h, v31.4h, v13.4h
+    trn2        v27.4h, v31.4h, v13.4h
+
+    trn1        v30.2s, v24.2s, v26.2s
+    trn2        v31.2s, v24.2s, v26.2s
+    trn1        v12.2s, v25.2s, v27.2s
+    trn2        v13.2s, v25.2s, v27.2s
+
+    trn1        v24.4h, v14.4h, v18.4h
+    trn2        v25.4h, v14.4h, v18.4h
+    trn1        v26.4h, v15.4h, v19.4h
+    trn2        v27.4h, v15.4h, v19.4h
+
+    trn1        v14.2s, v24.2s, v26.2s
+    trn2        v15.2s, v24.2s, v26.2s
+    trn1        v18.2s, v25.2s, v27.2s
+    trn2        v19.2s, v25.2s, v27.2s
+
+    mov         v24.d[0],x15
+    mov         v25.d[0],x16
+    mov         v26.d[0],x19
+    mov         v27.d[0],x20
+
+    st1         { v30.4h, v31.4h},[x1],#16
+    st1         { v12.4h, v13.4h},[x1],#16
+    add         x1,x1,#128
+    st1         { v14.4h, v15.4h},[x1],#16
+    st1         { v18.4h, v19.4h},[x1],#16
+    sub         x1,x1,#160
+    mov         x0,x8
+
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+    smull       v24.4s, v8.4h, v4.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v8.4h, v4.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v8.4h, v5.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v8.4h, v5.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v3.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v0.4h[2]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smull       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v7.4h[2]
+
+
+    smull       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v6.4h[2]
+
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlsl       v16.4s, v11.4h, v5.4h[2]
+
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlsl       v18.4s, v11.4h, v4.4h[2]
+
+    cmp         x11,x10
+    bhs         shift3
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+
+    smlsl       v24.4s, v14.4h, v5.4h[1]
+    smlsl       v26.4s, v14.4h, v7.4h[3]
+    smlal       v28.4s, v14.4h, v5.4h[3]
+    smlal       v30.4s, v14.4h, v3.4h[1]
+
+
+    smlal       v24.4s, v15.4h, v2.4h[1]
+    smlal       v26.4s, v15.4h, v1.4h[1]
+    smlal       v28.4s, v15.4h, v4.4h[3]
+    smlsl       v30.4s, v15.4h, v7.4h[3]
+
+
+    smlsl       v20.4s, v12.4h, v1.4h[0]
+    smlal       v20.4s, v13.4h, v6.4h[2]
+    smlsl       v22.4s, v12.4h, v3.4h[0]
+    smlal       v22.4s, v13.4h, v3.4h[2]
+    smlsl       v16.4s, v12.4h, v5.4h[0]
+    smlal       v16.4s, v13.4h, v0.4h[2]
+    smlsl       v18.4s, v12.4h, v7.4h[0]
+    smlal       v18.4s, v13.4h, v2.4h[2]
+
+    cmp         x11,x9
+    bhs         shift3
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+    smlal       v24.4s, v8.4h, v6.4h[1]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v5.4h[1]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v0.4h[3]     //// y1 * sin3(part of b2)
+    smlsl       v30.4s, v8.4h, v3.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v1.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v4.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v6.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v0.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v2.4h[0]
+    smlsl       v20.4s, v11.4h, v5.4h[2]
+
+
+    smlal       v22.4s, v10.4h, v6.4h[0]
+    smlsl       v22.4s, v11.4h, v0.4h[2]
+
+    smlsl       v16.4s, v10.4h, v6.4h[0]
+    smlsl       v16.4s, v11.4h, v4.4h[2]
+
+    smlsl       v18.4s, v10.4h, v2.4h[0]
+    smlal       v18.4s, v11.4h, v6.4h[2]
+
+    cmp         x11,x5
+    bhs         shift3
+
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+
+
+
+    smlsl       v24.4s, v14.4h, v7.4h[1]
+    smlal       v26.4s, v14.4h, v2.4h[1]
+    smlal       v28.4s, v14.4h, v4.4h[1]
+    smlsl       v30.4s, v14.4h, v5.4h[1]
+
+
+    smlal       v24.4s, v15.4h, v0.4h[3]
+    smlal       v26.4s, v15.4h, v7.4h[1]
+    smlsl       v28.4s, v15.4h, v1.4h[1]
+    smlsl       v30.4s, v15.4h, v6.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v3.4h[0]
+    smlal       v20.4s, v13.4h, v4.4h[2]
+    smlal       v22.4s, v12.4h, v7.4h[0]
+    smlal       v22.4s, v13.4h, v2.4h[2]
+    smlal       v16.4s, v12.4h, v1.4h[0]
+    smlsl       v16.4s, v13.4h, v6.4h[2]
+    smlal       v18.4s, v12.4h, v5.4h[0]
+    smlsl       v18.4s, v13.4h, v0.4h[2]
+
+
+    cmp         x11,x7
+    bhs         shift3
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+    smlsl       v24.4s, v8.4h, v7.4h[3]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v0.4h[1]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v6.4h[3]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v0.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v9.4h, v5.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v2.4h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v3.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v5.4h[2]
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v1.4h[2]
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v7.4h[2]
+
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+    smlal       v24.4s, v14.4h, v6.4h[3]
+    smlal       v26.4s, v14.4h, v3.4h[3]
+    smlsl       v28.4s, v14.4h, v1.4h[3]
+    smlal       v30.4s, v14.4h, v7.4h[1]
+
+
+    smlal       v24.4s, v15.4h, v1.4h[3]
+    smlsl       v26.4s, v15.4h, v2.4h[3]
+    smlal       v28.4s, v15.4h, v7.4h[1]
+    smlal       v30.4s, v15.4h, v4.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v5.4h[0]
+    smlal       v20.4s, v13.4h, v2.4h[2]
+    smlal       v22.4s, v12.4h, v1.4h[0]
+    smlsl       v22.4s, v13.4h, v7.4h[2]
+    smlsl       v16.4s, v12.4h, v7.4h[0]
+    smlsl       v16.4s, v13.4h, v3.4h[2]
+    smlsl       v18.4s, v12.4h, v3.4h[0]
+    smlal       v18.4s, v13.4h, v1.4h[2]
+
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+
+
+    smlsl       v24.4s, v8.4h, v5.4h[3]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v6.4h[3]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v3.4h[1]     //// y1 * sin3(part of b2)
+    smlsl       v30.4s, v8.4h, v0.4h[1]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v2.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v9.4h, v0.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v2.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v4.4h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v6.4h[0]
+    smlsl       v20.4s, v11.4h, v1.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v2.4h[0]
+    smlal       v22.4s, v11.4h, v4.4h[2]
+
+    smlal       v16.4s, v10.4h, v2.4h[0]
+    smlsl       v16.4s, v11.4h, v7.4h[2]
+
+    smlsl       v18.4s, v10.4h, v6.4h[0]
+    smlsl       v18.4s, v11.4h, v5.4h[2]
+
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+    smlal       v24.4s, v14.4h, v4.4h[3]
+    smlsl       v26.4s, v14.4h, v6.4h[1]
+    smlal       v28.4s, v14.4h, v7.4h[3]
+    smlal       v30.4s, v14.4h, v6.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v3.4h[3]
+    smlsl       v26.4s, v15.4h, v3.4h[1]
+    smlal       v28.4s, v15.4h, v2.4h[3]
+    smlsl       v30.4s, v15.4h, v2.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v7.4h[0]
+    smlal       v20.4s, v13.4h, v0.4h[2]
+    smlal       v22.4s, v12.4h, v5.4h[0]
+    smlsl       v22.4s, v13.4h, v1.4h[2]
+    smlsl       v16.4s, v12.4h, v3.4h[0]
+    smlal       v16.4s, v13.4h, v2.4h[2]
+    smlal       v18.4s, v12.4h, v1.4h[0]
+    smlsl       v18.4s, v13.4h, v3.4h[2]
+
+shift3:
+    add         v8.4s,  v20.4s ,  v24.4s
+    sub         v10.4s,  v20.4s ,  v24.4s
+
+    add         v12.4s,  v22.4s ,  v26.4s
+    sub         v24.4s,  v22.4s ,  v26.4s
+
+    add         v14.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+    umov        x15,v24.d[0]
+    umov        x16,v25.d[0]
+    umov        x19,v26.d[0]
+    umov        x20,v27.d[0]
+
+    trn1        v24.4h, v30.4h, v12.4h
+    trn2        v25.4h, v30.4h, v12.4h
+    trn1        v26.4h, v31.4h, v13.4h
+    trn2        v27.4h, v31.4h, v13.4h
+
+    trn1        v30.2s, v24.2s, v26.2s
+    trn2        v31.2s, v24.2s, v26.2s
+    trn1        v12.2s, v25.2s, v27.2s
+    trn2        v13.2s, v25.2s, v27.2s
+
+    trn1        v24.4h, v14.4h, v18.4h
+    trn2        v25.4h, v14.4h, v18.4h
+    trn1        v26.4h, v15.4h, v19.4h
+    trn2        v27.4h, v15.4h, v19.4h
+
+    trn1        v14.2s, v24.2s, v26.2s
+    trn2        v15.2s, v24.2s, v26.2s
+    trn1        v18.2s, v25.2s, v27.2s
+    trn2        v19.2s, v25.2s, v27.2s
+
+    mov         v24.d[0],x15
+    mov         v25.d[0],x16
+    mov         v26.d[0],x19
+    mov         v27.d[0],x20
+    st1         { v30.4h, v31.4h},[x1],#16
+    st1         { v12.4h, v13.4h},[x1],#16
+    add         x1,x1,#64
+    st1         { v14.4h, v15.4h},[x1],#16
+    st1         { v18.4h, v19.4h},[x1],#16
+    sub         x1,x1,#96
+
+    mov         x0,x8
+
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+    smull       v24.4s, v8.4h, v6.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v8.4h, v6.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v8.4h, v7.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v8.4h, v7.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v2.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v4.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v5.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v7.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smull       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v3.4h[2]
+
+
+    smull       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v2.4h[2]
+
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlsl       v16.4s, v11.4h, v1.4h[2]
+
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlsl       v18.4s, v11.4h, v0.4h[2]
+
+    cmp         x11,x10
+    bhs         shift4
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+
+
+
+    smlal       v24.4s, v14.4h, v0.4h[1]
+    smlal       v26.4s, v14.4h, v1.4h[3]
+    smlal       v28.4s, v14.4h, v4.4h[1]
+    smlal       v30.4s, v14.4h, v6.4h[3]
+
+
+    smlsl       v24.4s, v15.4h, v4.4h[1]
+    smlsl       v26.4s, v15.4h, v0.4h[3]
+    smlsl       v28.4s, v15.4h, v2.4h[3]
+    smlsl       v30.4s, v15.4h, v6.4h[1]
+
+
+    smlal       v20.4s, v12.4h, v7.4h[0]
+    smlal       v20.4s, v13.4h, v5.4h[2]
+    smlal       v22.4s, v12.4h, v5.4h[0]
+    smlsl       v22.4s, v13.4h, v7.4h[2]
+    smlal       v16.4s, v12.4h, v3.4h[0]
+    smlsl       v16.4s, v13.4h, v4.4h[2]
+    smlal       v18.4s, v12.4h, v1.4h[0]
+    smlsl       v18.4s, v13.4h, v1.4h[2]
+
+    cmp         x11,x9
+    bhs         shift4
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+
+    smlal       v24.4s, v8.4h, v7.4h[3]     //// y1 * cos1(part of b0)
+    smlal       v26.4s, v8.4h, v3.4h[1]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v5.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v4.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v5.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v5.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlsl       v20.4s, v10.4h, v2.4h[0]
+    smlal       v20.4s, v11.4h, v1.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v6.4h[0]
+    smlal       v22.4s, v11.4h, v3.4h[2]
+
+    smlal       v16.4s, v10.4h, v6.4h[0]
+    smlsl       v16.4s, v11.4h, v7.4h[2]
+
+    smlal       v18.4s, v10.4h, v2.4h[0]
+    smlsl       v18.4s, v11.4h, v2.4h[2]
+
+    cmp         x11,x5
+    bhs         shift4
+
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+
+
+
+    smlsl       v24.4s, v14.4h, v1.4h[1]
+    smlsl       v26.4s, v14.4h, v7.4h[3]
+    smlal       v28.4s, v14.4h, v1.4h[3]
+    smlal       v30.4s, v14.4h, v4.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v2.4h[1]
+    smlal       v26.4s, v15.4h, v5.4h[1]
+    smlsl       v28.4s, v15.4h, v3.4h[1]
+    smlsl       v30.4s, v15.4h, v4.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v5.4h[0]
+    smlsl       v20.4s, v13.4h, v7.4h[2]
+    smlsl       v22.4s, v12.4h, v1.4h[0]
+    smlal       v22.4s, v13.4h, v1.4h[2]
+    smlsl       v16.4s, v12.4h, v7.4h[0]
+    smlal       v16.4s, v13.4h, v5.4h[2]
+    smlal       v18.4s, v12.4h, v3.4h[0]
+    smlsl       v18.4s, v13.4h, v3.4h[2]
+
+    cmp         x11,x7
+    bhs         shift4
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+    smlsl       v24.4s, v8.4h, v5.4h[3]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v2.4h[3]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v4.4h[3]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v3.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v6.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v9.4h, v0.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v6.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v3.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v0.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v0.4h[0]
+    smlal       v22.4s, v11.4h, v6.4h[2]
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v2.4h[2]
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlsl       v18.4s, v11.4h, v4.4h[2]
+
+
+
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+
+
+
+    smlal       v24.4s, v14.4h, v3.4h[1]
+    smlsl       v26.4s, v14.4h, v2.4h[1]
+    smlal       v28.4s, v14.4h, v7.4h[3]
+    smlal       v30.4s, v14.4h, v2.4h[3]
+
+
+    smlsl       v24.4s, v15.4h, v0.4h[3]
+    smlal       v26.4s, v15.4h, v4.4h[3]
+    smlal       v28.4s, v15.4h, v6.4h[3]
+    smlsl       v30.4s, v15.4h, v2.4h[1]
+
+
+    smlal       v20.4s, v12.4h, v3.4h[0]
+    smlsl       v20.4s, v13.4h, v6.4h[2]
+    smlal       v22.4s, v12.4h, v7.4h[0]
+    smlsl       v22.4s, v13.4h, v4.4h[2]
+    smlsl       v16.4s, v12.4h, v1.4h[0]
+    smlal       v16.4s, v13.4h, v0.4h[2]
+    smlal       v18.4s, v12.4h, v5.4h[0]
+    smlsl       v18.4s, v13.4h, v5.4h[2]
+
+
+    ld1         {v10.4h},[x0],x6
+    ld1         {v8.4h},[x0],x6
+    ld1         {v11.4h},[x0],x6
+    ld1         {v9.4h},[x0],x6
+
+
+
+
+
+    smlal       v24.4s, v8.4h, v3.4h[3]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v7.4h[1]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v5.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v7.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v6.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlsl       v20.4s, v10.4h, v6.4h[0]
+    smlal       v20.4s, v11.4h, v2.4h[2]
+
+
+    smlal       v22.4s, v10.4h, v2.4h[0]
+    smlsl       v22.4s, v11.4h, v0.4h[2]
+
+    smlsl       v16.4s, v10.4h, v2.4h[0]
+    smlal       v16.4s, v11.4h, v3.4h[2]
+
+    smlal       v18.4s, v10.4h, v6.4h[0]
+    smlsl       v18.4s, v11.4h, v6.4h[2]
+
+
+    ld1         {v12.4h},[x0],x6
+    ld1         {v14.4h},[x0],x6
+    ld1         {v13.4h},[x0],x6
+    ld1         {v15.4h},[x0],x6
+
+
+
+
+    smlsl       v24.4s, v14.4h, v5.4h[1]
+    smlal       v26.4s, v14.4h, v3.4h[3]
+    smlsl       v28.4s, v14.4h, v2.4h[1]
+    smlal       v30.4s, v14.4h, v0.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v1.4h[3]
+    smlsl       v26.4s, v15.4h, v1.4h[1]
+    smlal       v28.4s, v15.4h, v0.4h[3]
+    smlsl       v30.4s, v15.4h, v0.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v1.4h[0]
+    smlal       v20.4s, v13.4h, v4.4h[2]
+    smlal       v22.4s, v12.4h, v3.4h[0]
+    smlsl       v22.4s, v13.4h, v5.4h[2]
+    smlsl       v16.4s, v12.4h, v5.4h[0]
+    smlal       v16.4s, v13.4h, v6.4h[2]
+    smlal       v18.4s, v12.4h, v7.4h[0]
+    smlsl       v18.4s, v13.4h, v7.4h[2]
+
+shift4:
+    add         v8.4s,  v20.4s ,  v24.4s
+    sub         v10.4s,  v20.4s ,  v24.4s
+
+    add         v12.4s,  v22.4s ,  v26.4s
+    sub         v24.4s,  v22.4s ,  v26.4s
+
+    add         v14.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+    umov        x15,v24.d[0]
+    umov        x16,v25.d[0]
+    umov        x19,v26.d[0]
+    umov        x20,v27.d[0]
+
+    trn1        v24.4h, v30.4h, v12.4h
+    trn2        v25.4h, v30.4h, v12.4h
+    trn1        v26.4h, v31.4h, v13.4h
+    trn2        v27.4h, v31.4h, v13.4h
+
+    trn1        v30.2s, v24.2s, v26.2s
+    trn2        v31.2s, v24.2s, v26.2s
+    trn1        v12.2s, v25.2s, v27.2s
+    trn2        v13.2s, v25.2s, v27.2s
+
+    trn1        v24.4h, v14.4h, v18.4h
+    trn2        v25.4h, v14.4h, v18.4h
+    trn1        v26.4h, v15.4h, v19.4h
+    trn2        v27.4h, v15.4h, v19.4h
+
+    trn1        v14.2s, v24.2s, v26.2s
+    trn2        v15.2s, v24.2s, v26.2s
+    trn1        v18.2s, v25.2s, v27.2s
+    trn2        v19.2s, v25.2s, v27.2s
+
+    mov         v24.d[0],x15
+    mov         v25.d[0],x16
+    mov         v26.d[0],x19
+    mov         v27.d[0],x20
+
+    st1         { v30.4h, v31.4h},[x1],#16
+    st1         { v12.4h, v13.4h},[x1],#16
+    st1         { v14.4h, v15.4h},[x1],#16
+    st1         { v18.4h, v19.4h},[x1],#16
+
+    add         x1,x1,#96
+
+    subs        x14,x14,#1
+    bne         dct_stage1
+second_stage_dct:
+//    mov        x0,x1
+    ldp         x8, x7,[sp],#16
+    ldp         x0, x1,[sp],#16
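+    // the last-pushed pair (x5/x6: pred/dst strides) pops first into x8/x7;
+    // the second ldp restores the original x0/x1, and x0 (the source
+    // buffer) is reused below as stage-2 scratch output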
+
+//    add x4,x2,x8, lsl #1    @ x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
+//    add x5,x8,x8, lsl #1    @
+//    sub   x0,x0,#512
+    mov         x11,#0xfffffff0
+    mov         x5, #0xffffff00
+    ldr         w6, x5_addr
+    ldr         w9, x9_addr
+//    sub         x1,x1,#2048
+    mov         x4,x1
+    mov         x10,#240
+    mov         x14,#8
+    b           stage2
+
+// register usage in stage 2:
+// x8  : prediction stride
+// x7  : destination stride
+// x1  : temp buffer
+// x2  : pred buffer
+// x3  : destination buffer
+// x14 : loop counter
+// x0  : scratch buffer
+// x10 : used as stride
+// x4  : used to store the initial address
+// x12 : zero cols
+// x11 : 0xfffffff0
+// x5  : 0xffffff00
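+
+// stage 2 walks the transposed stage-1 output in the temp buffer: each
+// pair of ld1s below reads 32 bytes and advances x1 by 16 + 240 (x10) =
+// 256 bytes, and the same zero_cols value (x12) drives the stage2_shift
+// early-outs against the masks now held in x11/x5/x6/x9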
+dct_stage2:
+    add         x4,x4,#32
+    mov         x1,x4
+stage2:
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+    smull       v24.4s, v8.4h, v0.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v8.4h, v0.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v8.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v8.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v9.4h, v2.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v5.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+    smull       v20.4s, v10.4h, v0.4h[0]
+    smlal       v20.4s, v11.4h, v0.4h[2]
+
+
+    smull       v22.4s, v10.4h, v0.4h[0]
+    smlal       v22.4s, v11.4h, v1.4h[2]
+
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v2.4h[2]
+
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v3.4h[2]
+    cmp         x12,x11
+    bhs         stage2_shift1
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+
+    smlal       v24.4s, v14.4h, v1.4h[1]
+    smlal       v26.4s, v14.4h, v3.4h[3]
+    smlal       v28.4s, v14.4h, v6.4h[1]
+    smlsl       v30.4s, v14.4h, v7.4h[1]
+
+
+    smlal       v24.4s, v15.4h, v1.4h[3]
+    smlal       v26.4s, v15.4h, v5.4h[1]
+    smlsl       v28.4s, v15.4h, v7.4h[1]
+    smlsl       v30.4s, v15.4h, v3.4h[3]
+
+
+    smlal       v20.4s, v12.4h, v1.4h[0]
+    smlal       v20.4s, v13.4h, v1.4h[2]
+    smlal       v22.4s, v12.4h, v3.4h[0]
+    smlal       v22.4s, v13.4h, v4.4h[2]
+    smlal       v16.4s, v12.4h, v5.4h[0]
+    smlal       v16.4s, v13.4h, v7.4h[2]
+    smlal       v18.4s, v12.4h, v7.4h[0]
+    smlsl       v18.4s, v13.4h, v5.4h[2]
+    cmp         x12,x5
+    bhs         stage2_shift1
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+    smlal       v24.4s, v8.4h, v2.4h[1]     //// y1 * cos1(part of b0)
+    smlal       v26.4s, v8.4h, v6.4h[3]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v4.4h[3]     //// y1 * sin3(part of b2)
+    smlsl       v30.4s, v8.4h, v0.4h[1]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v2.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v7.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v2.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v3.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v2.4h[0]
+    smlal       v20.4s, v11.4h, v2.4h[2]
+
+
+    smlal       v22.4s, v10.4h, v6.4h[0]
+    smlal       v22.4s, v11.4h, v7.4h[2]
+
+    smlsl       v16.4s, v10.4h, v6.4h[0]
+    smlsl       v16.4s, v11.4h, v3.4h[2]
+
+    smlsl       v18.4s, v10.4h, v2.4h[0]
+    smlsl       v18.4s, v11.4h, v1.4h[2]
+
+    cmp         x12,x6
+    bhs         stage2_shift1
+
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+    smlal       v24.4s, v14.4h, v3.4h[1]
+    smlsl       v26.4s, v14.4h, v6.4h[1]
+    smlsl       v28.4s, v14.4h, v0.4h[1]
+    smlsl       v30.4s, v14.4h, v6.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v3.4h[3]
+    smlsl       v26.4s, v15.4h, v4.4h[3]
+    smlsl       v28.4s, v15.4h, v2.4h[3]
+    smlal       v30.4s, v15.4h, v5.4h[3]
+
+
+    smlal       v20.4s, v12.4h, v3.4h[0]
+    smlal       v20.4s, v13.4h, v3.4h[2]
+    smlsl       v22.4s, v12.4h, v7.4h[0]
+    smlsl       v22.4s, v13.4h, v5.4h[2]
+    smlsl       v16.4s, v12.4h, v1.4h[0]
+    smlsl       v16.4s, v13.4h, v1.4h[2]
+    smlsl       v18.4s, v12.4h, v5.4h[0]
+    smlal       v18.4s, v13.4h, v7.4h[2]
+
+    cmp         x12,x9
+    bhs         stage2_shift1
+
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+    smlal       v24.4s, v8.4h, v4.4h[1]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v3.4h[1]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v5.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v2.4h[1]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v4.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v7.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v0.4h[0]
+    smlal       v20.4s, v11.4h, v4.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v2.4h[2]
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlsl       v16.4s, v11.4h, v6.4h[2]
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v0.4h[2]
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+    smlal       v24.4s, v14.4h, v5.4h[1]
+    smlsl       v26.4s, v14.4h, v0.4h[2]
+    smlal       v28.4s, v14.4h, v5.4h[3]
+    smlal       v30.4s, v14.4h, v4.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v5.4h[3]
+    smlsl       v26.4s, v15.4h, v1.4h[1]
+    smlal       v28.4s, v15.4h, v3.4h[1]
+    smlsl       v30.4s, v15.4h, v7.4h[3]
+
+
+    smlal       v20.4s, v12.4h, v5.4h[0]
+    smlal       v20.4s, v13.4h, v5.4h[2]
+    smlsl       v22.4s, v12.4h, v1.4h[0]
+    smlsl       v22.4s, v13.4h, v0.4h[2]
+    smlal       v16.4s, v12.4h, v7.4h[0]
+    smlal       v16.4s, v13.4h, v4.4h[2]
+    smlal       v18.4s, v12.4h, v3.4h[0]
+    smlal       v18.4s, v13.4h, v6.4h[2]
+
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+
+
+    smlal       v24.4s, v8.4h, v6.4h[1]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v2.4h[3]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v0.4h[1]     //// y1 * sin3(part of b2)
+    smlsl       v30.4s, v8.4h, v4.4h[1]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v6.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v4.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v1.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v0.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v6.4h[0]
+    smlal       v20.4s, v11.4h, v6.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v2.4h[0]
+    smlsl       v22.4s, v11.4h, v3.4h[2]
+
+    smlal       v16.4s, v10.4h, v2.4h[0]
+    smlal       v16.4s, v11.4h, v0.4h[2]
+
+    smlsl       v18.4s, v10.4h, v6.4h[0]
+    smlsl       v18.4s, v11.4h, v2.4h[2]
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+    smlal       v24.4s, v14.4h, v7.4h[1]
+    smlsl       v26.4s, v14.4h, v5.4h[3]
+    smlal       v28.4s, v14.4h, v4.4h[1]
+    smlsl       v30.4s, v14.4h, v2.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v7.4h[3]
+    smlsl       v26.4s, v15.4h, v7.4h[1]
+    smlal       v28.4s, v15.4h, v6.4h[3]
+    smlsl       v30.4s, v15.4h, v6.4h[1]
+
+
+    smlal       v20.4s, v12.4h, v7.4h[0]
+    smlal       v20.4s, v13.4h, v7.4h[2]
+    smlsl       v22.4s, v12.4h, v5.4h[0]
+    smlsl       v22.4s, v13.4h, v6.4h[2]
+    smlal       v16.4s, v12.4h, v3.4h[0]
+    smlal       v16.4s, v13.4h, v5.4h[2]
+    smlsl       v18.4s, v12.4h, v1.4h[0]
+    smlsl       v18.4s, v13.4h, v4.4h[2]
+
+stage2_shift1:
+    add         v8.4s,  v20.4s ,  v24.4s
+    sub         v10.4s,  v20.4s ,  v24.4s
+
+    add         v12.4s,  v22.4s ,  v26.4s
+    sub         v24.4s,  v22.4s ,  v26.4s
+
+    add         v14.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
+    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
+    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
+    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
+    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
+    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
+    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
+    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
+
+
+    umov        x15,v24.d[0]
+    umov        x16,v25.d[0]
+    umov        x19,v26.d[0]
+    umov        x20,v27.d[0]
+
+    trn1        v24.4h, v30.4h, v12.4h
+    trn2        v25.4h, v30.4h, v12.4h
+    trn1        v26.4h, v31.4h, v13.4h
+    trn2        v27.4h, v31.4h, v13.4h
+
+    trn1        v30.2s, v24.2s, v26.2s
+    trn2        v31.2s, v24.2s, v26.2s
+    trn1        v12.2s, v25.2s, v27.2s
+    trn2        v13.2s, v25.2s, v27.2s
+
+    trn1        v24.4h, v14.4h, v18.4h
+    trn2        v25.4h, v14.4h, v18.4h
+    trn1        v26.4h, v15.4h, v19.4h
+    trn2        v27.4h, v15.4h, v19.4h
+
+    trn1        v14.2s, v24.2s, v26.2s
+    trn2        v15.2s, v24.2s, v26.2s
+    trn1        v18.2s, v25.2s, v27.2s
+    trn2        v19.2s, v25.2s, v27.2s
+
+    mov         v24.d[0],x15
+    mov         v25.d[0],x16
+    mov         v26.d[0],x19
+    mov         v27.d[0],x20
+
+    st1         { v30.4h, v31.4h},[x0],#16
+    st1         { v12.4h, v13.4h},[x0],#16
+    st1         { v14.4h, v15.4h},[x0],#16
+    st1         { v18.4h, v19.4h},[x0],#16
+
+    mov         x1,x4
+
+
+
+
+
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+    smull       v24.4s, v8.4h, v2.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v8.4h, v2.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v8.4h, v3.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v8.4h, v3.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v6.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v7.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v6.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v4.4h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smull       v20.4s, v10.4h, v0.4h[0]
+    smlal       v20.4s, v11.4h, v4.4h[2]
+
+
+    smull       v22.4s, v10.4h, v0.4h[0]
+    smlal       v22.4s, v11.4h, v5.4h[2]
+
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v6.4h[2]
+
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v7.4h[2]
+
+    cmp         x12,x11
+    bhs         stage2_shift2
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+    smlsl       v24.4s, v14.4h, v4.4h[3]
+    smlsl       v26.4s, v14.4h, v2.4h[1]
+    smlsl       v28.4s, v14.4h, v0.4h[1]
+    smlsl       v30.4s, v14.4h, v2.4h[3]
+
+
+    smlsl       v24.4s, v15.4h, v0.4h[3]
+    smlsl       v26.4s, v15.4h, v3.4h[1]
+    smlsl       v28.4s, v15.4h, v6.4h[3]
+    smlal       v30.4s, v15.4h, v5.4h[3]
+
+
+    smlsl       v20.4s, v12.4h, v7.4h[0]
+    smlsl       v20.4s, v13.4h, v2.4h[2]
+    smlsl       v22.4s, v12.4h, v5.4h[0]
+    smlsl       v22.4s, v13.4h, v0.4h[2]
+    smlsl       v16.4s, v12.4h, v3.4h[0]
+    smlsl       v16.4s, v13.4h, v3.4h[2]
+    smlsl       v18.4s, v12.4h, v1.4h[0]
+    smlsl       v18.4s, v13.4h, v6.4h[2]
+
+    cmp         x12,x5
+    bhs         stage2_shift2
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+
+
+
+    smlsl       v24.4s, v8.4h, v4.4h[1]     //// y1 * cos1(part of b0)
+    smlal       v26.4s, v8.4h, v7.4h[1]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v2.4h[3]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v7.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v9.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v6.4h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlsl       v20.4s, v10.4h, v2.4h[0]
+    smlsl       v20.4s, v11.4h, v6.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v6.4h[0]
+    smlal       v22.4s, v11.4h, v4.4h[2]
+
+    smlal       v16.4s, v10.4h, v6.4h[0]
+    smlal       v16.4s, v11.4h, v0.4h[2]
+
+    smlal       v18.4s, v10.4h, v2.4h[0]
+    smlal       v18.4s, v11.4h, v5.4h[2]
+
+    cmp         x12,x6
+    bhs         stage2_shift2
+
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+
+    smlal       v24.4s, v14.4h, v2.4h[3]
+    smlal       v26.4s, v14.4h, v3.4h[3]
+    smlsl       v28.4s, v14.4h, v5.4h[3]
+    smlsl       v30.4s, v14.4h, v0.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v1.4h[3]
+    smlsl       v26.4s, v15.4h, v6.4h[3]
+    smlsl       v28.4s, v15.4h, v0.4h[3]
+    smlal       v30.4s, v15.4h, v7.4h[3]
+
+
+    smlal       v20.4s, v12.4h, v5.4h[0]
+    smlal       v20.4s, v13.4h, v0.4h[2]
+    smlal       v22.4s, v12.4h, v1.4h[0]
+    smlal       v22.4s, v13.4h, v6.4h[2]
+    smlal       v16.4s, v12.4h, v7.4h[0]
+    smlsl       v16.4s, v13.4h, v2.4h[2]
+    smlsl       v18.4s, v12.4h, v3.4h[0]
+    smlsl       v18.4s, v13.4h, v4.4h[2]
+
+    cmp         x12,x9
+    bhs         stage2_shift2
+
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+
+    smlal       v24.4s, v8.4h, v6.4h[1]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v1.4h[1]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v7.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v0.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v5.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v4.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v2.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v7.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v7.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v1.4h[2]
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v5.4h[2]
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v3.4h[2]
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+
+    smlsl       v24.4s, v14.4h, v0.4h[1]
+    smlal       v26.4s, v14.4h, v6.4h[1]
+    smlal       v28.4s, v14.4h, v4.4h[1]
+    smlsl       v30.4s, v14.4h, v1.4h[1]
+
+
+    smlsl       v24.4s, v15.4h, v3.4h[3]
+    smlal       v26.4s, v15.4h, v0.4h[1]
+    smlsl       v28.4s, v15.4h, v5.4h[1]
+    smlsl       v30.4s, v15.4h, v6.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v3.4h[0]
+    smlsl       v20.4s, v13.4h, v1.4h[2]
+    smlsl       v22.4s, v12.4h, v7.4h[0]
+    smlal       v22.4s, v13.4h, v3.4h[2]
+    smlal       v16.4s, v12.4h, v1.4h[0]
+    smlal       v16.4s, v13.4h, v7.4h[2]
+    smlsl       v18.4s, v12.4h, v5.4h[0]
+    smlsl       v18.4s, v13.4h, v2.4h[2]
+
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+    smlal       v24.4s, v8.4h, v7.4h[3]     //// y1 * cos1(part of b0)
+    smlal       v26.4s, v8.4h, v4.4h[3]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v2.4h[1]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v3.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v5.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v7.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v5.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlsl       v20.4s, v10.4h, v6.4h[0]
+    smlal       v20.4s, v11.4h, v5.4h[2]
+
+
+    smlal       v22.4s, v10.4h, v2.4h[0]
+    smlal       v22.4s, v11.4h, v7.4h[2]
+
+    smlsl       v16.4s, v10.4h, v2.4h[0]
+    smlsl       v16.4s, v11.4h, v4.4h[2]
+
+    smlal       v18.4s, v10.4h, v6.4h[0]
+    smlal       v18.4s, v11.4h, v1.4h[2]
+
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+    smlal       v24.4s, v14.4h, v1.4h[1]
+    smlsl       v26.4s, v14.4h, v0.4h[3]
+    smlal       v28.4s, v14.4h, v1.4h[3]
+    smlsl       v30.4s, v14.4h, v3.4h[1]
+
+
+    smlal       v24.4s, v15.4h, v5.4h[3]
+    smlsl       v26.4s, v15.4h, v5.4h[1]
+    smlal       v28.4s, v15.4h, v4.4h[3]
+    smlsl       v30.4s, v15.4h, v4.4h[1]
+
+
+    smlal       v20.4s, v12.4h, v1.4h[0]
+    smlal       v20.4s, v13.4h, v3.4h[2]
+    smlsl       v22.4s, v12.4h, v3.4h[0]
+    smlsl       v22.4s, v13.4h, v2.4h[2]
+    smlal       v16.4s, v12.4h, v5.4h[0]
+    smlal       v16.4s, v13.4h, v1.4h[2]
+    smlsl       v18.4s, v12.4h, v7.4h[0]
+    smlsl       v18.4s, v13.4h, v0.4h[2]
+
+stage2_shift2:
+    add         v8.4s,  v20.4s ,  v24.4s
+    sub         v10.4s,  v20.4s ,  v24.4s
+
+    add         v12.4s,  v22.4s ,  v26.4s
+    sub         v24.4s,  v22.4s ,  v26.4s
+
+    add         v14.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> shift_stage2_idct
+    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> shift_stage2_idct
+    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> shift_stage2_idct
+    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> shift_stage2_idct
+    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> shift_stage2_idct
+    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> shift_stage2_idct
+    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> shift_stage2_idct
+    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> shift_stage2_idct
+
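+    //// a c sketch of each sqrshrn lane above, assuming shift_stage2_idct
+    //// is 12 as .set in the sibling itrans_recon files:
+    ////     x[i] = clip_s16((acc[i] + (1 << 11)) >> 12);
+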
+    umov        x15,v24.d[0]
+    umov        x16,v25.d[0]
+    umov        x19,v26.d[0]
+    umov        x20,v27.d[0]
+
+    trn1        v24.4h, v30.4h, v12.4h
+    trn2        v25.4h, v30.4h, v12.4h
+    trn1        v26.4h, v31.4h, v13.4h
+    trn2        v27.4h, v31.4h, v13.4h
+
+    trn1        v30.2s, v24.2s, v26.2s
+    trn2        v31.2s, v24.2s, v26.2s
+    trn1        v12.2s, v25.2s, v27.2s
+    trn2        v13.2s, v25.2s, v27.2s
+
+    trn1        v24.4h, v14.4h, v18.4h
+    trn2        v25.4h, v14.4h, v18.4h
+    trn1        v26.4h, v15.4h, v19.4h
+    trn2        v27.4h, v15.4h, v19.4h
+
+    trn1        v14.2s, v24.2s, v26.2s
+    trn2        v15.2s, v24.2s, v26.2s
+    trn1        v18.2s, v25.2s, v27.2s
+    trn2        v19.2s, v25.2s, v27.2s
+
+    mov         v24.d[0],x15
+    mov         v25.d[0],x16
+    mov         v26.d[0],x19
+    mov         v27.d[0],x20
+
+    st1         { v30.4h, v31.4h},[x0],#16
+    st1         { v12.4h, v13.4h},[x0],#16
+    st1         { v14.4h, v15.4h},[x0],#16
+    st1         { v18.4h, v19.4h},[x0],#16
+
+
+    mov         x1,x4
+
+
+
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+    smull       v24.4s, v8.4h, v4.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v8.4h, v4.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v8.4h, v5.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v8.4h, v5.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v3.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v0.4h[2]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smull       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v7.4h[2]
+
+
+    smull       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v6.4h[2]
+
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlsl       v16.4s, v11.4h, v5.4h[2]
+
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlsl       v18.4s, v11.4h, v4.4h[2]
+
+    cmp         x12,x11
+    bhs         stage2_shift3
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+    smlsl       v24.4s, v14.4h, v5.4h[1]
+    smlsl       v26.4s, v14.4h, v7.4h[3]
+    smlal       v28.4s, v14.4h, v5.4h[3]
+    smlal       v30.4s, v14.4h, v3.4h[1]
+
+
+    smlal       v24.4s, v15.4h, v2.4h[1]
+    smlal       v26.4s, v15.4h, v1.4h[1]
+    smlal       v28.4s, v15.4h, v4.4h[3]
+    smlsl       v30.4s, v15.4h, v7.4h[3]
+
+
+    smlsl       v20.4s, v12.4h, v1.4h[0]
+    smlal       v20.4s, v13.4h, v6.4h[2]
+    smlsl       v22.4s, v12.4h, v3.4h[0]
+    smlal       v22.4s, v13.4h, v3.4h[2]
+    smlsl       v16.4s, v12.4h, v5.4h[0]
+    smlal       v16.4s, v13.4h, v0.4h[2]
+    smlsl       v18.4s, v12.4h, v7.4h[0]
+    smlal       v18.4s, v13.4h, v2.4h[2]
+
+    cmp         x12,x5
+    bhs         stage2_shift3
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+
+    smlal       v24.4s, v8.4h, v6.4h[1]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v5.4h[1]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v0.4h[3]     //// y1 * sin3(part of b2)
+    smlsl       v30.4s, v8.4h, v3.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v1.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v4.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v6.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v0.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v2.4h[0]
+    smlsl       v20.4s, v11.4h, v5.4h[2]
+
+
+    smlal       v22.4s, v10.4h, v6.4h[0]
+    smlsl       v22.4s, v11.4h, v0.4h[2]
+
+    smlsl       v16.4s, v10.4h, v6.4h[0]
+    smlsl       v16.4s, v11.4h, v4.4h[2]
+
+    smlsl       v18.4s, v10.4h, v2.4h[0]
+    smlal       v18.4s, v11.4h, v6.4h[2]
+
+    cmp         x12,x6
+    bhs         stage2_shift3
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+    smlsl       v24.4s, v14.4h, v7.4h[1]
+    smlal       v26.4s, v14.4h, v2.4h[1]
+    smlal       v28.4s, v14.4h, v4.4h[1]
+    smlsl       v30.4s, v14.4h, v5.4h[1]
+
+
+    smlal       v24.4s, v15.4h, v0.4h[3]
+    smlal       v26.4s, v15.4h, v7.4h[1]
+    smlsl       v28.4s, v15.4h, v1.4h[1]
+    smlsl       v30.4s, v15.4h, v6.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v3.4h[0]
+    smlal       v20.4s, v13.4h, v4.4h[2]
+    smlal       v22.4s, v12.4h, v7.4h[0]
+    smlal       v22.4s, v13.4h, v2.4h[2]
+    smlal       v16.4s, v12.4h, v1.4h[0]
+    smlsl       v16.4s, v13.4h, v6.4h[2]
+    smlal       v18.4s, v12.4h, v5.4h[0]
+    smlsl       v18.4s, v13.4h, v0.4h[2]
+
+    cmp         x12,x9
+    bhs         stage2_shift3
+
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+    smlsl       v24.4s, v8.4h, v7.4h[3]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v0.4h[1]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v6.4h[3]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v0.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v9.4h, v5.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v2.4h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v3.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v5.4h[2]
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v1.4h[2]
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlal       v18.4s, v11.4h, v7.4h[2]
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+
+    smlal       v24.4s, v14.4h, v6.4h[3]
+    smlal       v26.4s, v14.4h, v3.4h[3]
+    smlsl       v28.4s, v14.4h, v1.4h[3]
+    smlal       v30.4s, v14.4h, v7.4h[1]
+
+
+    smlal       v24.4s, v15.4h, v1.4h[3]
+    smlsl       v26.4s, v15.4h, v2.4h[3]
+    smlal       v28.4s, v15.4h, v7.4h[1]
+    smlal       v30.4s, v15.4h, v4.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v5.4h[0]
+    smlal       v20.4s, v13.4h, v2.4h[2]
+    smlal       v22.4s, v12.4h, v1.4h[0]
+    smlsl       v22.4s, v13.4h, v7.4h[2]
+    smlsl       v16.4s, v12.4h, v7.4h[0]
+    smlsl       v16.4s, v13.4h, v3.4h[2]
+    smlsl       v18.4s, v12.4h, v3.4h[0]
+    smlal       v18.4s, v13.4h, v1.4h[2]
+
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+    smlsl       v24.4s, v8.4h, v5.4h[3]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v6.4h[3]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v3.4h[1]     //// y1 * sin3(part of b2)
+    smlsl       v30.4s, v8.4h, v0.4h[1]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v2.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v9.4h, v0.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v2.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal       v30.4s, v9.4h, v4.4h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v6.4h[0]
+    smlsl       v20.4s, v11.4h, v1.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v2.4h[0]
+    smlal       v22.4s, v11.4h, v4.4h[2]
+
+    smlal       v16.4s, v10.4h, v2.4h[0]
+    smlsl       v16.4s, v11.4h, v7.4h[2]
+
+    smlsl       v18.4s, v10.4h, v6.4h[0]
+    smlsl       v18.4s, v11.4h, v5.4h[2]
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+    smlal       v24.4s, v14.4h, v4.4h[3]
+    smlsl       v26.4s, v14.4h, v6.4h[1]
+    smlal       v28.4s, v14.4h, v7.4h[3]
+    smlal       v30.4s, v14.4h, v6.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v3.4h[3]
+    smlsl       v26.4s, v15.4h, v3.4h[1]
+    smlal       v28.4s, v15.4h, v2.4h[3]
+    smlsl       v30.4s, v15.4h, v2.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v7.4h[0]
+    smlal       v20.4s, v13.4h, v0.4h[2]
+    smlal       v22.4s, v12.4h, v5.4h[0]
+    smlsl       v22.4s, v13.4h, v1.4h[2]
+    smlsl       v16.4s, v12.4h, v3.4h[0]
+    smlal       v16.4s, v13.4h, v2.4h[2]
+    smlal       v18.4s, v12.4h, v1.4h[0]
+    smlsl       v18.4s, v13.4h, v3.4h[2]
+
+stage2_shift3:
+    add         v8.4s,  v20.4s ,  v24.4s
+    sub         v10.4s,  v20.4s ,  v24.4s
+
+    add         v12.4s,  v22.4s ,  v26.4s
+    sub         v24.4s,  v22.4s ,  v26.4s
+
+    add         v14.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> shift_stage2_idct
+    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> shift_stage2_idct
+    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> shift_stage2_idct
+    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> shift_stage2_idct
+    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> shift_stage2_idct
+    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> shift_stage2_idct
+    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> shift_stage2_idct
+    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> shift_stage2_idct
+
+    umov        x15,v24.d[0]
+    umov        x16,v25.d[0]
+    umov        x19,v26.d[0]
+    umov        x20,v27.d[0]
+
+    trn1        v24.4h, v30.4h, v12.4h
+    trn2        v25.4h, v30.4h, v12.4h
+    trn1        v26.4h, v31.4h, v13.4h
+    trn2        v27.4h, v31.4h, v13.4h
+
+    trn1        v30.2s, v24.2s, v26.2s
+    trn2        v31.2s, v24.2s, v26.2s
+    trn1        v12.2s, v25.2s, v27.2s
+    trn2        v13.2s, v25.2s, v27.2s
+
+    trn1        v24.4h, v14.4h, v18.4h
+    trn2        v25.4h, v14.4h, v18.4h
+    trn1        v26.4h, v15.4h, v19.4h
+    trn2        v27.4h, v15.4h, v19.4h
+
+    trn1        v14.2s, v24.2s, v26.2s
+    trn2        v15.2s, v24.2s, v26.2s
+    trn1        v18.2s, v25.2s, v27.2s
+    trn2        v19.2s, v25.2s, v27.2s
+
+    mov         v24.d[0],x15
+    mov         v25.d[0],x16
+    mov         v26.d[0],x19
+    mov         v27.d[0],x20
+
+    st1         { v30.4h, v31.4h},[x0],#16
+    st1         { v12.4h, v13.4h},[x0],#16
+    st1         { v14.4h, v15.4h},[x0],#16
+    st1         { v18.4h, v19.4h},[x0],#16
+
+
+
+    mov         x1,x4
+
+
+
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+    smull       v24.4s, v8.4h, v6.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v8.4h, v6.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v8.4h, v7.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v8.4h, v7.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v2.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v4.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v5.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v7.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smull       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v3.4h[2]
+
+
+    smull       v22.4s, v10.4h, v0.4h[0]
+    smlsl       v22.4s, v11.4h, v2.4h[2]
+
+    smull       v16.4s, v10.4h, v0.4h[0]
+    smlsl       v16.4s, v11.4h, v1.4h[2]
+
+    smull       v18.4s, v10.4h, v0.4h[0]
+    smlsl       v18.4s, v11.4h, v0.4h[2]
+
+    cmp         x12,x11
+    bhs         stage2_shift4
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+
+    smlal       v24.4s, v14.4h, v0.4h[1]
+    smlal       v26.4s, v14.4h, v1.4h[3]
+    smlal       v28.4s, v14.4h, v4.4h[1]
+    smlal       v30.4s, v14.4h, v6.4h[3]
+
+
+    smlsl       v24.4s, v15.4h, v4.4h[1]
+    smlsl       v26.4s, v15.4h, v0.4h[3]
+    smlsl       v28.4s, v15.4h, v2.4h[3]
+    smlsl       v30.4s, v15.4h, v6.4h[1]
+
+
+    smlal       v20.4s, v12.4h, v7.4h[0]
+    smlal       v20.4s, v13.4h, v5.4h[2]
+    smlal       v22.4s, v12.4h, v5.4h[0]
+    smlsl       v22.4s, v13.4h, v7.4h[2]
+    smlal       v16.4s, v12.4h, v3.4h[0]
+    smlsl       v16.4s, v13.4h, v4.4h[2]
+    smlal       v18.4s, v12.4h, v1.4h[0]
+    smlsl       v18.4s, v13.4h, v1.4h[2]
+
+    cmp         x12,x5
+    bhs         stage2_shift4
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+
+    smlal       v24.4s, v8.4h, v7.4h[3]     //// y1 * cos1(part of b0)
+    smlal       v26.4s, v8.4h, v3.4h[1]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v5.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v4.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v5.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v5.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlsl       v20.4s, v10.4h, v2.4h[0]
+    smlal       v20.4s, v11.4h, v1.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v6.4h[0]
+    smlal       v22.4s, v11.4h, v3.4h[2]
+
+    smlal       v16.4s, v10.4h, v6.4h[0]
+    smlsl       v16.4s, v11.4h, v7.4h[2]
+
+    smlal       v18.4s, v10.4h, v2.4h[0]
+    smlsl       v18.4s, v11.4h, v2.4h[2]
+
+    cmp         x12,x6
+    bhs         stage2_shift4
+
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+
+    smlsl       v24.4s, v14.4h, v1.4h[1]
+    smlsl       v26.4s, v14.4h, v7.4h[3]
+    smlal       v28.4s, v14.4h, v1.4h[3]
+    smlal       v30.4s, v14.4h, v4.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v2.4h[1]
+    smlal       v26.4s, v15.4h, v5.4h[1]
+    smlsl       v28.4s, v15.4h, v3.4h[1]
+    smlsl       v30.4s, v15.4h, v4.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v5.4h[0]
+    smlsl       v20.4s, v13.4h, v7.4h[2]
+    smlsl       v22.4s, v12.4h, v1.4h[0]
+    smlal       v22.4s, v13.4h, v1.4h[2]
+    smlsl       v16.4s, v12.4h, v7.4h[0]
+    smlal       v16.4s, v13.4h, v5.4h[2]
+    smlal       v18.4s, v12.4h, v3.4h[0]
+    smlsl       v18.4s, v13.4h, v3.4h[2]
+
+    cmp         x12,x9
+    bhs         stage2_shift4
+
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+    smlsl       v24.4s, v8.4h, v5.4h[3]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v2.4h[3]     //// y1 * cos3(part of b1)
+    smlal       v28.4s, v8.4h, v4.4h[3]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v3.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v6.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal       v26.4s, v9.4h, v0.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v6.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v3.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlal       v20.4s, v10.4h, v0.4h[0]
+    smlsl       v20.4s, v11.4h, v0.4h[2]
+
+
+    smlsl       v22.4s, v10.4h, v0.4h[0]
+    smlal       v22.4s, v11.4h, v6.4h[2]
+
+    smlsl       v16.4s, v10.4h, v0.4h[0]
+    smlal       v16.4s, v11.4h, v2.4h[2]
+
+    smlal       v18.4s, v10.4h, v0.4h[0]
+    smlsl       v18.4s, v11.4h, v4.4h[2]
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+
+    smlal       v24.4s, v14.4h, v3.4h[1]
+    smlsl       v26.4s, v14.4h, v2.4h[1]
+    smlal       v28.4s, v14.4h, v7.4h[3]
+    smlal       v30.4s, v14.4h, v2.4h[3]
+
+
+    smlsl       v24.4s, v15.4h, v0.4h[3]
+    smlal       v26.4s, v15.4h, v4.4h[3]
+    smlal       v28.4s, v15.4h, v6.4h[3]
+    smlsl       v30.4s, v15.4h, v2.4h[1]
+
+
+    smlal       v20.4s, v12.4h, v3.4h[0]
+    smlsl       v20.4s, v13.4h, v6.4h[2]
+    smlal       v22.4s, v12.4h, v7.4h[0]
+    smlsl       v22.4s, v13.4h, v4.4h[2]
+    smlsl       v16.4s, v12.4h, v1.4h[0]
+    smlal       v16.4s, v13.4h, v0.4h[2]
+    smlal       v18.4s, v12.4h, v5.4h[0]
+    smlsl       v18.4s, v13.4h, v5.4h[2]
+
+
+    ld1         {v10.4h, v11.4h},[x1],#16
+    ld1         {v8.4h, v9.4h},[x1],x10
+
+
+
+
+    smlal       v24.4s, v8.4h, v3.4h[3]     //// y1 * cos1(part of b0)
+    smlsl       v26.4s, v8.4h, v7.4h[1]     //// y1 * cos3(part of b1)
+    smlsl       v28.4s, v8.4h, v5.4h[1]     //// y1 * sin3(part of b2)
+    smlal       v30.4s, v8.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlsl       v24.4s, v9.4h, v7.4h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v6.4h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal       v28.4s, v9.4h, v3.4h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+    smlsl       v20.4s, v10.4h, v6.4h[0]
+    smlal       v20.4s, v11.4h, v2.4h[2]
+
+
+    smlal       v22.4s, v10.4h, v2.4h[0]
+    smlsl       v22.4s, v11.4h, v0.4h[2]
+
+    smlsl       v16.4s, v10.4h, v2.4h[0]
+    smlal       v16.4s, v11.4h, v3.4h[2]
+
+    smlal       v18.4s, v10.4h, v6.4h[0]
+    smlsl       v18.4s, v11.4h, v6.4h[2]
+
+
+    ld1         {v12.4h, v13.4h},[x1],#16
+    ld1         {v14.4h, v15.4h},[x1],x10
+
+
+
+    smlsl       v24.4s, v14.4h, v5.4h[1]
+    smlal       v26.4s, v14.4h, v3.4h[3]
+    smlsl       v28.4s, v14.4h, v2.4h[1]
+    smlal       v30.4s, v14.4h, v0.4h[3]
+
+
+    smlal       v24.4s, v15.4h, v1.4h[3]
+    smlsl       v26.4s, v15.4h, v1.4h[1]
+    smlal       v28.4s, v15.4h, v0.4h[3]
+    smlsl       v30.4s, v15.4h, v0.4h[1]
+
+
+    smlsl       v20.4s, v12.4h, v1.4h[0]
+    smlal       v20.4s, v13.4h, v4.4h[2]
+    smlal       v22.4s, v12.4h, v3.4h[0]
+    smlsl       v22.4s, v13.4h, v5.4h[2]
+    smlsl       v16.4s, v12.4h, v5.4h[0]
+    smlal       v16.4s, v13.4h, v6.4h[2]
+    smlal       v18.4s, v12.4h, v7.4h[0]
+    smlsl       v18.4s, v13.4h, v7.4h[2]
+
+stage2_shift4:
+    add         v8.4s,  v20.4s ,  v24.4s
+    sub         v10.4s,  v20.4s ,  v24.4s
+
+    add         v12.4s,  v22.4s ,  v26.4s
+    sub         v24.4s,  v22.4s ,  v26.4s
+
+    add         v14.4s,  v16.4s ,  v28.4s
+    sub         v26.4s,  v16.4s ,  v28.4s
+
+
+    add         v16.4s,  v18.4s ,  v30.4s
+    sub         v28.4s,  v18.4s ,  v30.4s
+
+
+    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> shift_stage2_idct
+    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> shift_stage2_idct
+    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> shift_stage2_idct
+    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> shift_stage2_idct
+    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> shift_stage2_idct
+    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> shift_stage2_idct
+    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> shift_stage2_idct
+    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> shift_stage2_idct
+
+
+
+    umov        x15,v24.d[0]
+    umov        x16,v25.d[0]
+    umov        x19,v26.d[0]
+    umov        x20,v27.d[0]
+
+    trn1        v24.4h, v30.4h, v12.4h
+    trn2        v25.4h, v30.4h, v12.4h
+    trn1        v26.4h, v31.4h, v13.4h
+    trn2        v27.4h, v31.4h, v13.4h
+
+    trn1        v30.2s, v24.2s, v26.2s
+    trn2        v31.2s, v24.2s, v26.2s
+    trn1        v12.2s, v25.2s, v27.2s
+    trn2        v13.2s, v25.2s, v27.2s
+
+    trn1        v24.4h, v14.4h, v18.4h
+    trn2        v25.4h, v14.4h, v18.4h
+    trn1        v26.4h, v15.4h, v19.4h
+    trn2        v27.4h, v15.4h, v19.4h
+
+    trn1        v14.2s, v24.2s, v26.2s
+    trn2        v15.2s, v24.2s, v26.2s
+    trn1        v18.2s, v25.2s, v27.2s
+    trn2        v19.2s, v25.2s, v27.2s
+
+    mov         v24.d[0],x15
+    mov         v25.d[0],x16
+    mov         v26.d[0],x19
+    mov         v27.d[0],x20
+
+    st1         { v30.4h, v31.4h},[x0],#16
+    st1         { v12.4h, v13.4h},[x0],#16
+    st1         { v14.4h, v15.4h},[x0],#16
+    st1         { v18.4h, v19.4h},[x0],#16
+
+
+
+
+    sub         x0,x0,#256
+prediction_buffer:
+
+
+    ld1         {v12.8h},[x0],#16
+    ld1         {v14.8h},[x0],#16
+
+    add         x0,x0,#32
+
+    ld1         {v16.8h},[x0],#16
+    ld1         {v18.8h},[x0],#16
+    add         x0,x0,#32
+
+    ld1         {v20.8h},[x0],#16
+    ld1         {v22.8h},[x0],#16
+
+
+    add         x0,x0,#32
+
+    ld1         {v24.8h},[x0],#16
+    ld1         {v26.8h},[x0],#16
+
+
+
+
+
+// d12 = x0  1-4   values
+// d13 = x2  1-4   values
+// d14 = x1  1-4   values
+// d15 = x3  1-4   values
+
+// d16 = x0  5-8   values
+// d17 = x2  5-8   values
+// d18 = x1  5-8   values
+// d19 = x3  5-8   values
+
+// d20 = x0  9-12  values
+// d21 = x2  9-12  values
+// d22 = x1  9-12  values
+// d23 = x3  9-12  values
+
+// d24 = x0  13-16 values
+// d25 = x2  13-16 values
+// d26 = x1  13-16 values
+// d27 = x3  13-16 values
+
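+// the swap + uaddw + sqxtun sequence below is the reconstruction step; as a
+// c sketch (names are illustrative, not the reference code):
+//     dst[i] = clip_u8(res[i] + pred[i]);    // clip_u8 saturates to 0..255
+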
+    // swapping v12 upper and v16 lower 64bits
+    mov         v13.d[0], v12.d[1]
+    mov         v12.d[1], v16.d[0]
+    mov         v16.d[0], v13.d[0]
+    // swapping v20 upper and v24 lower 64bits
+    mov         v21.d[0], v20.d[1]
+    mov         v20.d[1], v24.d[0]
+    mov         v24.d[0], v21.d[0]
+    // swapping v14 upper and v18 lower 64bits
+    mov         v15.d[0], v14.d[1]
+    mov         v14.d[1], v18.d[0]
+    mov         v18.d[0], v15.d[0]
+    // swapping v22 upper and v26 lower 64bits
+    mov         v23.d[0], v22.d[1]
+    mov         v22.d[1], v26.d[0]
+    mov         v26.d[0], v23.d[0]
+
+
+    ld1         {v8.8b, v9.8b},[x2],x8
+    ld1         {v10.8b, v11.8b},[x2],x8
+    ld1         {v28.8b, v29.8b},[x2],x8
+    ld1         {v30.8b, v31.8b},[x2],x8
+
+
+    uaddw       v12.8h,  v12.8h ,  v8.8b
+    uaddw       v20.8h,  v20.8h ,  v9.8b
+    uaddw       v14.8h,  v14.8h ,  v10.8b
+    uaddw       v22.8h,  v22.8h ,  v11.8b
+    uaddw       v16.8h,  v16.8h ,  v28.8b
+    uaddw       v24.8h,  v24.8h ,  v29.8b
+    uaddw       v18.8h,  v18.8h ,  v30.8b
+    uaddw       v26.8h,  v26.8h ,  v31.8b
+    sub         x2,x2,x8,lsl #2
+    add         x2,x2,#16
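+    // net effect: x2 is back at the first of the four pred rows just read,
+    // advanced 16 bytes into them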
+    sqxtun      v12.8b, v12.8h
+    sqxtun      v13.8b, v20.8h
+    sqxtun      v20.8b, v14.8h
+    sqxtun      v21.8b, v22.8h
+    sqxtun      v14.8b, v16.8h
+    sqxtun      v15.8b, v24.8h
+    sqxtun      v22.8b, v18.8h
+    sqxtun      v23.8b, v26.8h
+
+
+    st1         {v12.8b, v13.8b},[x3],x7
+    st1         {v20.8b, v21.8b},[x3],x7
+    st1         {v14.8b, v15.8b},[x3],x7
+    st1         {v22.8b, v23.8b},[x3],x7
+
+
+    sub         x3,x3,x7,lsl #2
+    add         x3,x3,#16
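+    // net effect: x3 is back at the first of the four rows just written,
+    // advanced 16 bytes into them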
+
+    ld1         {v12.8h},[x0],#16
+    ld1         {v14.8h},[x0],#16
+
+    sub         x0,x0,#96
+
+    ld1         {v16.8h},[x0],#16
+    ld1         {v18.8h},[x0],#16
+    sub         x0,x0,#96
+
+    ld1         {v20.8h},[x0],#16
+    ld1         {v22.8h},[x0],#16
+
+
+    sub         x0,x0,#96
+
+    ld1         {v24.8h},[x0],#16
+    ld1         {v26.8h},[x0],#16
+
+
+    sub         x0,x0,#64
+
+
+    // swapping v12 upper and v16 lower 64bits
+    mov         v13.d[0], v12.d[1]
+    mov         v12.d[1], v16.d[0]
+    mov         v16.d[0], v13.d[0]
+    // swapping v20 upper and v24 lower 64bits
+    mov         v21.d[0], v20.d[1]
+    mov         v20.d[1], v24.d[0]
+    mov         v24.d[0], v21.d[0]
+    // swapping v14 upper and v18 lower 64bits
+    mov         v15.d[0], v14.d[1]
+    mov         v14.d[1], v18.d[0]
+    mov         v18.d[0], v15.d[0]
+    // swapping v22 upper and v26 lower 64bits
+    mov         v23.d[0], v22.d[1]
+    mov         v22.d[1], v26.d[0]
+    mov         v26.d[0], v23.d[0]
+
+
+    ld1         {v8.8b, v9.8b},[x2],x8
+    ld1         {v10.8b, v11.8b},[x2],x8
+    ld1         {v28.8b, v29.8b},[x2],x8
+    ld1         {v30.8b, v31.8b},[x2],x8
+
+
+    uaddw       v12.8h,  v12.8h ,  v8.8b
+    uaddw       v20.8h,  v20.8h ,  v9.8b
+    uaddw       v14.8h,  v14.8h ,  v10.8b
+    uaddw       v22.8h,  v22.8h ,  v11.8b
+    uaddw       v16.8h,  v16.8h ,  v28.8b
+    uaddw       v24.8h,  v24.8h ,  v29.8b
+    uaddw       v18.8h,  v18.8h ,  v30.8b
+    uaddw       v26.8h,  v26.8h ,  v31.8b
+    sub         x2,x2,#16
+
+    sqxtun      v12.8b, v12.8h
+    sqxtun      v13.8b, v20.8h
+    sqxtun      v20.8b, v14.8h
+    sqxtun      v21.8b, v22.8h
+    sqxtun      v14.8b, v16.8h
+    sqxtun      v15.8b, v24.8h
+    sqxtun      v22.8b, v18.8h
+    sqxtun      v23.8b, v26.8h
+
+
+    st1         {v12.8b, v13.8b},[x3],x7
+    st1         {v20.8b, v21.8b},[x3],x7
+    st1         {v14.8b, v15.8b},[x3],x7
+    st1         {v22.8b, v23.8b},[x3],x7
+
+    sub         x3,x3,#16
+
+    subs        x14,x14,#1
+    bne         dct_stage2
+    // ldmfd sp!,{x0-x12,pc}
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
diff --git a/common/arm64/ihevc_itrans_recon_4x4.s b/common/arm64/ihevc_itrans_recon_4x4.s
new file mode 100644
index 0000000..b18fb89
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_4x4.s
@@ -0,0 +1,237 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// *  ihevc_itrans_recon_4x4.s
+// *
+// * @brief
+// *  contains function definitions for single stage  inverse transform
+// *
+// * @author
+// *     naveen sr
+// *
+// * @par list of functions:
+// *  - ihevc_itrans_recon_4x4()
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+//*/
+// /**
+// *******************************************************************************
+// *
+// * @brief
+// *  this function performs inverse transform  and reconstruction for 4x4
+// * input block
+// *
+// * @par description:
+// *  performs inverse transform and adds the prediction  data and clips output
+// * to 8 bit
+// *
+// * @param[in] pi2_src
+// *  input 4x4 coefficients
+// *
+// * @param[in] pi2_tmp
+// *  temporary 4x4 buffer for storing inverse
+// *  transform 1st stage output
+// *
+// * @param[in] pu1_pred
+// *  prediction 4x4 block
+// *
+// * @param[out] pu1_dst
+// *  output 4x4 block
+// *
+// * @param[in] src_strd
+// *  input stride
+// *
+// * @param[in] pred_strd
+// *  prediction stride
+// *
+// * @param[in] dst_strd
+// *  output stride
+// *
+// * @param[in] zero_cols
+// *  zero columns in pi2_src
+// *
+// * @returns  void
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+// */
+//void ihevc_itrans_recon_4x4(word16 *pi2_src,
+//        word16 *pi2_tmp,
+//        uword8 *pu1_pred,
+//        uword8 *pu1_dst,
+//        word32 src_strd,
+//        word32 pred_strd,
+//        word32 dst_strd,
+//        word32 zero_cols)
+//**************variables vs registers*************************
+//    x0 => *pi2_src
+//    x1 => *pi2_tmp
+//    x2 => *pu1_pred
+//    x3 => *pu1_dst
+//    x4 => src_strd
+//    x5 => pred_strd
+//    x6 => dst_strd
+//    x7 => zero_cols
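+
+//    per-column reference for one 4-point pass (a c sketch of what the neon
+//    code below computes; 64/83/36 are the constants loaded from
+//    g_ai2_ihevc_trans_4_transpose):
+//        o0 = 83*s1 + 36*s3;        o1 = 36*s1 - 83*s3;
+//        e0 = 64*(s0 + s2);         e1 = 64*(s0 - s2);
+//        out0 = clip_s16((e0 + o0 + rnd) >> shift);
+//        out1 = clip_s16((e1 + o1 + rnd) >> shift);
+//        out2 = clip_s16((e1 - o1 + rnd) >> shift);
+//        out3 = clip_s16((e0 - o0 + rnd) >> shift);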
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.set shift_stage1_idct ,   7
+.set shift_stage2_idct ,   12
+
+
+
+.globl ihevc_itrans_recon_4x4_av8
+
+.extern g_ai2_ihevc_trans_4_transpose
+
+.type ihevc_itrans_recon_4x4_av8, %function
+
+ihevc_itrans_recon_4x4_av8:
+
+    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    adrp        x8, :got:g_ai2_ihevc_trans_4_transpose
+    ldr         x8, [x8, #:got_lo12:g_ai2_ihevc_trans_4_transpose]
+
+    add         x4,x4,x4                    // src_strd in terms of word16
+    add         x9,x0,x4                    // pi2_src[0] + src_strd
+
+    ld1         {v4.4h},[x8]                //loading first row of g_ai2_ihevc_trans_4_transpose
+    // d4 = {36,64,83,64}
+    //index = 3  2  1  0
+    add         x10,x9,x4, lsl #1           // 3*src_strd
+    add         x4,x4,x4
+    ld1         {v1.4h},[x9]                //loading pi2_src 2nd row
+    ld1         {v3.4h},[x10]               //loading pi2_src 4th row
+    ld1         {v0.4h},[x0],x4             //loading pi2_src 1st row
+    ld1         {v2.4h},[x0],x4             //loading pi2_src 3rd row
+
+
+    // first stage computation starts
+    smull       v6.4s, v1.4h, v4.4h[1]      //83 * pi2_src[1]
+    smlal       v6.4s, v3.4h, v4.4h[3]      //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
+    smull       v8.4s, v1.4h, v4.4h[3]      //36 * pi2_src[1]
+    ld1         {v22.s}[0],[x2],x5
+    smlsl       v8.4s, v3.4h, v4.4h[1]      //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+
+    saddl       v10.4s, v0.4h, v2.4h        //pi2_src[0] + pi2_src[2]
+    ssubl       v12.4s, v0.4h, v2.4h        //pi2_src[0] - pi2_src[2]
+    shl         v10.4s, v10.4s,#6           //e[0] = 64*(pi2_src[0] + pi2_src[2])
+    shl         v12.4s, v12.4s,#6           //e[1] = 64*(pi2_src[0] - pi2_src[2])
+
+    add         v14.4s,  v10.4s ,  v6.4s    //((e[0] + o[0] )
+    add         v16.4s,  v12.4s ,  v8.4s    //((e[1] + o[1])
+    sub         v18.4s,  v12.4s ,  v8.4s    //((e[1] - o[1])
+    sub         v20.4s,  v10.4s ,  v6.4s    //((e[0] - o[0])
+
+    sqrshrn     v28.4h, v14.4s,#shift_stage1_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift)
+    sqrshrn     v29.4h, v16.4s,#shift_stage1_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift)
+    sqrshrn     v30.4h, v18.4s,#shift_stage1_idct //pi2_out[2] = clip_s16((e[1] - o[1] + add)>>shift)
+    sqrshrn     v31.4h, v20.4s,#shift_stage1_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift)
+
+    trn1        v24.4h, v28.4h, v29.4h
+    trn2        v25.4h, v28.4h, v29.4h
+    trn1        v26.4h, v30.4h, v31.4h
+    trn2        v27.4h, v30.4h, v31.4h
+    trn1        v0.2s, v24.2s, v26.2s
+    trn2        v2.2s, v24.2s, v26.2s
+    trn1        v1.2s, v25.2s, v27.2s
+    trn2        v3.2s, v25.2s, v27.2s
+
+    // first stage ends
+    // output in d0,d1,d2,d3
+    // second stage starts
+    smull       v6.4s, v1.4h, v4.4h[1]      //83 * pi2_src[1]
+    ld1         {v22.s}[1],[x2],x5
+    smlal       v6.4s, v3.4h, v4.4h[3]      //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
+    smull       v8.4s, v1.4h, v4.4h[3]      //36 * pi2_src[1]
+    smlsl       v8.4s, v3.4h, v4.4h[1]      //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+    ld1         {v23.s}[0],[x2],x5
+
+    saddl       v10.4s, v0.4h, v2.4h        //pi2_src[0] + pi2_src[2]
+    ssubl       v12.4s, v0.4h, v2.4h        //pi2_src[0] - pi2_src[2]
+    shl         v10.4s, v10.4s,#6           //e[0] = 64*(pi2_src[0] + pi2_src[2])
+    shl         v12.4s, v12.4s,#6           //e[1] = 64*(pi2_src[0] - pi2_src[2])
+
+
+    add         v14.4s,  v10.4s ,  v6.4s    //((e[0] + o[0] )
+    add         v16.4s,  v12.4s ,  v8.4s    //((e[1] + o[1])
+    sub         v18.4s,  v12.4s ,  v8.4s    //((e[1] - o[1])
+    sub         v20.4s,  v10.4s ,  v6.4s    //((e[0] - o[0])
+
+    sqrshrn     v28.4h, v14.4s,#shift_stage2_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift)
+    sqrshrn     v29.4h, v16.4s,#shift_stage2_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift)
+    sqrshrn     v30.4h, v18.4s,#shift_stage2_idct //pi2_out[2] = clip_s16((e[1] - o[1] + add)>>shift)
+    sqrshrn     v31.4h, v20.4s,#shift_stage2_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift)
+    ld1         {v23.s}[1],[x2],x5
+
+    trn1        v24.4h, v28.4h, v29.4h
+    trn2        v25.4h, v28.4h, v29.4h
+    trn1        v26.4h, v30.4h, v31.4h
+    trn2        v27.4h, v30.4h, v31.4h
+    trn1        v0.2s, v24.2s, v26.2s
+    trn2        v2.2s, v24.2s, v26.2s
+    trn1        v1.2s, v25.2s, v27.2s
+    trn2        v3.2s, v25.2s, v27.2s
+    // second stage ends
+    // output in d0,d1,d2,d3
+
+    // loading pred
+
+    mov         v0.d[1],v1.d[0]
+    mov         v2.d[1],v3.d[0]
+
+    uaddw       v0.8h,  v0.8h ,  v22.8b     // pi2_out(16bit) + pu1_pred(8bit)
+    uaddw       v2.8h,  v2.8h ,  v23.8b     // pi2_out(16bit) + pu1_pred(8bit)
+    sqxtun      v0.8b, v0.8h                // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+    sqxtun      v1.8b, v2.8h                // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+
+    // storing destination
+    st1         {v0.s}[0],[x3],x6
+    st1         {v0.s}[1],[x3],x6
+    st1         {v1.s}[0],[x3],x6
+    st1         {v1.s}[1],[x3],x6
+
+
+    // ldmfd sp!,{x4-x12,x15}                //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
diff --git a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
new file mode 100644
index 0000000..fa04b8e
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
@@ -0,0 +1,246 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// *  ihevc_itrans_recon_4x4_ttype1.s
+// *
+// * @brief
+// *  contains function definitions for inverse transform  and reconstruction
+// *
+// *
+// * @author
+// *  naveen sr
+// *
+// * @par list of functions:
+// *  - ihevc_itrans_recon_4x4_ttype1()
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+// */
+
+///* all the functions here are replicated from ihevc_itrans.c and modified to */
+///* include reconstruction */
+//
+///**
+// *******************************************************************************
+// *
+// * @brief
+// *  this function performs inverse transform type 1 (dst)  and reconstruction
+// * for 4x4 input block
+// *
+// * @par description:
+// *  performs inverse transform and adds the prediction  data and clips output
+// * to 8 bit
+// *
+// * @param[in] pi2_src
+// *  input 4x4 coefficients
+// *
+// * @param[in] pi2_tmp
+// *  temporary 4x4 buffer for storing inverse
+// *  transform 1st stage output
+// *
+// * @param[in] pu1_pred
+// *  prediction 4x4 block
+// *
+// * @param[out] pu1_dst
+// *  output 4x4 block
+// *
+// * @param[in] src_strd
+// *  input stride
+// *
+// * @param[in] pred_strd
+// *  prediction stride
+// *
+// * @param[in] dst_strd
+// *  output stride
+// *
+// * @param[in] zero_cols
+// *  zero columns in pi2_src
+// *
+// * @returns  void
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+// */
+//void ihevc_itrans_recon_4x4_ttype1(word16 *pi2_src,
+//        word16 *pi2_tmp,
+//        uword8 *pu1_pred,
+//        uword8 *pu1_dst,
+//        word32 src_strd,
+//        word32 pred_strd,
+//        word32 dst_strd,
+//        word32 zero_cols)
+
+//**************variables vs registers*************************
+//    x0 => *pi2_src
+//    x1 => *pi2_tmp
+//    x2 => *pu1_pred
+//    x3 => *pu1_dst
+//    x4 => src_strd
+//    x5 => pred_strd
+//    x6 => dst_strd
+//    x7 => zero_cols
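+
+//    per-column reference for one 4-point dst pass (a c sketch of what the
+//    neon code below computes, using the 29/55/74/84 constants set up in v4):
+//        out0 = clip_s16((29*s0 + 74*s1 + 84*s2 + 55*s3 + rnd) >> shift);
+//        out1 = clip_s16((55*s0 + 74*s1 - 29*s2 - 84*s3 + rnd) >> shift);
+//        out2 = clip_s16((74*s0         - 74*s2 + 74*s3 + rnd) >> shift);
+//        out3 = clip_s16((84*s0 - 74*s1 + 55*s2 - 29*s3 + rnd) >> shift);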
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.set shift_stage1_idct ,   7
+.set shift_stage2_idct ,   12
+
+.globl ihevc_itrans_recon_4x4_ttype1_av8
+
+.type ihevc_itrans_recon_4x4_ttype1_av8, %function
+
+ihevc_itrans_recon_4x4_ttype1_av8:
+
+    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    add         x4,x4,x4                    // src_strd in terms of word16
+
+    mov         x8,#29
+    mov         x9,#55
+    mov         x10,#74
+    mov         x11,#84
+    mov         v4.4h[0], w8
+    ld1         {v0.4h},[x0],x4             //loading pi2_src 1st row
+    mov         v4.4h[1], w9
+    ld1         {v1.4h},[x0],x4             //loading pi2_src 2nd row
+    mov         v4.4h[2], w10
+    ld1         {v2.4h},[x0],x4             //loading pi2_src 3rd row
+    mov         v4.4h[3], w11
+    ld1         {v3.4h},[x0],x4             //loading pi2_src 4th row
+
+    // first stage computation starts
+    smull       v6.4s, v1.4h, v4.4h[2]      //74 * pi2_src[1]
+    smlal       v6.4s, v0.4h, v4.4h[0]      //74 * pi2_src[1] + 29 * pi2_src[0]
+    smlal       v6.4s, v3.4h, v4.4h[1]      //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
+    smlal       v6.4s, v2.4h, v4.4h[3]      //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
+
+    smull       v8.4s, v1.4h, v4.4h[2]      //74 * pi2_src[1]
+    smlal       v8.4s, v0.4h, v4.4h[1]      //74 * pi2_src[1] + 55 * pi2_src[0]
+    smlsl       v8.4s, v2.4h, v4.4h[0]      //74 * pi2_src[1] + 55 * pi2_src[0] -  29 * pi2_src[2]
+    smlsl       v8.4s, v3.4h, v4.4h[3]      //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] -  29 * pi2_src[2] - 84 * pi2_src[3])
+
+    smull       v10.4s, v0.4h, v4.4h[2]     // 74 * pi2_src[0]
+    smlsl       v10.4s, v2.4h, v4.4h[2]     // 74 * pi2_src[0] - 74 * pi2_src[2]
+    smlal       v10.4s, v3.4h, v4.4h[2]     //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+
+    smull       v12.4s, v2.4h, v4.4h[1]     // 55 * pi2_src[2]
+    smlsl       v12.4s, v1.4h, v4.4h[2]     // 55 * pi2_src[2] - 74 * pi2_src[1]
+    smlsl       v12.4s, v3.4h, v4.4h[0]     // - 74 * pi2_src[1] +   55 * pi2_src[2]    - 29 * pi2_src[3]
+    smlal       v12.4s, v0.4h, v4.4h[3]     //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+
+    sqrshrn     v28.4h, v6.4s,#shift_stage1_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct
+    sqrshrn     v29.4h, v8.4s,#shift_stage1_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct
+    sqrshrn     v30.4h, v10.4s,#shift_stage1_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct
+    sqrshrn     v31.4h, v12.4s,#shift_stage1_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct
+    ld1         {v18.s}[0],[x2],x5
+
+    trn1        v24.4h, v28.4h, v29.4h
+    trn2        v25.4h, v28.4h, v29.4h
+    trn1        v26.4h, v30.4h, v31.4h
+    trn2        v27.4h, v30.4h, v31.4h
+    trn1        v14.2s, v24.2s, v26.2s
+    trn2        v16.2s, v24.2s, v26.2s
+    trn1        v15.2s, v25.2s, v27.2s
+    trn2        v17.2s, v25.2s, v27.2s
+    // output in d14,d15,d16,d17
+    // first stage computation ends
+
+    // second stage computation starts: same computation as the 1st stage
+    // with the following register changes
+    // d14 - d0
+    // d15 - d1
+    // d16 - d2
+    // d17 - d3
+    ld1         {v18.s}[1],[x2],x5
+    smull       v6.4s, v15.4h, v4.4h[2]     //74 * pi2_src[1]
+    smlal       v6.4s, v14.4h, v4.4h[0]     //74 * pi2_src[1] + 29 * pi2_src[0]
+    smlal       v6.4s, v17.4h, v4.4h[1]     //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
+    smlal       v6.4s, v16.4h, v4.4h[3]     //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
+
+    smull       v8.4s, v15.4h, v4.4h[2]     //74 * pi2_src[1]
+    smlal       v8.4s, v14.4h, v4.4h[1]     //74 * pi2_src[1] + 55 * pi2_src[0]
+    smlsl       v8.4s, v16.4h, v4.4h[0]     //74 * pi2_src[1] + 55 * pi2_src[0] -  29 * pi2_src[2]
+    smlsl       v8.4s, v17.4h, v4.4h[3]     //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] -  29 * pi2_src[2] - 84 * pi2_src[3])
+
+    smull       v10.4s, v14.4h, v4.4h[2]    // 74 * pi2_src[0]
+    smlsl       v10.4s, v16.4h, v4.4h[2]    // 74 * pi2_src[0] - 74 * pi2_src[2]
+    smlal       v10.4s, v17.4h, v4.4h[2]    //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+    ld1         {v19.s}[0],[x2],x5
+
+    smull       v12.4s, v16.4h, v4.4h[1]    // 55 * pi2_src[2]
+    smlsl       v12.4s, v15.4h, v4.4h[2]    //  - 74 * pi2_src[1] +   55 * pi2_src[2]
+    smlsl       v12.4s, v17.4h, v4.4h[0]    // - 74 * pi2_src[1] +   55 * pi2_src[2]    - 29 * pi2_src[3]
+    smlal       v12.4s, v14.4h, v4.4h[3]    //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+
+    sqrshrn     v28.4h, v6.4s,#shift_stage2_idct // (pi2_out[0] + rounding ) >> shift_stage2_idct
+    sqrshrn     v29.4h, v8.4s,#shift_stage2_idct // (pi2_out[1] + rounding ) >> shift_stage2_idct
+    sqrshrn     v30.4h, v10.4s,#shift_stage2_idct // (pi2_out[2] + rounding ) >> shift_stage2_idct
+    sqrshrn     v31.4h, v12.4s,#shift_stage2_idct // (pi2_out[3] + rounding ) >> shift_stage2_idct
+    ld1         {v19.s}[1],[x2],x5
+    trn1        v24.4h, v28.4h, v29.4h
+    trn2        v25.4h, v28.4h, v29.4h
+    trn1        v26.4h, v30.4h, v31.4h
+    trn2        v27.4h, v30.4h, v31.4h
+    trn1        v0.2s, v24.2s, v26.2s
+    trn2        v2.2s, v24.2s, v26.2s
+    trn1        v1.2s, v25.2s, v27.2s
+    trn2        v3.2s, v25.2s, v27.2s
+    // output in d0,d1,d2,d3
+    // second stage computation ends
+
+    // loading pred
+    mov         v0.d[1],v1.d[0]
+    mov         v2.d[1],v3.d[0]
+
+    uaddw       v0.8h,  v0.8h ,  v18.8b     // pi2_out(16bit) + pu1_pred(8bit)
+    sqxtun      v0.8b, v0.8h                // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+    uaddw       v2.8h,  v2.8h ,  v19.8b     // pi2_out(16bit) + pu1_pred(8bit)
+    sqxtun      v1.8b, v2.8h                // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+
+    // storing destination
+    st1         {v0.s}[0],[x3],x6
+    st1         {v0.s}[1],[x3],x6
+    st1         {v1.s}[0],[x3],x6
+    st1         {v1.s}[1],[x3],x6
+
+    // ldmfd sp!,{x4-x12,x15}            //reload the registers from sp
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_itrans_recon_8x8.s b/common/arm64/ihevc_itrans_recon_8x8.s
new file mode 100644
index 0000000..332677e
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_8x8.s
@@ -0,0 +1,1038 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// *  ihevc_itrans_recon_8x8.s
+// *
+// * @brief
+// *  contains function definitions for single stage  inverse transform
+// *
+// * @author
+// *  anand s
+// *
+// * @par list of functions:
+// *  - ihevc_itrans_recon_8x8()
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+//*/
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// *  this function performs inverse transform  and reconstruction for 8x8
+// * input block
+// *
+// * @par description:
+// *  performs inverse transform and adds the prediction  data and clips output
+// * to 8 bit
+// *
+// * @param[in] pi2_src
+// *  input 8x8 coefficients
+// *
+// * @param[in] pi2_tmp
+// *  temporary 8x8 buffer for storing inverse
+// *  transform 1st stage output
+// *
+// * @param[in] pu1_pred
+// *  prediction 8x8 block
+// *
+// * @param[out] pu1_dst
+// *  output 8x8 block
+// *
+// * @param[in] src_strd
+// *  input stride
+// *
+// * @param[in] pred_strd
+// *  prediction stride
+// *
+// * @param[in] dst_strd
+// *  output stride
+// *
+// * @param[in] zero_cols
+// *  zero columns in pi2_src
+// *
+// * @param[in] zero_rows
+// *  zero rows in pi2_src
+// *
+// * @returns  void
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+// */
+
+//void ihevc_itrans_recon_8x8(word16 *pi2_src,
+//                            word16 *pi2_tmp,
+//                            uword8 *pu1_pred,
+//                            uword8 *pu1_dst,
+//                            word32 src_strd,
+//                            word32 pred_strd,
+//                            word32 dst_strd,
+//                            word32 zero_cols,
+//                            word32 zero_rows)
+
+//**************variables vs registers*************************
+//    x0 => *pi2_src
+//    x1 => *pi2_tmp
+//    x2 => *pu1_pred
+//    x3 => *pu1_dst
+//    src_strd
+//    pred_strd
+//    dst_strd
+//    zero_cols
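+
+//    per-column reference for one 8-point pass (a c sketch; the cos/sin
+//    constants come from g_ai2_ihevc_trans_8_transpose -- for hevc these are
+//    64 and 83/36 in the even part, 89/75/50/18 in the odd part):
+//        c0 = 64*(y0 + y4);    c1 = 64*(y0 - y4);
+//        d0 = 83*y2 + 36*y6;   d1 = 36*y2 - 83*y6;
+//        a0 = c0 + d0;  a1 = c1 + d1;  a2 = c1 - d1;  a3 = c0 - d0;
+//        b0 = 89*y1 + 75*y3 + 50*y5 + 18*y7;
+//        b1 = 75*y1 - 18*y3 - 89*y5 - 50*y7;
+//        b2 = 50*y1 - 89*y3 + 18*y5 + 75*y7;
+//        b3 = 18*y1 - 50*y3 + 75*y5 - 89*y7;
+//        x[k]   = clip_s16((a[k] + b[k] + rnd) >> shift);    // k = 0..3
+//        x[7-k] = clip_s16((a[k] - b[k] + rnd) >> shift);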
+
+
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.set width_x_size_x5 ,   40
+.set width_x_size_x2 ,   32
+.set shift_stage1_idct ,   7
+.set shift_stage2_idct ,   12
+
+.globl ihevc_itrans_recon_8x8_av8
+
+.extern g_ai2_ihevc_trans_8_transpose
+
+.type ihevc_itrans_recon_8x8_av8, %function
+
+ihevc_itrans_recon_8x8_av8:
+//// register usage - loading and until idct of columns
+////    cosine constants     -     d0
+////    sine constants         -     d1
+////    row 0 first half     -     d2        -    y0
+////    row 1 first half     -     d6        -    y1
+////    row 2 first half     -     d3        -    y2
+////    row 3 first half     -     d7        -    y3
+////    row 4 first half     -     d10        -    y4
+////    row 5 first half     -     d14        -    y5
+////    row 6 first half     -     d11        -    y6
+////    row 7 first half     -     d15        -    y7
+
+////    row 0 second half    -     d4        -    y0
+////    row 1 second half    -     d8      -    y1
+////    row 2 second half    -     d5      -    y2
+////    row 3 second half    -     d9      -    y3
+////    row 4 second half    -     d12     -    y4
+////    row 5 second half    -     d16     -    y5
+////    row 6 second half    -     d13     -    y6
+////    row 7 second half    -     d17     -    y7
+
+    //// copy the input pointer to another register
+    //// step 1 : load all constants
+    // stmfd sp!,{x4-x12,x14}
+
+    ldr         w11, [sp]                   // zero rows
+
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x12, x7 // zero columns
+    mov         x8, x5 // prediction stride
+    mov         x7, x6 // destination stride
+    mov         x6, x4 // src stride
+    lsl         x6, x6, #1                  // x sizeof(word16)
+    add         x9,x0,x6, lsl #1            // 2 rows
+
+    add         x10,x6,x6, lsl #1           // 3 rows
+
+    sub         x10,x10, #8                 // - 4 cols * sizeof(word16)
+    sub         x5,x6, #8                   // src_strd - 4 cols * sizeof(word16)
+
+    adrp        x14, :got:g_ai2_ihevc_trans_8_transpose
+    ldr         x14, [x14, #:got_lo12:g_ai2_ihevc_trans_8_transpose]
+
+    ld1         {v0.4h, v1.4h},[x14]        ////d0,d1 are used for storing the constant data
+
+    ////step 2 load all the input data
+    ////step 3 operate first 4 columns at a time
+
+    and         x11,x11,#0xff
+    and         x12,x12,#0xff
+
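+    //// each bit in the low byte of zero_rows flags a known-zero input row;
+    //// when it is >= 0xf0 (rows 4-7 all zero) the loads and multiplies for
+    //// the last four rows are skipped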
+    cmp         x11,#0xf0
+    bge         skip_last4_rows
+
+
+    ld1         {v2.4h},[x0],#8
+    ld1         {v3.4h},[x9],#8
+    ld1         {v4.4h},[x0],x5
+    smull       v20.4s, v2.4h, v0.4h[0]     //// y0 * cos4(part of c0 and c1)
+    ld1         {v5.4h},[x9],x5
+    smull       v18.4s, v3.4h, v1.4h[2]     //// y2 * sin2 (q3 is freed by this time)(part of d1)
+    ld1         {v6.4h},[x0],#8
+    ld1         {v7.4h},[x9],#8
+    smull       v24.4s, v6.4h, v0.4h[1]     //// y1 * cos1(part of b0)
+    ld1         {v8.4h},[x0],x10
+    smull       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
+    ld1         {v9.4h},[x9],x10
+    smull       v28.4s, v6.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    ld1         {v10.4h},[x0],#8
+    smull       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+    ld1         {v11.4h},[x9],#8
+    smlal       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    ld1         {v12.4h},[x0],x5
+    smlsl       v26.4s, v7.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    ld1         {v13.4h},[x9],x5
+    smlsl       v28.4s, v7.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    ld1         {v14.4h},[x0],#8
+    smlsl       v30.4s, v7.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+    ld1         {v15.4h},[x9],#8
+    smull       v22.4s, v10.4h, v0.4h[0]    //// y4 * cos4(part of c0 and c1)
+    ld1         {v16.4h},[x0],x10
+    smull       v6.4s, v3.4h, v0.4h[2]      //// y2 * cos2(part of d0)
+    ld1         {v17.4h},[x9],x10
+
+    ///* the following loads were used when input alignment is not guaranteed */
+////    vld1.16        d2,[x0]!
+////    vld1.16        d3,[x2]!
+////    vld1.16        d4,[x0]!
+////    vld1.16        d5,[x2]!
+////    vld1.16        d6,[x0]!
+////    vld1.16        d7,[x2]!
+////    vld1.16        d8,[x0],x3
+////    vld1.16        d9,[x2],x3
+////    vld1.16        d10,[x0]!
+////    vld1.16        d11,[x2]!
+////    vld1.16        d12,[x0]!
+////    vld1.16        d13,[x2]!
+////    vld1.16        d14,[x0]!
+////    vld1.16        d15,[x2]!
+////    vld1.16        d16,[x0],x3
+////    vld1.16        d17,[x2],x3
+
+
+
+
+    smlal       v24.4s, v14.4h, v1.4h[1]    //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+    smlsl       v26.4s, v14.4h, v0.4h[1]    //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+    smlal       v28.4s, v14.4h, v1.4h[3]    //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+    smlal       v30.4s, v14.4h, v0.4h[3]    //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+    smlsl       v18.4s, v11.4h, v0.4h[2]    //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+    smlal       v6.4s, v11.4h, v1.4h[2]     //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+    add         v10.4s,  v20.4s ,  v22.4s   //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+    sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+    smlal       v24.4s, v15.4h, v1.4h[3]    //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+    smlsl       v26.4s, v15.4h, v1.4h[1]    //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+    smlal       v28.4s, v15.4h, v0.4h[3]    //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+    smlsl       v30.4s, v15.4h, v0.4h[1]    //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+
+    add         v14.4s,  v10.4s ,  v6.4s    ////    a0 = c0 + d0(part of x0,x7)
+    sub         v10.4s,  v10.4s ,  v6.4s    //// a3 = c0 - d0(part of x3,x4)
+    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
+    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
+
+    add         v20.4s,  v14.4s ,  v24.4s   //// a0 + b0(part of x0)
+    sub         v6.4s,  v14.4s ,  v24.4s    //// a0 - b0(part of x7)
+
+    add         v24.4s,  v22.4s ,  v28.4s   //// a2 + b2(part of x2)
+    sub         v22.4s,  v22.4s ,  v28.4s   //// a2 - b2(part of x5)
+
+    add         v28.4s,  v18.4s ,  v26.4s   //// a1 + b1(part of x1)
+    sub         v18.4s,  v18.4s ,  v26.4s   //// a1 - b1(part of x6)
+
+    add         v26.4s,  v10.4s ,  v30.4s   //// a3 + b3(part of x3)
+    sub         v30.4s,  v10.4s ,  v30.4s   //// a3 - b3(part of x4)
+
+    sqrshrn     v2.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v15.4h, v6.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v3.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v14.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v6.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v11.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v7.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v10.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
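+
+    //// Per 16-bit lane, each sqrshrn above computes (scalar sketch):
+    ////     x[k] = sat16((sum[k] + (1 << (shift - 1))) >> shift)
+    //// where sum is the a +/- b value formed above and
+    //// shift = shift_stage1_idct = 7, i.e. a rounding right shift with
+    //// saturation to the signed 16-bit range.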
+
+
+    b           last4_cols
+
+
+
+skip_last4_rows:
+
+
+
+    ld1         {v2.4h},[x0],#8
+    ld1         {v3.4h},[x9],#8
+    ld1         {v4.4h},[x0],x5
+    ld1         {v5.4h},[x9],x5
+    ld1         {v6.4h},[x0],#8
+    ld1         {v7.4h},[x9],#8
+    ld1         {v8.4h},[x0],x10
+    ld1         {v9.4h},[x9],x10
+
+
+
+    movi        v12.4h, #0
+    movi        v13.4h, #0
+    movi        v16.4h, #0
+    movi        v17.4h, #0
+
+
+
+
+    smull       v24.4s, v6.4h, v0.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v6.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v7.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v7.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v7.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smull       v18.4s, v3.4h, v1.4h[2]     //// y2 * sin2 (q3 is freed by this time)(part of d1)
+    smull       v6.4s, v3.4h, v0.4h[2]      //// y2 * cos2(part of d0)
+
+    smull       v20.4s, v2.4h, v0.4h[0]     //// y0 * cos4(part of c0 and c1)
+
+
+    add         v14.4s,  v20.4s ,  v6.4s    ////    a0 = c0 + d0(part of x0,x7)
+    sub         v10.4s,  v20.4s ,  v6.4s    //// a3 = c0 - d0(part of x3,x4)
+    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
+    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
+
+    add         v20.4s,  v14.4s ,  v24.4s   //// a0 + b0(part of x0)
+    sub         v6.4s,  v14.4s ,  v24.4s    //// a0 - b0(part of x7)
+
+    add         v24.4s,  v22.4s ,  v28.4s   //// a2 + b2(part of x2)
+    sub         v22.4s,  v22.4s ,  v28.4s   //// a2 - b2(part of x5)
+
+    add         v28.4s,  v18.4s ,  v26.4s   //// a1 + b1(part of x1)
+    sub         v18.4s,  v18.4s ,  v26.4s   //// a1 - b1(part of x6)
+
+    add         v26.4s,  v10.4s ,  v30.4s   //// a3 + b3(part of x3)
+    sub         v30.4s,  v10.4s ,  v30.4s   //// a3 - b3(part of x4)
+
+    sqrshrn     v2.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v15.4h, v6.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v3.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v14.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v6.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v11.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v7.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v10.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+last4_cols:
+
+
+    cmp         x12,#0xf0
+    bge         skip_last4cols
+
+    smull       v24.4s, v8.4h, v0.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v8.4h, v0.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v8.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v8.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v9.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v9.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v9.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v9.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smull       v18.4s, v5.4h, v1.4h[2]     //// y2 * sin2 (q4 is freed by this time)(part of d1)
+    smull       v8.4s, v5.4h, v0.4h[2]      //// y2 * cos2(part of d0)
+
+    smull       v20.4s, v4.4h, v0.4h[0]     //// y0 * cos4(part of c0 and c1)
+    smull       v22.4s, v12.4h, v0.4h[0]    //// y4 * cos4(part of c0 and c1)
+
+    smlal       v24.4s, v16.4h, v1.4h[1]    //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+    smlsl       v26.4s, v16.4h, v0.4h[1]    //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+    smlal       v28.4s, v16.4h, v1.4h[3]    //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+    smlal       v30.4s, v16.4h, v0.4h[3]    //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+    smlsl       v18.4s, v13.4h, v0.4h[2]    //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+    smlal       v8.4s, v13.4h, v1.4h[2]     //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+    add         v12.4s,  v20.4s ,  v22.4s   //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+    sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+    smlal       v24.4s, v17.4h, v1.4h[3]    //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
+    smlsl       v26.4s, v17.4h, v1.4h[1]    //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
+    smlal       v28.4s, v17.4h, v0.4h[3]    //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
+    smlsl       v30.4s, v17.4h, v0.4h[1]    //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
+
+    add         v16.4s,  v12.4s ,  v8.4s    ////    a0 = c0 + d0(part of e0,e7)
+    sub         v12.4s,  v12.4s ,  v8.4s    //// a3 = c0 - d0(part of e3,e4)
+    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of e2,e5)
+    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of e1,e6)
+
+    add         v20.4s,  v16.4s ,  v24.4s   //// a0 + b0(part of e0)
+    sub         v8.4s,  v16.4s ,  v24.4s    //// a0 - b0(part of e7)
+
+    add         v24.4s,  v22.4s ,  v28.4s   //// a2 + b2(part of e2)
+    sub         v22.4s,  v22.4s ,  v28.4s   //// a2 - b2(part of e5)
+
+    add         v28.4s,  v18.4s ,  v26.4s   //// a1 + b1(part of e1)
+    sub         v18.4s,  v18.4s ,  v26.4s   //// a1 - b1(part of e6)
+
+    add         v26.4s,  v12.4s ,  v30.4s   //// a3 + b3(part of e3)
+    sub         v30.4s,  v12.4s ,  v30.4s   //// a3 - b3(part of e4)
+
+    sqrshrn     v4.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v17.4h, v8.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v5.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v16.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v8.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v13.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v9.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn     v12.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+    b           end_skip_last4cols
+
+
+
+skip_last4cols:
+
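+    // v25 holds live data across this block, so park its low half in
+    // x15 while v25/v27/v29/v31 are used as transpose scratch; it is
+    // restored from x15 after the transposes below.
+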
+    umov        x15,v25.d[0]
+
+    trn1        v25.4h, v2.4h, v6.4h
+    trn2        v29.4h, v2.4h, v6.4h        ////[x3,x1],[x2,x0] first quadrant transposing
+
+    trn1        v27.4h, v3.4h, v7.4h
+    trn2        v31.4h, v3.4h, v7.4h        ////[x3,x1],[x2,x0] first quadrant transposing
+
+    trn1        v6.2s, v29.2s, v31.2s
+    trn2        v7.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
+    trn1        v2.2s, v25.2s, v27.2s
+    trn2        v3.2s, v25.2s, v27.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
+
+
+    trn1        v25.4h, v10.4h, v14.4h
+    trn2        v29.4h, v10.4h, v14.4h      ////[x7,x5],[x6,x4] third quadrant transposing
+
+    trn1        v27.4h, v11.4h, v15.4h
+    trn2        v31.4h, v11.4h, v15.4h      ////[x7,x5],[x6,x4] third quadrant transposing
+
+    trn1        v10.2s, v25.2s, v27.2s
+    trn2        v11.2s, v25.2s, v27.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
+    trn1        v14.2s, v29.2s, v31.2s
+    trn2        v15.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
+
+    mov         v25.d[0],x15
+
+    smull       v24.4s, v6.4h, v0.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v6.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v7.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v7.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v7.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smull       v20.4s, v2.4h, v0.4h[0]     //// y0 * cos4(part of c0 and c1)
+//    vmull.s16    q11,d4,d0[0]                    @// y4 * cos4(part of c0 and c1)
+
+    smull       v18.4s, v3.4h, v1.4h[2]     //// y2 * sin2 (q3 is freed by this time)(part of d1)
+    smull       v6.4s, v3.4h, v0.4h[2]      //// y2 * cos2(part of d0)
+
+
+
+
+    sub         v22.4s,  v20.4s ,  v6.4s    //// a3 = c0 - d0(part of x3,x4)
+    add         v4.4s,  v20.4s ,  v6.4s     ////    a0 = c0 + d0(part of x0,x7)
+
+
+    add         v2.4s,  v4.4s ,  v24.4s
+
+    sub         v6.4s,  v4.4s ,  v24.4s
+
+    add         v8.4s,  v22.4s ,  v30.4s
+
+    sub         v24.4s,  v22.4s ,  v30.4s
+
+    sqrshrn     v5.4h, v8.4s,#shift_stage2_idct
+    sqrshrn     v2.4h, v2.4s,#shift_stage2_idct
+    sqrshrn     v9.4h, v6.4s,#shift_stage2_idct
+    sqrshrn     v6.4h, v24.4s,#shift_stage2_idct
+
+    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
+    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
+
+
+    add         v30.4s,  v22.4s ,  v28.4s
+
+    sub         v24.4s,  v22.4s ,  v28.4s
+
+    add         v28.4s,  v18.4s ,  v26.4s
+
+    sub         v22.4s,  v18.4s ,  v26.4s
+    sqrshrn     v4.4h, v30.4s,#shift_stage2_idct
+    sqrshrn     v7.4h, v24.4s,#shift_stage2_idct
+    sqrshrn     v3.4h, v28.4s,#shift_stage2_idct
+    sqrshrn     v8.4h, v22.4s,#shift_stage2_idct
+
+
+
+    umov        x19,v25.d[0]
+    umov        x20,v25.d[1]
+
+    trn1        v27.4h, v2.4h, v3.4h
+    trn2        v29.4h, v2.4h, v3.4h
+    trn1        v25.4h, v4.4h, v5.4h
+    trn2        v31.4h, v4.4h, v5.4h
+
+    trn1        v2.2s, v27.2s, v25.2s
+    trn2        v4.2s, v27.2s, v25.2s
+    trn1        v3.2s, v29.2s, v31.2s
+    trn2        v5.2s, v29.2s, v31.2s
+
+    trn1        v27.4h, v6.4h, v7.4h
+    trn2        v29.4h, v6.4h, v7.4h
+    trn1        v25.4h, v8.4h, v9.4h
+    trn2        v31.4h, v8.4h, v9.4h
+
+    trn1        v6.2s, v27.2s, v25.2s
+    trn2        v8.2s, v27.2s, v25.2s
+    trn1        v7.2s, v29.2s, v31.2s
+    trn2        v9.2s, v29.2s, v31.2s
+
+    mov         v25.d[0],x19
+    mov         v25.d[1],x20
+
+    smull       v24.4s, v14.4h, v0.4h[1]    //// y1 * cos1(part of b0)
+
+    smull       v26.4s, v14.4h, v0.4h[3]    //// y1 * cos3(part of b1)
+    smull       v28.4s, v14.4h, v1.4h[1]    //// y1 * sin3(part of b2)
+    smull       v30.4s, v14.4h, v1.4h[3]    //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v15.4h, v0.4h[3]    //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v15.4h, v1.4h[3]    //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v15.4h, v0.4h[1]    //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v15.4h, v1.4h[1]    //// y1 * sin1 - y3 * sin3(part of b3)
+    smull       v20.4s, v10.4h, v0.4h[0]    //// y0 * cos4(part of c0 and c1)
+    smull       v18.4s, v11.4h, v1.4h[2]    //// y2 * sin2 (q7 is freed by this time)(part of d1)
+    smull       v14.4s, v11.4h, v0.4h[2]    //// y2 * cos2(part of d0)
+
+
+    add         x4,x2,x8, lsl #1            // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
+
+
+    add         x5,x8,x8, lsl #1            //
+
+
+    add         x0,x3,x7, lsl #1            // x0 points to 3rd row of dest data
+
+
+    add         x10,x7,x7, lsl #1           //
+
+    // swapping v3 and v6
+    mov         v31.d[0], v3.d[0]
+    mov         v3.d[0], v6.d[0]
+    mov         v6.d[0], v31.d[0]
+
+    // swapping v5 and v8
+    mov         v31.d[0], v5.d[0]
+    mov         v5.d[0], v8.d[0]
+    mov         v8.d[0], v31.d[0]
+
+
+    sub         v22.4s,  v20.4s ,  v14.4s   //// a3 = c0 - d0(part of x3,x4)
+    add         v12.4s,  v20.4s ,  v14.4s   ////    a0 = c0 + d0(part of x0,x7)
+
+
+    add         v0.4s,  v12.4s ,  v24.4s
+
+
+    sub         v24.4s,  v12.4s ,  v24.4s
+
+
+    add         v12.4s,  v22.4s ,  v30.4s
+
+
+    sub         v14.4s,  v22.4s ,  v30.4s
+
+    sqrshrn     v10.4h, v0.4s,#shift_stage2_idct
+    sqrshrn     v17.4h, v24.4s,#shift_stage2_idct
+    sqrshrn     v13.4h, v12.4s,#shift_stage2_idct
+    sqrshrn     v14.4h, v14.4s,#shift_stage2_idct
+
+    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
+    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
+
+
+    add         v0.4s,  v22.4s ,  v28.4s
+
+
+    sub         v24.4s,  v22.4s ,  v28.4s
+
+
+    add         v28.4s,  v18.4s ,  v26.4s
+
+
+    sub         v26.4s,  v18.4s ,  v26.4s
+    ld1         {v18.8b},[x2],x8
+
+    sqrshrn     v12.4h, v0.4s,#shift_stage2_idct
+    ld1         {v20.8b},[x2],x5
+
+
+    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct
+    ld1         {v19.8b},[x2],x8
+
+
+
+
+    sqrshrn     v11.4h, v28.4s,#shift_stage2_idct
+    ld1         {v22.8b},[x4],x8
+
+
+
+
+    sqrshrn     v16.4h, v26.4s,#shift_stage2_idct
+    ld1         {v21.8b},[x2],x5
+
+
+    b           pred_buff_addition
+end_skip_last4cols:
+
+
+    umov        x19,v25.d[0]
+    umov        x20,v25.d[1]
+
+///* now that the idct of columns is done, transpose so that the row idct can be done efficiently (step5) */
+    trn1        v27.4h, v2.4h, v6.4h
+    trn2        v29.4h, v2.4h, v6.4h        ////[x3,x1],[x2,x0] first quadrant transposing
+    trn1        v25.4h, v3.4h, v7.4h
+    trn2        v31.4h, v3.4h, v7.4h        ////[x3,x1],[x2,x0] first quadrant transposing
+
+    trn1        v2.2s, v27.2s, v25.2s
+    trn2        v3.2s, v27.2s, v25.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
+    trn1        v6.2s, v29.2s, v31.2s
+    trn2        v7.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
+
+    trn1        v27.4h, v4.4h, v8.4h
+    trn2        v29.4h, v4.4h, v8.4h        ////[x3,x1],[x2,x0] second quadrant transposing
+    trn1        v25.4h, v5.4h, v9.4h
+    trn2        v31.4h, v5.4h, v9.4h        ////[x3,x1],[x2,x0] second quadrant transposing
+
+    trn1        v4.2s, v27.2s, v25.2s
+    trn2        v5.2s, v27.2s, v25.2s       ////x0,x1,x2,x3 second quadrant transposing continued.....
+    trn1        v8.2s, v29.2s, v31.2s
+    trn2        v9.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 second quadrant transposing continued.....
+
+    trn1        v27.4h, v10.4h, v14.4h
+    trn2        v29.4h, v10.4h, v14.4h      ////[x7,x5],[x6,x4] third quadrant transposing
+    trn1        v25.4h, v11.4h, v15.4h
+    trn2        v31.4h, v11.4h, v15.4h      ////[x7,x5],[x6,x4] third quadrant transposing
+
+    trn1        v10.2s, v27.2s, v25.2s
+    trn2        v11.2s, v27.2s, v25.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
+    trn1        v14.2s, v29.2s, v31.2s
+    trn2        v15.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
+
+    trn1        v27.4h, v12.4h, v16.4h
+    trn2        v29.4h, v12.4h, v16.4h      ////[x7,x5],[x6,x4] fourth quadrant transposing
+    trn1        v25.4h, v13.4h, v17.4h
+    trn2        v31.4h, v13.4h, v17.4h      ////[x7,x5],[x6,x4] fourth quadrant transposing
+
+    trn1        v12.2s, v27.2s, v25.2s
+    trn2        v13.2s, v27.2s, v25.2s      ////x4,x5,x6,x7 fourth quadrant transposing continued.....
+    trn1        v16.2s, v29.2s, v31.2s
+    trn2        v17.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 fourth quadrant transposing continued.....
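+
+    //// Each 4x4 transpose above is done in two trn passes: trn1/trn2
+    //// on .4h interleaves 16-bit element pairs, then trn1/trn2 on .2s
+    //// interleaves 32-bit pairs, leaving fully transposed 4x4 tiles.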
+
+    mov         v25.d[0],x19
+    mov         v25.d[1],x20
+
+    ////step6 operate on first four rows and find their idct
+    ////register usage        - storing and idct of rows
+////    cosine constants     -     d0
+////    sine constants         -     d1
+////    element 0 first four     -     d2        -    y0
+////    element 1 first four     -     d6        -    y1
+////    element 2 first four     -     d3        -    y2
+////    element 3 first four     -     d7        -    y3
+////    element 4 first four     -     d4        -    y4
+////    element 5 first four     -     d8        -    y5
+////    element 6 first four     -     d5        -    y6
+////    element 7 first four     -     d9        -    y7
+////    element 0 second four    -     d10        -    y0
+////    element 1 second four    -     d14     -    y1
+////    element 2 second four    -     d11     -    y2
+////    element 3 second four    -     d15     -    y3
+////    element 4 second four    -     d12     -    y4
+////    element 5 second four    -     d16     -    y5
+////    element 6 second four    -     d13     -    y6
+////    element 7 second four    -     d17     -    y7
+
+    //// map between first kernel code seq and current
+////        d2    ->    d2
+////        d6    ->    d6
+////        d3    ->    d3
+////        d7    ->    d7
+////        d10    ->    d4
+////        d14    ->    d8
+////        d11    ->    d5
+////        d15    ->    d9
+////        q3    ->    q3
+////        q5    ->    q2
+////        q7    ->    q4
+
+    smull       v24.4s, v6.4h, v0.4h[1]     //// y1 * cos1(part of b0)
+    smull       v26.4s, v6.4h, v0.4h[3]     //// y1 * cos3(part of b1)
+    smull       v28.4s, v6.4h, v1.4h[1]     //// y1 * sin3(part of b2)
+    smull       v30.4s, v6.4h, v1.4h[3]     //// y1 * sin1(part of b3)
+
+    smlal       v24.4s, v7.4h, v0.4h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v7.4h, v1.4h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v7.4h, v0.4h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v7.4h, v1.4h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smull       v20.4s, v2.4h, v0.4h[0]     //// y0 * cos4(part of c0 and c1)
+    smull       v22.4s, v4.4h, v0.4h[0]     //// y4 * cos4(part of c0 and c1)
+
+    smull       v18.4s, v3.4h, v1.4h[2]     //// y2 * sin2 (q3 is freed by this time)(part of d1)
+    smull       v6.4s, v3.4h, v0.4h[2]      //// y2 * cos2(part of d0)
+
+
+    smlal       v24.4s, v8.4h, v1.4h[1]     //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+    smlsl       v26.4s, v8.4h, v0.4h[1]     //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+    smlal       v28.4s, v8.4h, v1.4h[3]     //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+    smlal       v30.4s, v8.4h, v0.4h[3]     //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+    smlsl       v18.4s, v5.4h, v0.4h[2]     //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+    smlal       v6.4s, v5.4h, v1.4h[2]      //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+    add         v2.4s,  v20.4s ,  v22.4s    //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+    sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+    smlal       v24.4s, v9.4h, v1.4h[3]     //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+    smlsl       v26.4s, v9.4h, v1.4h[1]     //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+    smlal       v28.4s, v9.4h, v0.4h[3]     //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+    smlsl       v30.4s, v9.4h, v0.4h[1]     //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+
+    sub         v22.4s,  v2.4s ,  v6.4s     //// a3 = c0 - d0(part of x3,x4)
+    add         v4.4s,  v2.4s ,  v6.4s      ////    a0 = c0 + d0(part of x0,x7)
+
+
+    add         v2.4s,  v4.4s ,  v24.4s
+
+    sub         v6.4s,  v4.4s ,  v24.4s
+
+    add         v8.4s,  v22.4s ,  v30.4s
+
+    sub         v24.4s,  v22.4s ,  v30.4s
+
+    sqrshrn     v5.4h, v8.4s,#shift_stage2_idct
+    sqrshrn     v2.4h, v2.4s,#shift_stage2_idct
+    sqrshrn     v9.4h, v6.4s,#shift_stage2_idct
+    sqrshrn     v6.4h, v24.4s,#shift_stage2_idct
+
+    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
+    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
+
+
+    add         v30.4s,  v22.4s ,  v28.4s
+
+    sub         v24.4s,  v22.4s ,  v28.4s
+
+    add         v28.4s,  v18.4s ,  v26.4s
+
+    sub         v22.4s,  v18.4s ,  v26.4s
+    sqrshrn     v4.4h, v30.4s,#shift_stage2_idct
+    sqrshrn     v7.4h, v24.4s,#shift_stage2_idct
+    sqrshrn     v3.4h, v28.4s,#shift_stage2_idct
+    sqrshrn     v8.4h, v22.4s,#shift_stage2_idct
+
+
+
+    umov        x19,v25.d[0]
+    umov        x20,v25.d[1]
+
+    trn1        v27.4h, v2.4h, v3.4h
+    trn2        v29.4h, v2.4h, v3.4h
+    trn1        v25.4h, v4.4h, v5.4h
+    trn2        v31.4h, v4.4h, v5.4h
+
+    trn1        v2.2s, v27.2s, v25.2s
+    trn2        v4.2s, v27.2s, v25.2s
+    trn1        v3.2s, v29.2s, v31.2s
+    trn2        v5.2s, v29.2s, v31.2s
+
+    trn1        v27.4h, v6.4h, v7.4h
+    trn2        v29.4h, v6.4h, v7.4h
+    trn1        v25.4h, v8.4h, v9.4h
+    trn2        v31.4h, v8.4h, v9.4h
+
+    trn1        v6.2s, v27.2s, v25.2s
+    trn2        v8.2s, v27.2s, v25.2s
+    trn1        v7.2s, v29.2s, v31.2s
+    trn2        v9.2s, v29.2s, v31.2s
+
+    mov         v25.d[0],x19
+    mov         v25.d[1],x20
+
+
+
+    smull       v24.4s, v14.4h, v0.4h[1]    //// y1 * cos1(part of b0)
+    smull       v26.4s, v14.4h, v0.4h[3]    //// y1 * cos3(part of b1)
+    smull       v28.4s, v14.4h, v1.4h[1]    //// y1 * sin3(part of b2)
+    smull       v30.4s, v14.4h, v1.4h[3]    //// y1 * sin1(part of b3)
+    smlal       v24.4s, v15.4h, v0.4h[3]    //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl       v26.4s, v15.4h, v1.4h[3]    //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl       v28.4s, v15.4h, v0.4h[1]    //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl       v30.4s, v15.4h, v1.4h[1]    //// y1 * sin1 - y3 * sin3(part of b3)
+    smull       v20.4s, v10.4h, v0.4h[0]    //// y0 * cos4(part of c0 and c1)
+    smull       v22.4s, v12.4h, v0.4h[0]    //// y4 * cos4(part of c0 and c1)
+    smull       v18.4s, v11.4h, v1.4h[2]    //// y2 * sin2 (q7 is freed by this time)(part of d1)
+    smull       v14.4s, v11.4h, v0.4h[2]    //// y2 * cos2(part of d0)
+    smlal       v24.4s, v16.4h, v1.4h[1]    //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+
+    add         x4,x2,x8, lsl #1            // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
+    smlsl       v26.4s, v16.4h, v0.4h[1]    //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+
+    add         x5,x8,x8, lsl #1            //
+    smlal       v28.4s, v16.4h, v1.4h[3]    //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+
+    add         x0,x3,x7, lsl #1            // x0 points to 3rd row of dest data
+    smlal       v30.4s, v16.4h, v0.4h[3]    //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+    add         x10,x7,x7, lsl #1           //
+    smlsl       v18.4s, v13.4h, v0.4h[2]    //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+
+
+    smlal       v14.4s, v13.4h, v1.4h[2]    //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+    add         v12.4s,  v20.4s ,  v22.4s   //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+    sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+    smlal       v24.4s, v17.4h, v1.4h[3]    //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+
+    // swapping v3 and v6
+    mov         v31.d[0], v3.d[0]
+    mov         v3.d[0], v6.d[0]
+    mov         v6.d[0], v31.d[0]
+
+    smlsl       v26.4s, v17.4h, v1.4h[1]    //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+    // swapping v5 and v8
+    mov         v31.d[0], v5.d[0]
+    mov         v5.d[0], v8.d[0]
+    mov         v8.d[0], v31.d[0]
+
+    smlal       v28.4s, v17.4h, v0.4h[3]    //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+    smlsl       v30.4s, v17.4h, v0.4h[1]    //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+
+    sub         v22.4s,  v12.4s ,  v14.4s   //// a3 = c0 - d0(part of x3,x4)
+    add         v12.4s,  v12.4s ,  v14.4s   ////    a0 = c0 + d0(part of x0,x7)
+
+
+    add         v0.4s,  v12.4s ,  v24.4s
+
+
+    sub         v24.4s,  v12.4s ,  v24.4s
+
+
+    add         v12.4s,  v22.4s ,  v30.4s
+
+
+    sub         v14.4s,  v22.4s ,  v30.4s
+
+    sqrshrn     v10.4h, v0.4s,#shift_stage2_idct
+    sqrshrn     v17.4h, v24.4s,#shift_stage2_idct
+    sqrshrn     v13.4h, v12.4s,#shift_stage2_idct
+    sqrshrn     v14.4h, v14.4s,#shift_stage2_idct
+
+    sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
+    add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
+
+
+    add         v0.4s,  v22.4s ,  v28.4s
+
+
+    sub         v24.4s,  v22.4s ,  v28.4s
+
+
+    add         v28.4s,  v18.4s ,  v26.4s
+
+
+    sub         v26.4s,  v18.4s ,  v26.4s
+    ld1         {v18.8b},[x2],x8
+
+    sqrshrn     v12.4h, v0.4s,#shift_stage2_idct
+    ld1         {v20.8b},[x2],x5
+
+
+    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct
+    ld1         {v19.8b},[x2],x8
+
+
+
+
+    sqrshrn     v11.4h, v28.4s,#shift_stage2_idct
+    ld1         {v22.8b},[x4],x8
+
+
+
+
+    sqrshrn     v16.4h, v26.4s,#shift_stage2_idct
+    ld1         {v21.8b},[x2],x5
+
+
+
+
+pred_buff_addition:
+
+    umov        x19,v25.d[0]
+    umov        x20,v25.d[1]
+
+    trn1        v27.4h, v10.4h, v11.4h
+    trn2        v29.4h, v10.4h, v11.4h
+    trn1        v25.4h, v12.4h, v13.4h
+    trn2        v31.4h, v12.4h, v13.4h
+
+    trn1        v10.2s, v27.2s, v25.2s
+    trn2        v12.2s, v27.2s, v25.2s
+    trn1        v11.2s, v29.2s, v31.2s
+    trn2        v13.2s, v29.2s, v31.2s
+
+    trn1        v27.4h, v14.4h, v15.4h
+    trn2        v29.4h, v14.4h, v15.4h
+    trn1        v25.4h, v16.4h, v17.4h
+    trn2        v31.4h, v16.4h, v17.4h
+
+    trn1        v14.2s, v27.2s, v25.2s
+    trn2        v16.2s, v27.2s, v25.2s
+    trn1        v15.2s, v29.2s, v31.2s
+    trn2        v17.2s, v29.2s, v31.2s
+
+
+    mov         v25.d[0],x19
+    mov         v25.d[1],x20
+
+
+    ld1         {v24.8b},[x4],x5
+    ld1         {v23.8b},[x4],x8
+    ld1         {v25.8b},[x4],x5
+    mov         v2.d[1], v3.d[0]
+    mov         v4.d[1], v5.d[0]
+    mov         v6.d[1], v7.d[0]
+    mov         v8.d[1], v9.d[0]
+    uaddw       v2.8h,  v2.8h ,  v18.8b
+    uaddw       v4.8h,  v4.8h ,  v22.8b
+    uaddw       v6.8h,  v6.8h ,  v20.8b
+    uaddw       v8.8h,  v8.8h ,  v24.8b
+
+    // swapping v11 and v14
+    mov         v31.d[0], v11.d[0]
+    mov         v11.d[0], v14.d[0]
+    mov         v14.d[0], v31.d[0]
+
+    // swapping v13 and v16
+    mov         v31.d[0], v13.d[0]
+    mov         v13.d[0], v16.d[0]
+    mov         v16.d[0], v31.d[0]
+// row values stored in the q registers (arm32 names kept from the original code):
+
+//q1 :x0
+//q3: x1
+//q2: x2
+//q4: x3
+//q5: x4
+//q7: x5
+//q6: x6
+//q8: x7
+
+
+    ///// adding the prediction buffer: load the prediction data and
+    ///// add the recon (residue) to it
+
+    mov         v10.d[1], v11.d[0]
+    mov         v12.d[1], v13.d[0]
+    mov         v14.d[1], v15.d[0]
+    mov         v16.d[1], v17.d[0]
+    uaddw       v10.8h,  v10.8h ,  v19.8b
+    sqxtun      v2.8b, v2.8h
+    uaddw       v14.8h,  v14.8h ,  v21.8b
+    sqxtun      v4.8b, v4.8h
+    uaddw       v12.8h,  v12.8h ,  v23.8b
+    sqxtun      v6.8b, v6.8h
+    uaddw       v16.8h,  v16.8h ,  v25.8b
+    sqxtun      v8.8b, v8.8h
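+
+    //// Per lane, each uaddw/sqxtun pair above computes (scalar sketch):
+    ////     recon = clip_u8(resi_16bit + pred_8bit)
+    //// i.e. widen the 8-bit prediction, add the 16-bit residue, then
+    //// saturating-narrow the sum back to unsigned 8 bits.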
+
+
+
+
+
+
+
+    st1         {v2.8b},[x3],x7
+    sqxtun      v10.8b, v10.8h
+    st1         {v6.8b},[x3],x10
+    sqxtun      v14.8b, v14.8h
+    st1         {v4.8b},[x0],x7
+    sqxtun      v12.8b, v12.8h
+    st1         {v8.8b},[x0],x10
+    sqxtun      v16.8b, v16.8h
+
+
+
+
+
+
+
+    st1         {v10.8b},[x3],x7
+    st1         {v14.8b},[x3],x10
+    st1         {v12.8b},[x0],x7
+    st1         {v16.8b},[x0],x10
+
+
+
+
+    // ldmfd sp!,{x4-x12,pc}
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
diff --git a/common/arm64/ihevc_mem_fns.s b/common/arm64/ihevc_mem_fns.s
new file mode 100644
index 0000000..6619c6c
--- /dev/null
+++ b/common/arm64/ihevc_mem_fns.s
@@ -0,0 +1,280 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// *  ihevc_mem_fns_neon.s
+// *
+// * @brief
+// *  Contains function definitions for memory manipulation
+// *
+// * @author
+// *     Naveen SR
+// *
+// * @par List of Functions:
+// *  - ihevc_memcpy()
+// *  - ihevc_memset_mul_8()
+// *  - ihevc_memset_16bit_mul_8()
+// *
+// * @remarks
+// *  None
+// *
+// *******************************************************************************
+//*/
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*   memcpy of a 1d array
+//*
+//* @par Description:
+//*   Does memcpy of 8-bit data from source to destination, for a
+//*   multiple of 8 bytes (8, 16 or 32)
+//*
+//* @param[in] pu1_dst
+//*  UWORD8 pointer to the destination
+//*
+//* @param[in] pu1_src
+//*  UWORD8 pointer to the source
+//*
+//* @param[in] num_bytes
+//*  number of bytes to copy
+//* @returns
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_memcpy_mul_8(UWORD8 *pu1_dst,
+//                      UWORD8 *pu1_src,
+//                      UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+//    x0 => *pu1_dst
+//    x1 => *pu1_src
+//    x2 => num_bytes
+
+.text
+.p2align 2
+
+
+    .global ihevc_memcpy_mul_8_av8
+.type ihevc_memcpy_mul_8_av8, %function
+
+ihevc_memcpy_mul_8_av8:
+
+LOOP_NEON_MEMCPY_MUL_8:
+    // Memcpy 8 bytes
+    LD1         {v0.8b},[x1],#8
+    ST1         {v0.8b},[x0],#8
+
+    SUBS        x2,x2,#8
+    BNE         LOOP_NEON_MEMCPY_MUL_8
+    ret
+
+
+
+//*******************************************************************************
+//*/
+//void ihevc_memcpy(UWORD8 *pu1_dst,
+//                  UWORD8 *pu1_src,
+//                  UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+//    x0 => *pu1_dst
+//    x1 => *pu1_src
+//    x2 => num_bytes
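+//
+// A hedged C reference sketch of the behaviour implemented below
+// (8-byte NEON copies with a scalar byte-wise tail); the name
+// ihevc_memcpy_ref is hypothetical and num_bytes is widened to
+// WORD32 for clarity:
+//
+//    void ihevc_memcpy_ref(UWORD8 *pu1_dst, UWORD8 *pu1_src, WORD32 num_bytes)
+//    {
+//        while(num_bytes >= 8)          /* vector path: 8 bytes/iter */
+//        {
+//            memcpy(pu1_dst, pu1_src, 8);
+//            pu1_dst += 8; pu1_src += 8; num_bytes -= 8;
+//        }
+//        while(num_bytes-- > 0)         /* scalar byte-wise tail */
+//            *pu1_dst++ = *pu1_src++;
+//    }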
+
+
+
+    .global ihevc_memcpy_av8
+.type ihevc_memcpy_av8, %function
+
+ihevc_memcpy_av8:
+    SUBS        x2,x2,#8
+    BLT         ARM_MEMCPY
+LOOP_NEON_MEMCPY:
+    // Memcpy 8 bytes
+    LD1         {v0.8b},[x1],#8
+    ST1         {v0.8b},[x0],#8
+
+    SUBS        x2,x2,#8
+    BGE         LOOP_NEON_MEMCPY
+    CMP         x2,#-8
+    BEQ         MEMCPY_RETURN
+
+ARM_MEMCPY:
+    ADD         x2,x2,#8
+
+LOOP_ARM_MEMCPY:
+    LDRB        w3,[x1],#1
+    STRB        w3,[x0],#1
+    SUBS        x2,x2,#1
+    BNE         LOOP_ARM_MEMCPY
+MEMCPY_RETURN:
+    ret
+
+
+
+
+//void ihevc_memset_mul_8(UWORD8 *pu1_dst,
+//                       UWORD8 value,
+//                       UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+//    x0 => *pu1_dst
+//    x1 => value
+//    x2 => num_bytes
+
+.text
+.p2align 2
+
+
+
+    .global ihevc_memset_mul_8_av8
+.type ihevc_memset_mul_8_av8, %function
+
+ihevc_memset_mul_8_av8:
+
+// Assumptions: num_bytes is a multiple of 8 (8, 16 or 32)
+    dup         v0.8b,w1
+LOOP_MEMSET_MUL_8:
+    // Memset 8 bytes
+    ST1         {v0.8b},[x0],#8
+
+    SUBS        x2,x2,#8
+    BNE         LOOP_MEMSET_MUL_8
+
+    ret
+
+
+
+
+//void ihevc_memset(UWORD8 *pu1_dst,
+//                       UWORD8 value,
+//                       UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+//    x0 => *pu1_dst
+//    x1 => value
+//    x2 => num_bytes
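+//
+// A hedged C reference sketch of ihevc_memset_av8 below (8-byte NEON
+// stores with a scalar tail); ihevc_memset_ref is a hypothetical name:
+//
+//    void ihevc_memset_ref(UWORD8 *pu1_dst, UWORD8 value, WORD32 num_bytes)
+//    {
+//        while(num_bytes >= 8)          /* vector path: 8 bytes/iter */
+//        {
+//            memset(pu1_dst, value, 8);
+//            pu1_dst += 8; num_bytes -= 8;
+//        }
+//        while(num_bytes-- > 0)         /* scalar byte-wise tail */
+//            *pu1_dst++ = value;
+//    }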
+
+
+
+    .global ihevc_memset_av8
+.type ihevc_memset_av8, %function
+
+ihevc_memset_av8:
+    SUBS        x2,x2,#8
+    BLT         ARM_MEMSET
+    dup         v0.8b,w1
+LOOP_NEON_MEMSET:
+    // Memset 8 bytes
+    ST1         {v0.8b},[x0],#8
+
+    SUBS        x2,x2,#8
+    BGE         LOOP_NEON_MEMSET
+    CMP         x2,#-8
+    BEQ         MEMSET_RETURN
+
+ARM_MEMSET:
+    ADD         x2,x2,#8
+
+LOOP_ARM_MEMSET:
+    STRB        w1,[x0],#1
+    SUBS        x2,x2,#1
+    BNE         LOOP_ARM_MEMSET
+
+MEMSET_RETURN:
+    ret
+
+
+
+
+//void ihevc_memset_16bit_mul_8(UWORD16 *pu2_dst,
+//                                      UWORD16 value,
+//                                      UWORD8 num_words)
+//**************Variables Vs Registers*************************
+//    x0 => *pu2_dst
+//    x1 => value
+//    x2 => num_words
+
+.text
+.p2align 2
+
+
+
+    .global ihevc_memset_16bit_mul_8_av8
+.type ihevc_memset_16bit_mul_8_av8, %function
+
+ihevc_memset_16bit_mul_8_av8:
+
+// Assumptions: num_words is either 8, 16 or 32
+
+    // Memset 8 words
+    dup         v0.8h,w1
+LOOP_MEMSET_16BIT_MUL_8:
+    ST1         {v0.8h},[x0],#16
+
+    SUBS        x2,x2,#8
+    BNE         LOOP_MEMSET_16BIT_MUL_8
+
+    ret
+
+
+
+
+//void ihevc_memset_16bit(UWORD16 *pu2_dst,
+//                       UWORD16 value,
+//                       UWORD8 num_words)
+//**************Variables Vs Registers*************************
+//    x0 => *pu2_dst
+//    x1 => value
+//    x2 => num_words
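+//
+// Behaviour mirrors ihevc_memset_av8 above, but on num_words 16-bit
+// elements: the NEON path stores 8 halfwords per iteration and the
+// scalar tail uses STRH.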
+
+
+
+    .global ihevc_memset_16bit_av8
+.type ihevc_memset_16bit_av8, %function
+
+ihevc_memset_16bit_av8:
+    SUBS        x2,x2,#8
+    BLT         ARM_MEMSET_16BIT
+    dup         v0.8h,w1
+LOOP_NEON_MEMSET_16BIT:
+    // Memset 8 words
+    ST1         {v0.8h},[x0],#16
+
+    SUBS        x2,x2,#8
+    BGE         LOOP_NEON_MEMSET_16BIT
+    CMP         x2,#-8
+    BEQ         MEMSET_16BIT_RETURN
+
+ARM_MEMSET_16BIT:
+    ADD         x2,x2,#8
+
+LOOP_ARM_MEMSET_16BIT:
+    STRH        w1,[x0],#2
+    SUBS        x2,x2,#1
+    BNE         LOOP_ARM_MEMSET_16BIT
+
+MEMSET_16BIT_RETURN:
+    ret
+
+
+
+
+    .section .note.GNU-stack,"",%progbits
+
diff --git a/common/arm64/ihevc_neon_macros.s b/common/arm64/ihevc_neon_macros.s
new file mode 100644
index 0000000..09a1de9
--- /dev/null
+++ b/common/arm64/ihevc_neon_macros.s
@@ -0,0 +1,50 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_neon_macros.s
+//*
+//* @brief
+//*  Contains assembly macros
+//*
+//* @author
+//*  Naveen SR
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+
+
+.macro push_v_regs
+    stp         d8,d9,[sp,#-16]!
+    stp         d10,d11,[sp,#-16]!
+    stp         d12,d13,[sp,#-16]!
+    stp         d14,d15,[sp,#-16]!
+.endm
+.macro pop_v_regs
+    ldp         d14,d15,[sp],#16
+    ldp         d12,d13,[sp],#16
+    ldp         d10,d11,[sp],#16
+    ldp         d8,d9,[sp],#16
+.endm
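+
+// Note: under the AArch64 procedure call standard only the low 64 bits
+// of v8-v15 are callee-saved, which is why these macros spill and
+// restore exactly the d8-d15 halves.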
+
diff --git a/common/arm64/ihevc_padding.s b/common/arm64/ihevc_padding.s
new file mode 100644
index 0000000..5a33d0a
--- /dev/null
+++ b/common/arm64/ihevc_padding.s
@@ -0,0 +1,523 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * //file
+// *  ihevc_padding_neon.s
+// *
+// * //brief
+// *  contains function definitions padding
+// *
+// * //author
+// *     naveen sr
+// *
+// * //par list of functions:
+// *  - ihevc_pad_left_luma()
+// *  - ihevc_pad_left_chroma()
+// *
+// * //remarks
+// *  none
+// *
+// *******************************************************************************
+//*/
+
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*   padding (luma block) at the left of a 2d array
+//*
+//* //par description:
+//*   the left column of a 2d array is replicated for pad_size times at the left
+//*
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //param[in] pad_size
+//*  integer padding size of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//.if pad_left_luma == c
+//void ihevc_pad_left_luma(uword8 *pu1_src,
+//                        word32 src_strd,
+//                        word32 ht,
+//                        word32 pad_size)
+//**************variables vs registers*************************
+//    x0 => *pu1_src
+//    x1 => src_strd
+//    x2 => ht
+//    x3 => pad_size
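+//
+// Hedged C sketch of the left padding performed below (mirrors the
+// commented reference given for pad_right_luma later in this file):
+//
+//    for(row = 0; row < ht; row++)
+//    {
+//        memset(pu1_src - pad_size, pu1_src[0], pad_size);
+//        pu1_src += src_strd;
+//    }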
+
+.text
+.align 4
+
+.globl ihevc_pad_left_luma_av8
+
+.type ihevc_pad_left_luma_av8, %function
+
+ihevc_pad_left_luma_av8:
+
+loop_start_luma_left:
+    // pad size is assumed to be pad_left = 80
+    sub         x4,x0,x3
+
+    ldrb        w8,[x0]
+    add         x0,x0,x1
+    ldrb        w9,[x0]
+    add         x0,x0,x1
+    ldrb        w10,[x0]
+    add         x0,x0,x1
+    ldrb        w11,[x0]
+    add         x0,x0,x1
+
+    dup         v0.16b,w8
+    dup         v2.16b,w9
+    dup         v4.16b,w10
+    dup         v6.16b,w11
+
+    add         x5,x4,x1
+
+    st1         {v0.16b},[x4],#16           //128/8 = 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4]               // 16 bytes store
+
+    add         x6,x5,x1
+
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5]               //128/8 = 16 bytes store
+
+    add         x7,x6,x1
+
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6]               //128/8 = 16 bytes store
+
+    subs        x2, x2,#4
+
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7]               //128/8 = 16 bytes store
+
+    // total of 4 rows * (16*5) = 4 * 80 = 4 * pad_left stores
+
+    bne         loop_start_luma_left
+
+    ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*   padding (chroma block) at the left of a 2d array
+//*
+//* //par description:
+//*   the left column of a 2d array is replicated for pad_size times at the left
+//*
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array (each colour component)
+//*
+//* //param[in] pad_size
+//*  integer padding size of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//.if pad_left_chroma == c
+//void ihevc_pad_left_chroma(uword8 *pu1_src,
+//                            word32 src_strd,
+//                            word32 ht,
+//                            word32 pad_size)
+//{
+//    x0 => *pu1_src
+//    x1 => src_strd
+//    x2 => ht
+//    x3 => pad_size
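+//
+// Hedged C sketch of the chroma left padding below; chroma is
+// interleaved, so the first U,V pair is replicated as a 16-bit unit:
+//
+//    for(row = 0; row < ht; row++)
+//    {
+//        UWORD16 u2_uv = *(UWORD16 *)pu1_src;
+//        for(col = -pad_size; col < 0; col += 2)
+//            *(UWORD16 *)(pu1_src + col) = u2_uv;
+//        pu1_src += src_strd;
+//    }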
+
+
+
+.globl ihevc_pad_left_chroma_av8
+
+.type ihevc_pad_left_chroma_av8, %function
+
+ihevc_pad_left_chroma_av8:
+
+
+loop_start_chroma_left:
+    // pad size is assumed to be pad_left = 80
+    sub         x4,x0,x3
+
+    ldrh        w8,[x0]
+    add         x0,x0,x1
+    ldrh        w9,[x0]
+    add         x0,x0,x1
+    ldrh        w10,[x0]
+    add         x0,x0,x1
+    ldrh        w11,[x0]
+    add         x0,x0,x1
+
+    dup         v0.8h,w8
+    dup         v2.8h,w9
+    dup         v4.8h,w10
+    dup         v6.8h,w11
+
+    add         x5,x4,x1
+
+    st1         {v0.16b},[x4],#16           //128/8 = 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4]               // 16 bytes store
+
+    add         x6,x5,x1
+
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5]               //128/8 = 16 bytes store
+
+    add         x7,x6,x1
+
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6]               //128/8 = 16 bytes store
+
+    subs        x2, x2,#4
+
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7]               //128/8 = 16 bytes store
+
+    // total of 4 rows * (16*5) = 4 * 80 = 4 * pad_left stores
+
+    bne         loop_start_chroma_left
+
+    ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* padding (luma block) at the right of a 2d array
+//*
+//* //par description:
+//* the right column of a 2d array is replicated for pad_size times at the right
+//*
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array
+//*
+//* //param[in] pad_size
+//*  integer padding size of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//.if pad_right_luma == c
+//void ihevc_pad_right_luma(uword8 *pu1_src,
+//                        word32 src_strd,
+//                        word32 ht,
+//                        word32 pad_size)
+//{
+//    word32 row;
+//
+//    for(row = 0; row < ht; row++)
+//    {
+//        memset(pu1_src, *(pu1_src - 1), pad_size);
+//
+//        pu1_src += src_strd;
+//    }
+//}
+//
+//    x0 => *pu1_src
+//    x1 => src_strd
+//    x2 => ht
+//    x3 => pad_size
+
+
+
+.globl ihevc_pad_right_luma_av8
+
+.type ihevc_pad_right_luma_av8, %function
+
+ihevc_pad_right_luma_av8:
+
+
+loop_start_luma_right:
+    // pad size is assumed to be pad_right = 80
+    mov         x4,x0
+
+    ldrb        w8,[x0, #-1]
+    add         x0,x0,x1
+    ldrb        w9,[x0, #-1]
+    add         x0,x0,x1
+    ldrb        w10,[x0, #-1]
+    add         x0,x0,x1
+    ldrb        w11,[x0, #-1]
+    add         x0,x0,x1
+
+    add         x5,x4,x1
+    add         x6,x5,x1
+    add         x7,x6,x1
+
+    dup         v0.16b,w8
+    dup         v2.16b,w9
+    dup         v4.16b,w10
+    dup         v6.16b,w11
+
+    st1         {v0.16b},[x4],#16           //128/8 = 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4]               // 16 bytes store
+
+
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5]               //128/8 = 16 bytes store
+
+    subs        x2, x2,#4
+
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6]               //128/8 = 16 bytes store
+
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7]               //128/8 = 16 bytes store
+
+
+    // total of 4 rows * (16*5) = 4 * 80 = 4 * pad_right stores
+
+
+    bne         loop_start_luma_right
+
+    ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* padding (chroma block) at the right of a 2d array
+//*
+//* //par description:
+//* the right column of a 2d array is replicated for pad_size times at the right
+//*
+//*
+//* //param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* //param[in] src_strd
+//*  integer source stride
+//*
+//* //param[in] ht
+//*  integer height of the array
+//*
+//* //param[in] wd
+//*  integer width of the array (each colour component)
+//*
+//* //param[in] pad_size
+//*  integer padding size of the array
+//*
+//* //returns
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//.if pad_right_chroma == c
+//void ihevc_pad_right_chroma(uword8 *pu1_src,
+//                        word32 src_strd,
+//                        word32 ht,
+//                        word32 pad_size)
+//    x0 => *pu1_src
+//    x1 => src_strd
+//    x2 => ht
+//    x3 => pad_size
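+//
+// Hedged C sketch of the chroma right padding below (the last
+// interleaved U,V pair is replicated as a 16-bit unit):
+//
+//    for(row = 0; row < ht; row++)
+//    {
+//        UWORD16 u2_uv = *(UWORD16 *)(pu1_src - 2);
+//        for(col = 0; col < pad_size; col += 2)
+//            *(UWORD16 *)(pu1_src + col) = u2_uv;
+//        pu1_src += src_strd;
+//    }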
+
+
+
+.globl ihevc_pad_right_chroma_av8
+
+.type ihevc_pad_right_chroma_av8, %function
+
+ihevc_pad_right_chroma_av8:
+
+
+loop_start_chroma_right:
+    // pad size is assumed to be pad_right = 80
+    mov         x4,x0
+
+    ldrh        w8,[x0, #-2]
+    add         x0,x0,x1
+    ldrh        w9,[x0, #-2]
+    add         x0,x0,x1
+    ldrh        w10,[x0, #-2]
+    add         x0,x0,x1
+    ldrh        w11,[x0, #-2]
+    add         x0,x0,x1
+
+    dup         v0.8h,w8
+    dup         v2.8h,w9
+    dup         v4.8h,w10
+    dup         v6.8h,w11
+
+    add         x5,x4,x1
+
+    st1         {v0.16b},[x4],#16           //128/8 = 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4],#16           // 16 bytes store
+    st1         {v0.16b},[x4]               // 16 bytes store
+
+    add         x6,x5,x1
+
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
+    st1         {v2.16b},[x5]               //128/8 = 16 bytes store
+
+    add         x7,x6,x1
+
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
+    st1         {v4.16b},[x6]               //128/8 = 16 bytes store
+
+    subs        x2, x2,#4
+
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
+    st1         {v6.16b},[x7]               //128/8 = 16 bytes store
+
+    // total of 4 rows * (16*5) = 4 * 80 = 4 * pad_right stores
+
+    bne         loop_start_chroma_right
+
+    ret
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_sao_band_offset_chroma.s b/common/arm64/ihevc_sao_band_offset_chroma.s
new file mode 100644
index 0000000..f67a3de
--- /dev/null
+++ b/common/arm64/ihevc_sao_band_offset_chroma.s
@@ -0,0 +1,430 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_band_offset_chroma.s
+//*
+//* @brief
+//*  Contains function definitions for SAO band offset (chroma).
+//* Functions are coded using NEON and can be compiled using ARM
+//* RVCT
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
+//                           WORD32 src_strd,
+//                           UWORD8 *pu1_src_left,
+//                           UWORD8 *pu1_src_top,
+//                           UWORD8 *pu1_src_top_left,
+//                           WORD32 sao_band_pos_u,
+//                           WORD32 sao_band_pos_v,
+//                           WORD8 *pi1_sao_offset_u,
+//                           WORD8 *pi1_sao_offset_v,
+//                           WORD32 wd,
+//                           WORD32 ht)
+//
+//**************Variables Vs Registers*****************************************
+//x0 =>    *pu1_src
+//x1 =>    src_strd
+//x2 =>    *pu1_src_left
+//x3 =>    *pu1_src_top
+//x4    =>    *pu1_src_top_left 40
+//x5    =>    sao_band_pos_u 44
+//x6    =>    sao_band_pos_v 48
+//x7    =>    *pi1_sao_offset_u 52
+//x8    =>    *pi1_sao_offset_v 56
+//x9    =>    wd 60
+//x10=>    ht 64
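+//
+// Hedged scalar sketch of band-offset SAO for one 8-bit sample (the
+// NEON code below instead builds a 16-entry band table and applies it
+// with table lookups); CLIP3 is the usual clamp:
+//
+//    band = pu1_src[i] >> 3;                      /* 32 bands of 8 */
+//    if(band >= sao_band_pos && band < sao_band_pos + 4)
+//        pu1_src[i] = CLIP3(0, 255,
+//                pu1_src[i] + pi1_sao_offset[band - sao_band_pos + 1]);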
+
+.text
+.p2align 2
+.include "ihevc_neon_macros.s"
+
+.globl gu1_table_band_idx
+.globl ihevc_sao_band_offset_chroma_av8
+
+ihevc_sao_band_offset_chroma_av8:
+    mov         x8,#0
+    mov         x9,#0
+    mov         x10,#0
+
+    ldr         x8,[sp,#0]
+    ldr         w9,[sp,#8]
+    ldr         w10,[sp,#16]
+
+    push_v_regs
+    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+    stp         x23, x24,[sp,#-16]!
+
+    mov         x15,x4 // pu1_src_top_left 40
+    mov         x16,x5 // sao_band_pos_u 44
+    mov         x17,x6 // sao_band_pos_v 48
+    mov         x19,x7 // pi1_sao_offset_u 52
+    mov         x20,x8 // pi1_sao_offset_v 56
+    mov         x21,x9 // wd 60
+    mov         x22,x10 // ht 64
+
+    MOV         x4, x15                     //Loads pu1_src_top_left
+    MOV         x10, x22                    //Loads ht
+
+    MOV         x9, x21                     //Loads wd
+    MOV         x11,x10                     //Move the ht to x9 for loop counter
+
+    ADD         x12,x0,x9                   //pu1_src[row * src_strd + (wd)]
+    ADRP        x14, :got:gu1_table_band_idx
+    LDR         x14, [x14, #:got_lo12:gu1_table_band_idx]
+
+    SUB         x12,x12,#2                  //wd-2
+
+SRC_LEFT_LOOP:
+    LDRH        w5,[x12]                    //Load the value
+    ADD         x12,x12,x1
+    SUBS        x11,x11,#1                  //Decrement the loop counter
+    STRH        w5,[x2],#2                  //Store the value in pu1_src_left pointer
+    BNE         SRC_LEFT_LOOP
+
+    MOV         x5, x16                     //Loads sao_band_pos_u
+    LD1         {v1.8b},[x14],#8            //band_table_u.val[0]
+    ADD         x12,x3,x9                   //pu1_src_top[wd]
+
+    sub         x23,x12,#2
+    LDRH        w11,[x23]
+    LD1         {v2.8b},[x14],#8            //band_table_u.val[1]
+    LSL         x6,x5,#3                    //sao_band_pos_u
+
+    STRH        w11,[x4]                    //store to pu1_src_top_left[0]
+    LD1         {v3.8b},[x14],#8            //band_table_u.val[2]
+    MOV         x7, x19                     //Loads pi1_sao_offset_u
+
+    SUB         x4,x10,#1                   //ht-1
+    dup         v31.8b,w6                   //band_pos_u
+    mul         x4, x4, x1                  //ht-1 * src_strd
+
+    ADD         x4,x4,x0                    //pu1_src[(ht - 1) * src_strd]
+    LD1         {v4.8b},[x14],#8            //band_table_u.val[3]
+    MOV         x11,x9                      //Move the wd to x11 for loop counter
+
+SRC_TOP_LOOP:                               //wd is always a multiple of 8
+    LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
+    SUBS        x11,x11,#8                  //Decrement the loop counter by 8
+    ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
+    BNE         SRC_TOP_LOOP
+
+    LD1         {v30.8b},[x7]               //pi1_sao_offset_u load
+    ADD         v5.8b,  v1.8b ,  v31.8b     //band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u)
+
+    dup         v29.8b, v30.8b[1]           //vdup_n_u8(pi1_sao_offset_u[1])
+    ADD         v6.8b,  v2.8b ,  v31.8b     //band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u)
+
+    dup         v28.8b, v30.8b[2]           //vdup_n_u8(pi1_sao_offset_u[2])
+    ADD         v7.8b,  v3.8b ,  v31.8b     //band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u)
+
+    dup         v27.8b, v30.8b[3]           //vdup_n_u8(pi1_sao_offset_u[3])
+    ADD         v8.8b,  v4.8b ,  v31.8b     //band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u)
+
+    CMP         x5,#28
+    dup         v26.8b, v30.8b[4]           //vdup_n_u8(pi1_sao_offset_u[4])
+    ADRP        x14, :got:gu1_table_band_idx
+    LDR         x14, [x14, #:got_lo12:gu1_table_band_idx]
+
+    movi        v30.8b, #16                 //vdup_n_u8(16)
+    ADD         v1.8b,  v5.8b ,  v29.8b     //band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1]))
+
+    LD1         {v9.8b},[x14],#8            //band_table_v.val[0]
+    ADD         v2.8b,  v6.8b ,  v28.8b     //band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2]))
+
+    LD1         {v10.8b},[x14],#8           //band_table_v.val[1]
+    ADD         v3.8b,  v7.8b ,  v27.8b     //band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))
+
+    MOV         x6, x17                     //Loads sao_band_pos_v
+    ADD         v4.8b,  v8.8b ,  v26.8b     //band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
+    LSL         x11,x6,#3                   //sao_band_pos_v
+
+    BLT         SAO_BAND_POS_U_0
+
+SAO_BAND_POS_U_28:                          //case 28
+    cmhs        v13.8b,  v30.8b ,  v4.8b    //vcle_u8(band_table.val[3], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_U_29
+
+    ORR         v4.8b,  v4.8b ,  v13.8b     //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK_U
+
+SAO_BAND_POS_U_29:                          //case 29
+    CMP         x5,#29
+
+    cmhs        v14.8b,  v30.8b ,  v3.8b    //vcle_u8(band_table.val[2], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_U_30
+    ORR         v3.8b,  v3.8b ,  v14.8b     //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+    AND         v4.8b,  v4.8b ,  v13.8b     //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK_U
+
+SAO_BAND_POS_U_30:                          //case 30
+    CMP         x5,#30
+
+    cmhs        v15.8b,  v30.8b ,  v2.8b    //vcle_u8(band_table.val[1], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_U_31
+    ORR         v2.8b,  v2.8b ,  v15.8b     //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+    AND         v3.8b,  v3.8b ,  v14.8b     //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+
+SAO_BAND_POS_U_31:                          //case 31
+    CMP         x5,#31
+    BNE         SWITCH_BREAK_U
+
+    cmhs        v16.8b,  v30.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
+    ORR         v1.8b,  v1.8b ,  v16.8b     //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+    AND         v2.8b,  v2.8b ,  v15.8b     //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+    B           SWITCH_BREAK_U
+
+SAO_BAND_POS_U_0:
+    CMP         x5,#0                       //case 0
+    BNE         SWITCH_BREAK_U
+
+    cmhs        v16.8b,  v30.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
+    AND         v1.8b,  v1.8b ,  v16.8b     //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK_U:
+    dup         v30.8b,w11                  //band_pos_v
+    MOV         x8, x20                     //Loads pi1_sao_offset_v
+
+    LD1         {v11.8b},[x14],#8           //band_table_v.val[2]
+    ADD         v13.8b,  v9.8b ,  v30.8b    //band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)
+
+    LD1         {v12.8b},[x14],#8           //band_table_v.val[3]
+    ADD         v14.8b,  v10.8b ,  v30.8b   //band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v)
+
+    LD1         {v25.8b},[x8]               //pi1_sao_offset_v load
+    ADD         v15.8b,  v11.8b ,  v30.8b   //band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)
+
+    dup         v29.8b, v25.8b[1]           //vdup_n_u8(pi1_sao_offset_v[1])
+    ADD         v16.8b,  v12.8b ,  v30.8b   //band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)
+
+    dup         v28.8b, v25.8b[2]           //vdup_n_u8(pi1_sao_offset_v[2])
+    ADD         v9.8b,  v13.8b ,  v29.8b    //band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))
+
+    dup         v27.8b, v25.8b[3]           //vdup_n_u8(pi1_sao_offset_v[3])
+    ADD         v10.8b,  v14.8b ,  v28.8b   //band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))
+
+    dup         v26.8b, v25.8b[4]           //vdup_n_u8(pi1_sao_offset_v[4])
+    ADD         v11.8b,  v15.8b ,  v27.8b   //band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))
+
+    movi        v29.8b, #16                 //vdup_n_u8(16)
+    ADD         v12.8b,  v16.8b ,  v26.8b   //band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4]))
+    AND         x12,x9,#0xf
+
+    CMP         x6,#28
+    BLT         SAO_BAND_POS_V_0
+
+SAO_BAND_POS_V_28:                          //case 28
+    cmhs        v17.8b,  v29.8b ,  v12.8b   //vcle_u8(band_table.val[3], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_V_29
+    ORR         v12.8b,  v12.8b ,  v17.8b   //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK_V
+
+SAO_BAND_POS_V_29:                          //case 29
+    CMP         x6,#29
+
+    cmhs        v18.8b,  v29.8b ,  v11.8b   //vcle_u8(band_table.val[2], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_V_30
+    ORR         v11.8b,  v11.8b ,  v18.8b   //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+    AND         v12.8b,  v12.8b ,  v17.8b   //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK_V
+
+SAO_BAND_POS_V_30:                          //case 30
+    CMP         x6,#30
+
+    cmhs        v19.8b,  v29.8b ,  v10.8b   //vcle_u8(band_table.val[1], vdup_n_u8(16))
+    BNE         SAO_BAND_POS_V_31
+    ORR         v10.8b,  v10.8b ,  v19.8b   //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+    AND         v11.8b,  v11.8b ,  v18.8b   //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+    B           SWITCH_BREAK_V
+
+SAO_BAND_POS_V_31:                          //case 31
+    CMP         x6,#31
+    BNE         SWITCH_BREAK_V
+
+    cmhs        v20.8b,  v29.8b ,  v9.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
+    ORR         v9.8b,  v9.8b ,  v20.8b     //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+    AND         v10.8b,  v10.8b ,  v19.8b   //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+    B           SWITCH_BREAK_V
+
+SAO_BAND_POS_V_0:
+    CMP         x6,#0                       //case 0
+    BNE         SWITCH_BREAK_V
+
+    cmhs        v20.8b,  v29.8b ,  v9.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
+    AND         v9.8b,  v9.8b ,  v20.8b     //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK_V:
+    CMP         x9,#16
+    MOV         x4,x0                       //pu1_src_cpy
+    mov         v1.d[1],v2.d[0]
+    mov         v2.d[0],v3.d[0]
+    mov         v2.d[1],v4.d[0]
+    mov         v9.d[1],v10.d[0]
+    mov         v10.d[0],v11.d[0]
+    mov         v10.d[1],v12.d[0]
+    BLT         WIDTH_RESIDUE
+
+WIDTH_LOOP:                                 //Width is assumed to be a multiple of 16
+    MOV         x4,x0                       //pu1_src_cpy
+    MOV         x11,x10                     //move ht
+    ADD         x5,x4,x1
+
+HEIGHT_LOOP:                                //unrolled for 4 rows
+
+    ADD         x6,x5,x1
+    LD2         {v5.8b, v6.8b},[x4]         //vld1q_u8(pu1_src_cpy)
+    ADD         x7,x6,x1
+
+    LD2         {v13.8b, v14.8b},[x5]       //vld1q_u8(pu1_src_cpy)
+    SUB         v7.8b,  v5.8b ,  v31.8b     //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    LD2         {v17.8b, v18.8b},[x6]       //vld1q_u8(pu1_src_cpy)
+    SUB         v8.8b,  v6.8b ,  v30.8b     //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    LD2         {v21.8b, v22.8b},[x7]       //vld1q_u8(pu1_src_cpy)
+    SUB         v15.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    TBX         v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    SUB         v16.8b,  v14.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    TBX         v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    SUB         v19.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    TBX         v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    SUB         v20.8b,  v18.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    TBX         v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    SUB         v23.8b,  v21.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    ST2         {v5.8b, v6.8b},[x4]         //vst1q_u8(pu1_src_cpy, au1_cur_row)
+    SUB         v24.8b,  v22.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    SUBS        x11,x11,#4                  //Decrement the ht loop count by 4
+    TBX         v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+
+    ST2         {v13.8b, v14.8b},[x5]       //vst1q_u8(pu1_src_cpy, au1_cur_row)
+
+    TBX         v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    TBX         v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    TBX         v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+
+    ST2         {v17.8b, v18.8b},[x6],x1    //vst1q_u8(pu1_src_cpy, au1_cur_row)
+
+    ADD         x4,x6,x1
+    ST2         {v21.8b, v22.8b},[x7]       //vst1q_u8(pu1_src_cpy, au1_cur_row)
+    ADD         x5,x4,x1
+
+    BNE         HEIGHT_LOOP
+
+    SUB         x9,x9,#16                   //Decrement the width loop by 16
+    ADD         x0,x0,#16
+    CMP         x9,#8
+    BGT         WIDTH_LOOP
+    BLT         END_LOOP
+    MOV         x4,x0                       //pu1_src_cpy
+
+WIDTH_RESIDUE:                              //If width is not a multiple of 16
+
+    ADD         x5,x4,x1
+    LD2         {v5.8b, v6.8b},[x4]         //vld1q_u8(pu1_src_cpy)
+    ADD         x6,x5,x1
+
+    ADD         x7,x6,x1
+    LD2         {v13.8b, v14.8b},[x5]       //vld1q_u8(pu1_src_cpy)
+    SUB         v7.8b,  v5.8b ,  v31.8b     //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    LD2         {v17.8b, v18.8b},[x6]       //vld1q_u8(pu1_src_cpy)
+    SUB         v8.8b,  v6.8b ,  v30.8b     //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    TBX         v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    SUB         v15.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    TBX         v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    SUB         v16.8b,  v14.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    LD2         {v21.8b, v22.8b},[x7]       //vld1q_u8(pu1_src_cpy)
+    SUB         v19.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    TBX         v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    SUB         v20.8b,  v18.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    TBX         v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    ZIP1        v28.8b, v5.8b, v6.8b
+    ZIP2        v6.8b, v5.8b, v6.8b
+    mov         v5.8b, v28.8b
+
+    TBX         v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    SUB         v23.8b,  v21.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+    ST1         {v5.8b},[x4]                //vst1q_u8(pu1_src_cpy, au1_cur_row)
+    ZIP1        v28.8b, v13.8b, v14.8b
+    ZIP2        v14.8b, v13.8b, v14.8b
+    mov         v13.8b, v28.8b
+
+    TBX         v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    SUB         v24.8b,  v22.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+    ST1         {v13.8b},[x5]               //vst1q_u8(pu1_src_cpy, au1_cur_row)
+    SUBS        x10,x10,#4                  //Decrement the ht loop count by 4
+
+    TBX         v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+    ZIP1        v28.8b, v17.8b, v18.8b
+    ZIP2        v18.8b, v17.8b, v18.8b
+    mov         v17.8b, v28.8b
+
+    TBX         v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+    ST1         {v17.8b},[x6],x1            //vst1q_u8(pu1_src_cpy, au1_cur_row)
+    ZIP1        v28.8b, v21.8b, v22.8b
+    ZIP2        v22.8b, v21.8b, v22.8b
+    mov         v21.8b, v28.8b
+
+    ADD         x4,x6,x1
+    ST1         {v21.8b},[x7]               //vst1q_u8(pu1_src_cpy, au1_cur_row)
+    ADD         x5,x4,x1
+
+    BNE         WIDTH_RESIDUE
+
+END_LOOP:
+    // LDMFD sp!,{x4-x12,x15}            //Reload the registers from SP
+    ldp         x23, x24,[sp],#16
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/arm64/ihevc_sao_band_offset_luma.s b/common/arm64/ihevc_sao_band_offset_luma.s
new file mode 100644
index 0000000..099d581
--- /dev/null
+++ b/common/arm64/ihevc_sao_band_offset_luma.s
@@ -0,0 +1,245 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_band_offset_luma.s
+//*
+//* @brief
+//*  Contains function definitions for sample adaptive offset (SAO) band
+//* offset for luma. Functions are coded in NEON assembly and can be
+//* compiled using ARM RVCT.
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
+//                           WORD32 src_strd,
+//                           UWORD8 *pu1_src_left,
+//                           UWORD8 *pu1_src_top,
+//                           UWORD8 *pu1_src_top_left,
+//                           WORD32 sao_band_pos,
+//                           WORD8 *pi1_sao_offset,
+//                           WORD32 wd,
+//                           WORD32 ht)
+//
+//**************Variables Vs Registers*****************************************
+//x0  =>    *pu1_src
+//x1  =>    src_strd
+//x2  =>    *pu1_src_left
+//x3  =>    *pu1_src_top
+//x4  =>    *pu1_src_top_left
+//x5  =>    sao_band_pos
+//x6  =>    *pi1_sao_offset
+//x7  =>    wd
+//x8  =>    ht
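+//
+//**************Reference C sketch (illustrative)*******************************
+//A hedged sketch of the table-driven scheme implemented below, assuming
+//gu1_table_band_idx holds the identity values 0..31; it is a reconstruction
+//from the intrinsic comments, not the reference C code. The 32-entry table
+//maps a sample inside the four active bands to its offset-adjusted value, and
+//TBX leaves samples outside the window untouched because their table indices
+//are out of range. The switch on sao_band_pos below (cases 28..31) patches
+//entries where the window wraps around:
+//
+//    UWORD8 band_table[32];
+//    WORD32 i, j;
+//    for (i = 0; i < 32; i++)
+//        band_table[i] = gu1_table_band_idx[i] + (sao_band_pos << 3);
+//    for (i = 0; i < 4; i++)
+//        for (j = 0; j < 8; j++)
+//            band_table[i * 8 + j] += pi1_sao_offset[i + 1];
+//
+//    /* per sample: idx = pu1_src[x] - (sao_band_pos << 3);     */
+//    /* if idx is in [0, 31]: pu1_src[x] = band_table[idx];     */
+//    /* otherwise the sample is left unchanged (TBX semantics). */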
+
+
+.set WIDE_REFERENCE, 0
+.set ARCHITECTURE, 5
+.set DO1STROUNDING, 0
+
+.include "ihevc_neon_macros.s"
+
+.text
+.p2align 2
+
+.globl gu1_table_band_idx
+.globl ihevc_sao_band_offset_luma_av8
+
+ihevc_sao_band_offset_luma_av8:
+
+    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
+
+    LDR         w8,[sp]                     //Loads ht
+
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    MOV         x9,x8                       //Move the ht to x9 for loop counter
+    ADD         x10,x0,x7                   //pu1_src[row * src_strd + (wd)]
+
+    SUB         x10,x10,#1                  //wd-1
+    ADRP        x14, :got:gu1_table_band_idx
+    LDR         x14, [x14, #:got_lo12:gu1_table_band_idx]
+
+SRC_LEFT_LOOP:
+    LDRB        w11,[x10]                   //Load the value
+    add         x10, x10, x1                //Increment the pointer by src_strd
+    SUBS        x9,x9,#1                    //Decrement the loop counter
+    STRB        w11,[x2],#1                 //Store the value in pu1_src_left pointer
+    BNE         SRC_LEFT_LOOP
+
+    ADD         x9,x3,x7                    //pu1_src_top[wd]
+    LD1         {v1.8b},[x14],#8            //band_table.val[0]
+
+    LSL         x11,x5,#3
+    LD1         {v2.8b},[x14],#8            //band_table.val[1]
+
+    LDRB        w10,[x9,#-1]
+    dup         v31.8b,w11                  //band_pos
+    SUB         x12,x8,#1                   //ht-1
+
+    STRB        w10,[x4]                    //store to pu1_src_top_left[0]
+    LD1         {v3.8b},[x14],#8            //band_table.val[2]
+    mul         x12, x12, x1                //ht-1 * src_strd
+
+    ADD         x4,x12,x0                   //pu1_src[(ht - 1) * src_strd]
+    LD1         {v4.8b},[x14],#8            //band_table.val[3]
+    MOV         x9,x7                       //Move the wd to x9 for loop counter
+
+SRC_TOP_LOOP:                               //wd is always a multiple of 8
+    LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
+    SUBS        x9,x9,#8                    //Decrement the loop counter by 8
+    ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
+    BNE         SRC_TOP_LOOP
+
+    LD1         {v30.8b},[x6]               //pi1_sao_offset load
+    ADD         v5.8b,  v1.8b ,  v31.8b     //band_table.val[0] = vadd_u8(band_table.val[0], band_pos)
+
+    dup         v29.8b, v30.8b[1]           //vdup_n_u8(pi1_sao_offset[1])
+    ADD         v6.8b,  v2.8b ,  v31.8b     //band_table.val[1] = vadd_u8(band_table.val[1], band_pos)
+
+    dup         v28.8b, v30.8b[2]           //vdup_n_u8(pi1_sao_offset[2])
+    ADD         v7.8b,  v3.8b ,  v31.8b     //band_table.val[2] = vadd_u8(band_table.val[2], band_pos)
+
+    dup         v27.8b, v30.8b[3]           //vdup_n_u8(pi1_sao_offset[3])
+    ADD         v8.8b,  v4.8b ,  v31.8b     //band_table.val[3] = vadd_u8(band_table.val[3], band_pos)
+
+    dup         v26.8b, v30.8b[4]           //vdup_n_u8(pi1_sao_offset[4])
+    ADD         v1.8b,  v5.8b ,  v29.8b     //band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1]))
+
+    movi        v29.8b, #16                 //vdup_n_u8(16)
+    ADD         v2.8b,  v6.8b ,  v28.8b     //band_table.val[1] = vadd_u8(band_table.val[1], vdup_n_u8(pi1_sao_offset[2]))
+
+    CMP         x5,#28
+    ADD         v3.8b,  v7.8b ,  v27.8b     //band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3]))
+
+    ADD         v4.8b,  v8.8b ,  v26.8b     //band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4]))
+    BLT         SAO_BAND_POS_0
+
+SAO_BAND_POS_28:                            //case 28
+
+    cmhs        v12.8b,  v29.8b ,  v4.8b    //vcle_u8(band_table.val[3], vdup_n_u8(16))
+
+    BNE         SAO_BAND_POS_29
+    ORR         v4.8b,  v4.8b ,  v12.8b     //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK
+
+SAO_BAND_POS_29:                            //case 29
+    CMP         x5,#29
+    cmhs        v11.8b,  v29.8b ,  v3.8b    //vcle_u8(band_table.val[2], vdup_n_u8(16))
+
+    BNE         SAO_BAND_POS_30
+    ORR         v3.8b,  v3.8b ,  v11.8b     //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+    AND         v4.8b,  v4.8b ,  v12.8b     //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+    B           SWITCH_BREAK
+
+SAO_BAND_POS_30:                            //case 30
+    CMP         x5,#30
+    cmhs        v10.8b,  v29.8b ,  v2.8b    //vcle_u8(band_table.val[1], vdup_n_u8(16))
+
+    BNE         SAO_BAND_POS_31
+    ORR         v2.8b,  v2.8b ,  v10.8b     //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+    AND         v3.8b,  v3.8b ,  v11.8b     //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+    B           SWITCH_BREAK
+
+SAO_BAND_POS_31:                            //case 31
+    CMP         x5,#31
+    BNE         SWITCH_BREAK
+
+    cmhs        v9.8b,  v29.8b ,  v1.8b     //vcle_u8(band_table.val[0], vdup_n_u8(16))
+    ORR         v1.8b,  v1.8b ,  v9.8b      //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+    AND         v2.8b,  v2.8b ,  v10.8b     //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+
+SAO_BAND_POS_0:
+    CMP         x5,#0                       //case 0
+    BNE         SWITCH_BREAK
+
+    cmhs        v9.8b,  v29.8b ,  v1.8b     //vcle_u8(band_table.val[0], vdup_n_u8(16))
+    AND         v1.8b,  v1.8b ,  v9.8b      //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK:
+
+    mov         v1.d[1],v2.d[0]
+    mov         v2.d[0],v3.d[0]
+    mov         v2.d[1],v4.d[0]
+
+SWITCH_BREAK_1:
+
+    MOV         x4,x0                       //pu1_src_cpy
+    MOV         x11,x8                      //move ht
+    ADD         x5,x4,x1
+
+HEIGHT_LOOP:
+    ADD         x6,x5,x1
+    LD1         {v13.8b},[x4]               //au1_cur_row = vld1_u8(pu1_src_cpy)
+
+    ADD         x10,x6,x1
+    LD1         {v15.8b},[x5]               //au1_cur_row = vld1_u8(pu1_src_cpy)
+
+    LD1         {v17.8b},[x6]               //au1_cur_row = vld1_u8(pu1_src_cpy)
+
+    LD1         {v19.8b},[x10]              //au1_cur_row = vld1_u8(pu1_src_cpy)
+    SUB         v14.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)
+
+    TBX         v13.8b, {v1.16b- v2.16b},v14.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+    SUB         v16.8b,  v15.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)
+
+    TBX         v15.8b, {v1.16b- v2.16b},v16.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+    SUB         v18.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)
+
+    TBX         v17.8b, {v1.16b- v2.16b},v18.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+    SUB         v20.8b,  v19.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)
+
+    TBX         v19.8b, {v1.16b- v2.16b},v20.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+    ST1         {v13.8b},[x4],x1            //vst1_u8(pu1_src_cpy, au1_cur_row)
+
+    ST1         {v15.8b},[x5]               //vst1_u8(pu1_src_cpy, au1_cur_row)
+    SUBS        x11,x11,#4                  //Decrement the ht loop count by 4
+
+    ST1         {v17.8b},[x6],x1            //vst1_u8(pu1_src_cpy, au1_cur_row)
+
+    ADD         x4,x6,x1
+    ST1         {v19.8b},[x10]              //vst1_u8(pu1_src_cpy, au1_cur_row)
+    ADD         x5,x4,x1
+
+    BNE         HEIGHT_LOOP
+
+    SUBS        x7,x7,#8                    //Decrement the width loop by 8
+    ADD         x0,x0,#8
+    BNE         SWITCH_BREAK_1
+
+    // LDMFD sp!,{x4-x12,x15}              //Reload the registers from SP
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class0.s b/common/arm64/ihevc_sao_edge_offset_class0.s
new file mode 100644
index 0000000..f7d6621
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class0.s
@@ -0,0 +1,345 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class0.s
+//*
+//* @brief
+//*  Contains function definitions for sample adaptive offset (SAO) edge
+//* offset of class 0 for luma. Functions are coded in NEON assembly and
+//* can be compiled using ARM RVCT.
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
+//                              WORD32 src_strd,
+//                              UWORD8 *pu1_src_left,
+//                              UWORD8 *pu1_src_top,
+//                              UWORD8 *pu1_src_top_left,
+//                              UWORD8 *pu1_src_top_right,
+//                              UWORD8 *pu1_src_bot_left,
+//                              UWORD8 *pu1_avail,
+//                              WORD8 *pi1_sao_offset,
+//                              WORD32 wd,
+//                              WORD32 ht)
+//
+//**************Variables Vs Registers*****************************************
+//x0  =>    *pu1_src
+//x1  =>    src_strd
+//x2  =>    *pu1_src_left
+//x3  =>    *pu1_src_top
+//x4  =>    *pu1_src_top_left
+//x7  =>    *pu1_avail
+//x8  =>    *pi1_sao_offset
+//x9  =>    wd
+//x10 =>    ht
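+//
+//**************Reference C sketch (illustrative)*******************************
+//A hedged reconstruction of the per-sample behaviour from the intrinsic
+//comments; it is not the reference C code. Class 0 is the horizontal edge
+//class: each sample is compared with its left and right neighbours, the two
+//signs select an edge index, and the mapped offset is added with clipping
+//(SIGN(v) = (v > 0) - (v < 0)):
+//
+//    WORD32 sign_left  = SIGN(pu1_src[col] - pu1_src[col - 1]);
+//    WORD32 sign_right = SIGN(pu1_src[col] - pu1_src[col + 1]);
+//    WORD32 edge_idx   = gi1_table_edge_idx[2 + sign_left + sign_right];
+//    if (edge_idx)
+//    {
+//        WORD32 tmp = pu1_src[col] + pi1_sao_offset[edge_idx];
+//        pu1_src[col] = (UWORD8)(tmp < 0 ? 0 : (tmp > 255 ? 255 : tmp));
+//    }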
+
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class0_av8
+
+ihevc_sao_edge_offset_class0_av8:
+
+
+    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
+
+    LDR         x8, [sp]                    // pi1_sao_offset
+    LDR         x9,[sp,#8]                  //Loads wd
+    AND         x9,x9,0xFFFFFFFF            //Argument is passed as WORD32, so use only the lower half of x9
+    LDR         x10,[sp,#16]                //Loads ht
+    AND         x10,x10,0xFFFFFFFF          //Argument is passed as WORD32, so use only the lower half of x10
+
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    movi        v2.16b, #2                  //const_2 = vdupq_n_s8(2)
+    ADD         x11,x3,x9                   //pu1_src_top[wd]
+    SUB         x11,x11,#1
+
+    movi        v4.8h, #0                   //const_min_clip = vdupq_n_s16(0)
+    LDRB        w12,[x11]                   //pu1_src_top[wd - 1]
+    ADD         x11,x11,#1
+
+    movi        v6.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+    movi        v8.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
+    STRB        w12,[x4]                    //*pu1_src_top_left = pu1_src_top[wd - 1]
+
+    MOV         x6,x0                       //pu1_src_org
+    LD1         {v10.8b},[x14]              //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    SUB         x4,x10,#1                   //(ht - 1)
+
+    MOV         x12,x9                      //Move wd to x12 for loop count
+    LD1         {v11.8b},[x8]               //offset_tbl = vld1_s8(pi1_sao_offset)
+    mul         x4, x4, x1                  //(ht - 1) * src_strd
+
+    ADD         x4,x4,x0                    //pu1_src[(ht - 1) * src_strd]
+
+SRC_TOP_LOOP:                               //wd is always a multiple of 8
+    LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
+    SUBS        x12,x12,#8                  //Decrement the loop counter by 8
+    ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
+    BNE         SRC_TOP_LOOP
+    ADD         x6,x6,#15                   //pu1_src_org[16 - 1]
+
+    CMP         x9,#16                      //Compare wd with 16
+    MOV         x3,x2                       //pu1_src_left backup to reload later
+    BLT         WIDTH_RESIDUE               //If wd < 16, jump to WIDTH_RESIDUE, where the loop handles the 8-pixel case
+
+    MOV         x8,x9                       //move wd to x8 for loop count
+
+WIDTH_LOOP_16:
+    CMP         x8,x9                       //if(col == wd)
+    BNE         AU1_MASK_FF                 //jump to else part
+    LDRB        w12,[x7]                    //pu1_avail[0]
+    mov         v8.8b[0], w12               //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+    B           SKIP_AU1_MASK_FF            //Skip the else part
+
+AU1_MASK_FF:
+    MOV         x12,#0xFF                   //move -1 to x12
+    mov         v8.8b[0], w12               //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF:
+    CMP         x8,#16                      //If col == 16
+    BNE         SKIP_MASKING_IF_NOT16       //If not skip masking
+    LDRB        w12,[x7,#1]                 //pu1_avail[1]
+    mov         v8.b[15], w12               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_MASKING_IF_NOT16:
+    MOV         x12,x0                      //pu1_src_cpy = pu1_src
+    MOV         x4,x10                      //move ht to x4 for loop count
+
+PU1_SRC_LOOP:
+    LDRB        w11,[x2]                    //load pu1_src_left; ht - row = 0 on the first iteration, pu1_src_left is incremented later
+    LD1         {v12.16b},[x12],x1          //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    SUB         x5,x9,x8                    //wd - col
+
+    SUB         x14,x10,x4                  //ht - row
+    mov         v14.8b[15], w11             //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+    mul         x14, x14, x1                //(ht - row) * src_strd
+
+    LD1         {v26.16b},[x12]             //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    EXT         v14.16b,  v14.16b ,  v12.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+    ADD         x5,x14,x5                   //(ht - row) * src_strd + (wd - col)
+
+    LDRB        w11,[x2, #1]                //II Iteration load pu1_src_left since ht - row + 1 =1
+    cmhi        v16.16b,  v12.16b ,  v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    LDRB        w14,[x6,x5]                 //pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
+
+    SUB         x4,x4,#1
+    mov         v28.8b[15], w11             //II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+    cmhi        v18.16b,  v14.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    SUB         x12,x12,x1                  //Decrement the pu1_src pointer by src_strd
+    SUB         v20.16b,  v18.16b ,  v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    STRB        w14,[x2],#1                 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+    LDRB        w11,[x12,#16]               //pu1_src_cpy[16]
+    EXT         v28.16b,  v28.16b ,  v26.16b,#15 //II Iteration pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+    SUB         x5,x9,x8                    //II wd - col
+
+    ADD         x12,x12,x1                  //Increment the pu1_src pointer by src_strd
+    mov         v14.8b[0], w11              //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    cmhi        v30.16b,  v26.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    LDRB        w11,[x12,#16]               //II pu1_src_cpy[16]
+    EXT         v14.16b,  v12.16b ,  v14.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+    SUB         x14,x10,x4                  //II ht - row
+
+    cmhi        v0.16b,  v28.16b ,  v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    mov         v28.8b[0], w11              //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    SUB         x12,x12,x1                  //Decrement the pu1_src pointer by src_strd
+
+    mul         x14, x14, x1                //II (ht - row) * src_strd
+    cmhi        v16.16b,  v12.16b ,  v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    ADD         x5,x14,x5                   //II (ht - row) * src_strd + (wd - col)
+
+    cmhi        v18.16b,  v14.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    EXT         v28.16b,  v26.16b ,  v28.16b,#1 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+
+    LDRB        w14,[x6,x5]                 //II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
+    SUB         v22.16b,  v18.16b ,  v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUBS        x4,x4,#1                    //Decrement row by 1
+
+    ADD         v14.16b,  v2.16b ,  v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
+    STRB        w14,[x2],#1                 //II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+    ADD         v14.16b,  v14.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
+    Uxtl        v18.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    SUB         v20.16b,  v0.16b ,  v30.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    TBL         v14.16b, {v10.16b},v14.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    cmhi        v30.16b,  v26.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    cmhi        v0.16b,  v28.16b ,  v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+//  TBL v15.8b, {v10.16b},v15.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    SUB         v22.16b,  v0.16b ,  v30.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    AND         v14.16b,  v14.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+    TBL         v16.16b, {v11.16b},v14.16b  //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    Uxtl        v0.8h, v26.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    ADD         v28.16b,  v2.16b ,  v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
+    ADD         v28.16b,  v28.16b ,  v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+    SADDW       v18.8h,  v18.8h ,  v16.8b
+    TBL         v28.16b, {v10.16b},v28.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    SMAX        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+//  TBL v29.8b, {v10.16b},v29.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    UMIN        v18.8h,  v18.8h ,  v6.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    AND         v28.16b,  v28.16b ,  v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+//  TBL v17.8b, {v11.16b},v15.8b                    //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+    Uxtl2       v14.8h, v12.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    TBL         v30.16b, {v11.16b},v28.16b  //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    SADDW2      v14.8h,  v14.8h ,  v16.16b  //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    SMAX        v14.8h,  v14.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+//  TBL v31.8b, {v11.16b},v29.8b                    //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    UMIN        v14.8h,  v14.8h ,  v6.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    xtn         v18.8b,  v18.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    SADDW       v0.8h,  v0.8h ,  v30.8b
+
+    xtn         v19.8b,  v14.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])
+    SMAX        v0.8h,  v0.8h ,  v4.8h      //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    Uxtl2       v28.8h, v26.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    UMIN        v0.8h,  v0.8h ,  v6.8h      //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    xtn         v0.8b,  v0.8h               //II vmovn_s16(pi2_tmp_cur_row.val[0])
+    SADDW2      v28.8h,  v28.8h ,  v30.16b  //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    SMAX        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    ST1         {v18.8b, v19.8b},[x12],x1   //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    UMIN        v28.8h,  v28.8h ,  v6.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    xtn         v1.8b,  v28.8h              //II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    ST1         {v0.8b, v1.8b},[x12],x1     //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BNE         PU1_SRC_LOOP                //If not equal jump to the inner loop
+
+    ADD         x0,x0,#16                   //pu1_src += 16
+
+    SUBS        x8,x8,#16                   //Decrement column by 16
+    CMP         x8,#8                       //Check whether residue remains
+    MOV         x2,x3                       //Reload pu1_src_left
+    BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
+    BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
+    BLT         END_LOOPS                   //Jump to end function
+
+WIDTH_RESIDUE:
+    SUB         x6,x6,#15
+    AND         x8,x9,#0xF                  //wd_rem = wd & 0xF
+    CMP         x8,#0                       //Residue check
+    BEQ         END_LOOPS                   //No Residue jump to end function
+
+    CMP         x8,x9                       //if(wd_rem == wd)
+    BNE         AU1_MASK_FF_RESIDUE         //jump to else part
+    LDRB        w12,[x7]                    //pu1_avail[0]
+    mov         v8.8b[0], w12               //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+    B           SKIP_AU1_MASK_FF_RESIDUE    //Skip the else part
+
+AU1_MASK_FF_RESIDUE:
+    MOV         x12,#0xFF                   //move -1 to x12
+    mov         v8.8b[0], w12               //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF_RESIDUE:
+    LDRB        w11,[x7,#1]                 //pu1_avail[1]
+    SUB         x5,x9,#1                    //wd - 1
+
+    MOV         x4,x10                      //move ht to x4 for loop count
+    mov         v8.8b[7], w11               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+    MOV         x12,x0                      //pu1_src_cpy = pu1_src
+
+PU1_SRC_LOOP_RESIDUE:
+    LD1         {v12.16b},[x12]             //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    LDRB        w11,[x2]                    //load pu1_src_left
+    mov         v14.8b[15], w11             //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+    EXT         v14.16b,  v14.16b ,  v12.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+
+    cmhi        v16.16b,  v12.16b ,  v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    cmhi        v18.16b,  v14.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    SUB         v20.16b,  v18.16b ,  v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    LDRB        w11,[x12,#16]               //pu1_src_cpy[16]
+    mov         v14.8b[0], w11              //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    EXT         v14.16b,  v12.16b ,  v14.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+
+    cmhi        v16.16b,  v12.16b ,  v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    cmhi        v18.16b,  v14.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    SUB         v22.16b,  v18.16b ,  v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v24.16b,  v2.16b ,  v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
+    ADD         v24.16b,  v24.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
+
+    TBL         v24.16b, {v10.16b},v24.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+//  TBL v25.8b, {v10.16b},v25.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    AND         v24.16b,  v24.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    NEG         v20.16b, v22.16b            //sign_left = vnegq_s8(sign_right)
+    EXT         v20.16b,  v20.16b ,  v22.16b,#15 //sign_left = vextq_s8(sign_left, sign_left, 15)
+
+    TBL         v26.8b, {v11.16b},v24.8b    //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    Uxtl        v28.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SADDW       v28.8h,  v28.8h ,  v26.8b
+    SMAX        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v6.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SUB         x14,x10,x4                  //ht - row
+    mul         x14, x14, x1                //(ht - row) * src_strd
+    ADD         x11,x14,x5                  //(ht - row) * src_strd + (wd - 1)
+    LDRB        w14,[x6, x11]               //pu1_src_org[(ht - row) * src_strd + (wd - 1)]
+    STRB        w14,[x2],#1                 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+    ST1         {v28.8b},[x12],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    SUBS        x4,x4,#1                    //Decrement row by 1
+    BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to the pu1_src loop
+
+END_LOOPS:
+    // LDMFD sp!,{x4-x12,x15}              //Reload the registers from SP
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
new file mode 100644
index 0000000..d854c62
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
@@ -0,0 +1,483 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class0_chroma.s
+//*
+//* @brief
+//*  Contains function definitions for sample adaptive offset (SAO) edge
+//* offset of class 0 for chroma. Functions are coded in NEON assembly and
+//* can be compiled using ARM RVCT.
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src,
+//                              WORD32 src_strd,
+//                              UWORD8 *pu1_src_left,
+//                              UWORD8 *pu1_src_top,
+//                              UWORD8 *pu1_src_top_left,
+//                              UWORD8 *pu1_src_top_right,
+//                              UWORD8 *pu1_src_bot_left,
+//                              UWORD8 *pu1_avail,
+//                              WORD8 *pi1_sao_offset_u,
+//                              WORD8 *pi1_sao_offset_v,
+//                              WORD32 wd,
+//                              WORD32 ht)
+//
+//**************Variables Vs Registers*****************************************
+//x0  =>    *pu1_src
+//x1  =>    src_strd
+//x2  =>    *pu1_src_left
+//x3  =>    *pu1_src_top
+//x4  =>    *pu1_src_top_left
+//x7  =>    *pu1_avail
+//x8  =>    *pi1_sao_offset_u
+//x5  =>    *pi1_sao_offset_v
+//x9  =>    wd
+//x10 =>    ht
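+//
+//**************Reference C sketch (illustrative)*******************************
+//A hedged reconstruction, not the reference C code: the chroma variant works
+//on interleaved CbCr samples, so the horizontal neighbours sit two bytes away
+//and U and V use separate offset tables (SIGN(v) = (v > 0) - (v < 0)):
+//
+//    /* U at even col, V at odd col; the same edge rule applies to both */
+//    WORD32 sign_left  = SIGN(pu1_src[col] - pu1_src[col - 2]);
+//    WORD32 sign_right = SIGN(pu1_src[col] - pu1_src[col + 2]);
+//    WORD32 edge_idx   = gi1_table_edge_idx[2 + sign_left + sign_right];
+//    WORD8 *pi1_offset = (col & 1) ? pi1_sao_offset_v : pi1_sao_offset_u;
+//    if (edge_idx)
+//    {
+//        WORD32 tmp = pu1_src[col] + pi1_offset[edge_idx];
+//        pu1_src[col] = (UWORD8)(tmp < 0 ? 0 : (tmp > 255 ? 255 : tmp));
+//    }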
+
+.text
+.p2align 2
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class0_chroma_av8
+
+ihevc_sao_edge_offset_class0_chroma_av8:
+
+    ldr         x8,[sp,#0]
+    ldr         x9,[sp,#8]
+    ldr         w10,[sp,#16]
+    ldr         w11,[sp,#24]
+
+    push_v_regs
+
+    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+    stp         x23, x24,[sp,#-16]!
+    stp         x25, x26,[sp,#-16]!
+
+    mov         x15,x4 // *pu1_src_top_left 40
+    mov         x16,x5 // *pu1_src_top_right 44
+    mov         x17,x6 // *pu1_src_bot_left 48
+    mov         x21,x7 // *pu1_avail 52
+    mov         x22,x8 // *pi1_sao_offset_u 56
+    mov         x23,x9 // *pi1_sao_offset_v 60
+    mov         x24,x10 // wd 64
+    mov         x25,x11 // ht 68
+
+    MOV         x9, x24                     //Loads wd
+
+    MOV         x4, x15                     //Loads pu1_src_top_left
+    ADD         x11,x3,x9                   //pu1_src_top[wd]
+
+    MOV         x10, x25                    //Loads ht
+    movi        v2.16b, #2                  //const_2 = vdupq_n_s8(2)
+    SUB         x20,x11,#2
+    LDRH        w12,[x20]                   //pu1_src_top[wd - 1]
+
+    MOV         x7, x21                     //Loads pu1_avail
+    movi        v4.8h, #0                   //const_min_clip = vdupq_n_s16(0)
+    STRH        w12,[x4]                    //*pu1_src_top_left = pu1_src_top[wd - 1]
+
+    MOV         x8, x22                     //Loads pi1_sao_offset_u
+    movi        v6.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    SUB         x4,x10,#1                   //(ht - 1)
+
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+    movi        v8.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
+    mul         x4, x4, x1                  //(ht - 1) * src_strd
+
+    MOV         x5, x23                     //Loads pi1_sao_offset_v
+    LD1         {v11.8b},[x8]               //offset_tbl = vld1_s8(pi1_sao_offset_u)
+    ADD         x4,x4,x0                    //pu1_src[(ht - 1) * src_strd]
+
+    MOV         x6,x0                       //pu1_src_org
+    LD1         {v10.8b},[x14]              //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    MOV         x12,x9                      //Move wd to x12 for loop count
+
+SRC_TOP_LOOP:                               //wd is always a multiple of 8
+    LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
+    SUBS        x12,x12,#8                  //Decrement the loop counter by 8
+    ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
+    BNE         SRC_TOP_LOOP
+    ADD         x6,x6,#14                   //pu1_src_org[14]
+
+    MOV         x3,x2                       //pu1_src_left backup to reload later
+    LD1         {v0.8b},[x5]                //offset_tbl = vld1_s8(pi1_sao_offset_v)
+    CMP         x9,#16                      //Compare wd with 16
+
+    BLT         WIDTH_RESIDUE               //If wd < 16, jump to WIDTH_RESIDUE, where the loop handles the 8-pixel case
+
+    MOV         x8,x9                       //move wd to x8 for loop count
+
+WIDTH_LOOP_16:
+    CMP         x8,x9                       //if(col == wd)
+    BNE         AU1_MASK_FF                 //jump to else part
+    LDRB        w12,[x7]                    //pu1_avail[0]
+    mov         v8.8b[0], w12               //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+    mov         v8.8b[1], w12               //vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
+    B           SKIP_AU1_MASK_FF            //Skip the else part
+
+AU1_MASK_FF:
+    MOV         x12,#-1                     //move -1 to x12
+    mov         v8.4h[0], w12               //au1_mask = vsetq_lane_u16(0xFFFF, au1_mask, 0) (mask bytes 0,1 for U,V)
+
+SKIP_AU1_MASK_FF:
+    CMP         x8,#16                      //If col == 16
+    BNE         SKIP_MASKING_IF_NOT16       //If not skip masking
+    LDRB        w12,[x7,#1]                 //pu1_avail[1]
+    mov         v8.8b[14], w12              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+    mov         v8.8b[15], w12              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_MASKING_IF_NOT16:
+    MOV         x12,x0                      //pu1_src_cpy = pu1_src
+    MOV         x4,x10                      //move ht to x4 for loop count
+
+PU1_SRC_LOOP:
+    LDRH        w11,[x2]                    //load pu1_src_left; ht - row = 0 on the first iteration, pu1_src_left is incremented later
+    LD1         {v12.16b},[x12],x1          //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    //LD1 {v13.8b},[x12],x1                    //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    //SUB x12, x12,#8
+    SUB         x5,x9,x8                    //wd - col
+
+    SUB         x14,x10,x4                  //ht - row
+    mov         v14.4h[7], w11              //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+    mul         x14, x14, x1                //(ht - row) * src_strd
+
+    LD1         {v30.16b},[x12]             //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    //LD1 {v31.8b},[x12]                    //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    //SUB x12, x12,#8
+    EXT         v14.16b,  v14.16b ,  v12.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+    SUB         x12,x12,x1
+
+    LDRH        w11,[x2,#2]                 //II load pu1_src_left since ht - row =0
+    cmhi        v16.16b,  v12.16b ,  v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    ADD         x5,x14,x5                   //(ht - row) * src_strd + (wd - col)
+
+    mov         v28.4h[7], w11              //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+    cmhi        v18.16b,  v14.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    LDRH        w14,[x6,x5]                 //pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
+    SUB         v20.16b,  v18.16b ,  v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUB         x4,x4,#1
+
+    LDRB        w11,[x12,#16]               //pu1_src_cpy[16]
+    EXT         v28.16b,  v28.16b ,  v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+
+    mov         v14.8b[0], w11              //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    cmhi        v26.16b,  v30.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    LDRB        w11,[x12,#17]               //pu1_src_cpy[17]
+    cmhi        v24.16b,  v28.16b ,  v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    STRH        w14,[x2],#2                 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+    ADD         x12,x12,x1
+    mov         v14.8b[1], w11              //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+    LDRB        w11,[x12,#16]               //II pu1_src_cpy[16]
+
+    EXT         v14.16b,  v12.16b ,  v14.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+    mov         v28.8b[0], w11              //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+
+    LDRB        w11,[x12,#17]               //II pu1_src_cpy[17]
+    cmhi        v16.16b,  v12.16b ,  v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    SUB         x12,x12,x1
+
+    cmhi        v18.16b,  v14.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    mov         v28.8b[1], w11              //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+
+    SUB         v22.16b,  v18.16b ,  v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    EXT         v28.16b,  v30.16b ,  v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+
+    ADD         v14.16b,  v2.16b ,  v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
+
+    mov         v10.d[1],v10.d[0]
+    ADD         v14.16b,  v14.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
+    TBL         v14.16b, {v10.16b},v14.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    SUB         v20.16b,  v24.16b ,  v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    cmhi        v26.16b,  v30.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+//    TBL v15.8b, {v10.16b},v15.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    cmhi        v24.16b,  v28.16b ,  v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    AND         v14.16b,  v14.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v15.d[0],v14.d[1]
+    UZP1        v1.8b, v14.8b, v15.8b
+    UZP2        v15.8b, v14.8b, v15.8b
+    mov         v14.8b, v1.8b
+
+    //mov v11.d[1],v0.d[0]
+    //mov v14.d[1],v15.d[0]
+    SUB         v22.16b,  v24.16b ,  v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    TBL         v16.8b, {v11.16b},v14.8b    //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+    ADD         v24.16b,  v2.16b ,  v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
+
+    Uxtl        v18.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    TBL         v17.8b, {v0.16b},v15.8b
+    ADD         v24.16b,  v24.16b ,  v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+    //mov v17.d[0],v16.d[1]
+    ZIP1        v1.8b, v16.8b, v17.8b
+    ZIP2        v17.8b, v16.8b, v17.8b
+    mov         v16.8b, v1.8b
+    TBL         v24.16b, {v10.16b},v24.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    Uxtl2       v12.8h, v12.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    //mov v16.d[1],v17.d[0]
+    SADDW       v18.8h,  v18.8h ,  v16.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    //TBL v25.8b, {v10.16b},v25.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    SMAX        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    AND         v24.16b,  v24.16b ,  v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v25.d[0],v24.d[1]
+    UMIN        v18.8h,  v18.8h ,  v6.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    UZP1        v1.8b, v24.8b, v25.8b
+    UZP2        v25.8b, v24.8b, v25.8b      //II
+    mov         v24.8b, v1.8b
+
+    //mov v24.d[1],v25.d[0]
+    SADDW       v12.8h,  v12.8h ,  v17.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    TBL         v26.8b, {v11.16b},v24.8b    //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+    SMAX        v12.8h,  v12.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    UMIN        v12.8h,  v12.8h ,  v6.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    TBL         v27.8b, {v0.16b},v25.8b     //II
+    xtn         v14.8b,  v18.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    //mov v27.d[0],v26.d[1]
+    xtn         v15.8b,  v12.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])
+    ZIP1        v1.8b, v26.8b, v27.8b
+    ZIP2        v27.8b, v26.8b, v27.8b      //II
+    mov         v26.8b, v1.8b
+
+    //mov v26.d[1],v27.d[0]
+    SUB         x5,x9,x8                    //II wd - col
+    Uxtl        v28.8h, v30.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SUB         x14,x10,x4                  //II ht - row
+
+    mul         x14, x14, x1                //II (ht - row) * src_strd
+    SADDW       v28.8h,  v28.8h ,  v26.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    ADD         x5,x14,x5                   //II (ht - row) * src_strd + (wd - col)
+
+    LDRH        w14,[x6,x5]                 //II pu1_src_org[(ht - row) * src_strd + (wd - col)]
+    SMAX        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    STRH        w14,[x2],#2                 //II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+    UMIN        v28.8h,  v28.8h ,  v6.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    //mov       v31.2d[0],v30.2d[1]
+    Uxtl2       v30.8h, v30.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    SADDW       v30.8h,  v30.8h ,  v27.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    ST1         {v14.8b, v15.8b},[x12],x1   //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    SMAX        v30.8h,  v30.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    SUBS        x4,x4,#1                    //Decrement row by 1
+    UMIN        v30.8h,  v30.8h ,  v6.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
+    xtn         v29.8b,  v30.8h             //II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    ST1         {v28.8b, v29.8b},[x12],x1   //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BNE         PU1_SRC_LOOP                //If not equal jump to the inner loop
+
+    ADD         x0,x0,#16                   //pu1_src += 16
+
+    SUBS        x8,x8,#16                   //Decrement column by 16
+    CMP         x8,#8                       //Check whether residue remains
+    MOV         x2,x3                       //Reload pu1_src_left
+    BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
+    BGT         WIDTH_LOOP_16               //If more than 8 columns remain jump to width_loop
+    BLT         END_LOOPS                   //Jump to end function
+
+WIDTH_RESIDUE:
+    SUB         x6,x6,#14
+    AND         x8,x9,#0xF                  //wd_rem = wd & 0xF
+    CMP         x8,#0                       //Residue check
+    BEQ         END_LOOPS                   //No residue, jump to end of function
+
+    CMP         x8,x9                       //if(wd_rem == wd)
+    BNE         AU1_MASK_FF_RESIDUE         //jump to else part
+    LDRB        w12,[x7]                    //pu1_avail[0]
+    mov         v8.8b[0], w12               //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+    mov         v8.8b[1], w12               //vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
+    B           SKIP_AU1_MASK_FF_RESIDUE    //Skip the else part
+
+AU1_MASK_FF_RESIDUE:
+    MOV         x12,#-1                     //move -1 to x12
+    mov         v8.4h[0], w12               //au1_mask = vsetq_lane_s16(-1, au1_mask, 0), sets bytes 0 and 1
+
+SKIP_AU1_MASK_FF_RESIDUE:
+    LDRB        w12,[x7,#1]                 //pu1_avail[1]
+    mov         v8.8b[6], w12               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+    mov         v8.8b[7], w12               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+    MOV         x12,x0                      //pu1_src_cpy = pu1_src
+    MOV         x4,x10                      //move ht to x4 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+    LDRH        w11,[x2]                    //load pu1_src_left
+    LD1         {v12.16b},[x12],x1          //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    //LD1 {v13.8b},[x12],x1                    //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    //SUB x12, x12,#8
+    SUB         x5,x9,#2                    //wd - 2
+
+    SUB         x14,x10,x4                  //(ht - row)
+    mov         v14.4h[7], w11              //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 7)
+    LSL         x14,x14,#1                  //(ht - row) * 2
+
+    LD1         {v30.16b},[x12]             //II pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    //LD1 {v31.8b},[x12]                    //II pu1_cur_row = vld1q_u8(pu1_src_cpy)
+    //SUB x12, x12,#8
+    EXT         v14.16b,  v14.16b ,  v12.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+    SUB         x12,x12,x1
+
+    LDRH        w11,[x2,#2]                 //II load pu1_src_left
+    cmhi        v16.16b,  v12.16b ,  v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    mul         x14, x14, x1                //(ht - row) * 2 * src_strd
+
+    cmhi        v18.16b,  v14.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    mov         v28.4h[7], w11              //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 7)
+
+    LDRB        w11,[x12,#16]               //pu1_src_cpy[16]
+    SUB         v20.16b,  v18.16b ,  v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    ADD         x5,x14,x5                   //(ht - row) * 2 * src_strd + (wd - 2)
+
+    mov         v14.8b[0], w11              //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    EXT         v28.16b,  v28.16b ,  v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+
+    LDRB        w11,[x12,#17]               //pu1_src_cpy[17]
+    cmhi        v26.16b,  v30.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    LDRH        w14,[x6, x5]                //pu1_src_org[(ht - row) * 2 * src_strd + (wd - 2)]
+
+    mov         v14.8b[1], w11              //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+    cmhi        v24.16b,  v28.16b ,  v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    ADD         x12,x12,x1
+
+    STRH        w14,[x2],#2                 //pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
+    EXT         v14.16b,  v12.16b ,  v14.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+    LDRB        w11,[x12,#16]               //II pu1_src_cpy[16]
+
+    cmhi        v16.16b,  v12.16b ,  v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    mov         v28.8b[0], w11              //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+
+    LDRB        w11,[x12,#17]               //II pu1_src_cpy[17]
+    cmhi        v18.16b,  v14.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    SUB         x4,x4,#1                    //II Decrement row by 1
+
+    SUB         v22.16b,  v18.16b ,  v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    mov         v28.8b[1], w11              //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+    SUB         x12,x12,x1
+
+    ADD         v14.16b,  v2.16b ,  v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
+    EXT         v28.16b,  v30.16b ,  v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+
+    ADD         v14.16b,  v14.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
+
+    SUB         v20.16b,  v24.16b ,  v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    TBL         v14.16b, {v10.16b},v14.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    cmhi        v26.16b,  v30.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+    cmhi        v24.16b,  v28.16b ,  v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    //TBL v15.8b, {v10.16b},v15.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    SUB         v22.16b,  v24.16b ,  v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    AND         v14.16b,  v14.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v15.d[0],v14.d[1]
+    UZP1        v1.8b, v14.8b, v15.8b
+    UZP2        v15.8b, v14.8b, v15.8b
+    mov         v14.8b, v1.8b
+
+    ADD         v28.16b,  v2.16b ,  v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
+    TBL         v16.8b, {v11.16b},v14.8b    //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+    ADD         v28.16b,  v28.16b ,  v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+    Uxtl        v18.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    TBL         v17.8b, {v0.16b},v15.8b
+    Uxtl        v24.8h, v30.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    ZIP1        v1.8b, v16.8b, v17.8b
+    ZIP2        v17.8b, v16.8b, v17.8b
+    mov         v16.8b, v1.8b
+    TBL         v28.16b, {v10.16b},v28.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    SADDW       v18.8h,  v18.8h ,  v16.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    SMAX        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    //TBL v29.8b, {v10.16b},v29.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    UMIN        v18.8h,  v18.8h ,  v6.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    xtn         v18.8b,  v18.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    AND         v28.16b,  v28.16b ,  v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v29.d[0],v28.d[1]
+    SUB         x5,x9,#2                    //II wd - 2
+    UZP1        v1.8b, v28.8b, v29.8b
+    UZP2        v29.8b, v28.8b, v29.8b      //II
+    mov         v28.8b, v1.8b
+    SUB         x14,x10,x4                  //II (ht - row)
+
+    LSL         x14,x14,#1                  //II (ht - row) * 2
+    TBL         v26.8b, {v11.16b},v28.8b    //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+    mul         x14, x14, x1                //II (ht - row) * 2 * src_strd
+
+    ADD         x5,x14,x5                   //II (ht - row) * 2 * src_strd + (wd - 2)
+    TBL         v27.8b, {v0.16b},v29.8b     //II
+    LDRH        w14,[x6, x5]                //II pu1_src_org[(ht - row) * 2 * src_strd + (wd - 2)]
+
+    ZIP1        v1.8b, v26.8b, v27.8b
+    ZIP2        v27.8b, v26.8b, v27.8b      //II
+    mov         v26.8b, v1.8b
+    ST1         {v18.8b},[x12],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    STRH        w14,[x2],#2                 //II pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
+    SADDW       v24.8h,  v24.8h ,  v26.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SUBS        x4,x4,#1                    //Decrement row by 1
+
+    SMAX        v24.8h,  v24.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v24.8h,  v24.8h ,  v6.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    xtn         v28.8b,  v24.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    ST1         {v28.8b},[x12],x1           //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to the pu1_src loop
+
+END_LOOPS:
+    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
+    ldp         x25, x26,[sp],#16
+    ldp         x23, x24,[sp],#16
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class1.s b/common/arm64/ihevc_sao_edge_offset_class1.s
new file mode 100644
index 0000000..8ed6169
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class1.s
@@ -0,0 +1,364 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class1.s
+//*
+//* @brief
+//*  Contains function definitions for the sample adaptive offset (SAO) edge
+//* offset filter of class 1 (vertical edges). Functions are coded using
+//* NEON instructions and can be compiled using ARM RVCT
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
+//                              WORD32 src_strd,
+//                              UWORD8 *pu1_src_left,
+//                              UWORD8 *pu1_src_top,
+//                              UWORD8 *pu1_src_top_left,
+//                              UWORD8 *pu1_src_top_right,
+//                              UWORD8 *pu1_src_bot_left,
+//                              UWORD8 *pu1_avail,
+//                              WORD8 *pi1_sao_offset,
+//                              WORD32 wd,
+//                              WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 =>    *pu1_src
+//x1 =>    src_strd
+//x2 =>    *pu1_src_left
+//x3 =>    *pu1_src_top
+//x4    =>    *pu1_src_top_left
+//x5    =>    *pu1_avail
+//x6    =>    *pi1_sao_offset
+//x7    =>    wd
+//x8 =>    ht
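+//
+//A rough scalar sketch of what the vectorised code below computes per pixel
+//(illustrative only, not part of the build; SIGN and CLIP3 are the usual
+//helper macros, assumed here; pu1_src points at the current row):
+//
+//    for (row = 0; row < ht; row++)
+//        for (col = 0; col < wd; col++)
+//        {
+//            WORD32 sign_up   = SIGN(pu1_src[col] - pu1_src[col - src_strd]);
+//            WORD32 sign_down = SIGN(pu1_src[col] - pu1_src[col + src_strd]);
+//            WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
+//
+//            pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx],
+//                                 0, (1 << BIT_DEPTH) - 1);
+//        }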
+
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class1_av8
+
+ihevc_sao_edge_offset_class1_av8:
+
+
+    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    MOV         x5,x7                       //Loads pu1_avail
+
+    LDR         x6,[sp]                     //Loads pi1_sao_offset
+    LDR         w7,[sp,#8]                  //Loads wd
+    LDR         w8,[sp,#16]                 //Loads ht
+
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    SUB         x9,x7,#1                    //wd - 1
+    LDRB        w10,[x3,x9]                 //pu1_src_top[wd - 1]
+    STRB        w10,[x4]                    //*pu1_src_top_left = pu1_src_top[wd - 1]
+    ADD         x10,x0,x9                   //pu1_src[row * src_strd + wd - 1]
+    MOV         x11,x2                      //Move pu1_src_left pointer to x11
+    MOV         x12,x8                      //Move ht to x12 for loop count
+SRC_LEFT_LOOP:
+    LDRB        w14,[x10]                   //Load pu1_src[row * src_strd + wd - 1]
+    ADD         x10,x10,x1
+    STRB        w14,[x11],#1                //pu1_src_left[row]
+    SUBS        x12, x12,#1                 //Decrement the loop count
+    BNE         SRC_LEFT_LOOP               //If not equal to 0 jump to the src_left_loop
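+    //pu1_src_left now caches the last column of every row; it is captured
+    //before the rows are filtered in place so that the block to the right
+    //still sees the unfiltered left-neighbour samples.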
+
+    SUB         x12,x8,#1                   //ht - 1
+    mul         x12, x12, x1                //(ht - 1) * src_strd
+    ADD         x12,x12,x0                  //pu1_src[(ht - 1) * src_strd]
+
+    LDRB        w4,[x5,#2]                  //pu1_avail[2]
+    CMP         x4,#0                       //0 == pu1_avail[2]
+    ADD         x20,x0,x1                   //pu1_src += src_strd
+    csel        x0, x20, x0,EQ
+    SUB         x20,x8,#1                   //ht--
+    csel        x8, x20, x8,EQ
+
+    LDRB        w4,[x5,#3]                  //pu1_avail[3]
+    CMP         x4,#0                       //0 == pu1_avail[3]
+    SUB         x20,x8,#1                   //ht--
+    csel        x8, x20, x8,EQ
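+    //When pu1_avail[2] is 0 the row above is unavailable, so the first row
+    //is skipped (pu1_src += src_strd, ht--); when pu1_avail[3] is 0 the row
+    //below is unavailable and the last row is dropped (ht--). The
+    //EQ-predicated csel pairs stand in for the conditionally executed moves
+    //of the original ARMv7 version.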
+
+    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
+    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
+    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
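+    //Position-independent load of the edge index table: ADRP forms the page
+    //address of the GOT entry for gi1_table_edge_idx and the :got_lo12: load
+    //fetches the table's final address from it.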
+    LD1         {v6.8b},[x14]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    LD1         {v7.8b},[x6]                //offset_tbl = vld1_s8(pi1_sao_offset)
+
+    CMP         x7,#16                      //Compare wd with 16
+    BLT         WIDTH_RESIDUE               //If wd < 16 jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+
+WIDTH_LOOP_16:
+    LDRB        w4,[x5,#2]                  //pu1_avail[2]
+    CMP         x4,#0                       //0 == pu1_avail[2]
+    SUB         x20,x0,x1                   //pu1_src -= src_strd
+    csel        x9, x20, x9,EQ
+    csel        x9, x3, x9,NE               //*pu1_src_top
+
+    MOV         x10,x0                      //*pu1_src
+
+    LD1         {v8.16b},[x9],#16           //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    LD1         {v10.16b},[x0],#16          //pu1_cur_row = vld1q_u8(pu1_src)
+
+    LD1         {v30.16b},[x12],#16         //vld1q_u8(pu1_src[(ht - 1) * src_strd])
+    cmhi        v12.16b,  v10.16b ,  v8.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+    ST1         { v30.16b},[x3],#16         //vst1q_u8(pu1_src_top[col])
+    cmhi        v14.16b,  v8.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+
+    SUB         v16.16b,  v14.16b ,  v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         x11,x8                      //move ht to x11 for loop count
+
+PU1_SRC_LOOP:
+    ADD         x10,x10,x1                  //*pu1_src + src_strd
+    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    ADD         x6,x10,x1                   //II Iteration *pu1_src + src_strd
+
+    cmhi        v12.16b,  v10.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+    LD1         {v30.16b},[x6]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+
+    cmhi        v14.16b,  v18.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         x10,x10,x1
+
+    SUB         v20.16b,  v14.16b ,  v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    Uxtl        v26.8h, v18.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    ADD         v12.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    Uxtl2       v28.8h, v18.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    ADD         v12.16b,  v12.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    cmhi        v22.16b,  v18.16b ,  v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+    NEG         v16.16b, v20.16b            //sign_up = vnegq_s8(sign_down)
+    TBL         v12.16b, {v6.16b},v12.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    cmhi        v24.16b,  v30.16b ,  v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+    SUB         v8.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+//  TBL v13.8b, {v6.16b},v13.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    ADD         v22.16b,  v0.16b ,  v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+
+
+    NEG         v16.16b, v8.16b             //II sign_up = vnegq_s8(sign_down)
+    TBL         v12.16b, {v7.16b},v12.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    ADD         v22.16b,  v22.16b ,  v8.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+
+    Uxtl        v20.8h, v10.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    TBL         v22.16b, {v6.16b},v22.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    SADDW       v20.8h,  v20.8h ,  v12.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+//  TBL v23.8b, {v6.16b},v23.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+
+    Uxtl2       v8.8h, v10.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+//  TBL v13.8b, {v7.16b},v13.8b                    //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    mov         v10.16b, v30.16b            //II pu1_cur_row = pu1_next_row
+
+    SADDW2      v8.8h,  v8.8h ,  v12.16b    //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    TBL         v24.16b, {v7.16b},v22.16b   //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    SMAX        v8.8h,  v8.8h ,  v2.8h      //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    UMIN        v8.8h,  v8.8h ,  v4.8h      //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+//  TBL v25.8b, {v7.16b},v23.8b                    //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    xtn2        v20.16b,  v8.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])
+    SADDW2      v28.8h,  v28.8h ,  v24.16b  //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    ST1         { v20.16b},[x10],x1         //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    xtn         v30.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUBS        x11,x11,#2                  //II Decrement the ht loop count by 2
+    xtn2        v30.16b,  v28.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    ST1         { v30.16b},[x10],x1         //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BEQ         PU1_SRC_LOOP_END            //ht was decremented when pu1_avail[2] or pu1_avail[3] is 0, so one row may remain
+    CMP         x11,#1                      //checking any residue remains
+    BGT         PU1_SRC_LOOP                //If more than one row remains jump to PU1_SRC_LOOP
+
+    ADD         x10,x10,x1                  //*pu1_src + src_strd
+    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    cmhi        v12.16b,  v10.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+    cmhi        v14.16b,  v18.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         v20.16b,  v14.16b ,  v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUB         x10,x10,x1
+
+    ADD         v22.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v22.16b,  v22.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v22.16b, {v6.16b},v22.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+//  TBL v23.8b, {v6.16b},v23.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    TBL         v24.16b, {v7.16b},v22.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    Uxtl        v26.8h, v10.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SADDW       v26.8h,  v26.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+//  TBL v25.8b, {v7.16b},v23.8b                    //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    Uxtl2       v28.8h, v10.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    SADDW2      v28.8h,  v28.8h ,  v24.16b  //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    xtn         v30.8b,  v26.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    xtn2        v30.16b,  v28.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    ST1         { v30.16b},[x10],x1         //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+PU1_SRC_LOOP_END:
+    mov         v10.16b, v18.16b            //pu1_cur_row = pu1_next_row
+    SUBS        x7,x7,#16                   //Decrement the wd loop count by 16
+    CMP         x7,#8                       //Check whether residue remains
+    BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
+    BGT         WIDTH_LOOP_16               //If more than 8 columns remain jump to width_loop
+    BLT         END_LOOPS                   //Jump to end function
+
+
+WIDTH_RESIDUE:
+    LDRB        w4,[x5,#2]                  //pu1_avail[2]
+    CMP         x4,#0                       //0 == pu1_avail[2]
+    SUB         x20,x0,x1                   //pu1_src -= src_strd
+    csel        x9, x20, x9,EQ
+    csel        x9, x3, x9,NE               //*pu1_src_top
+    MOV         x10,x0
+
+    LD1         {v8.16b},[x9],#16           //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    LD1         {v10.16b},[x0],#16          //pu1_cur_row = vld1q_u8(pu1_src)
+
+    LD1         {v30.8b},[x12]              //vld1_u8(pu1_src[(ht - 1) * src_strd])
+    ST1         {v30.8b},[x3]               //vst1_u8(pu1_src_top[col])
+
+    cmhi        v12.16b,  v10.16b ,  v8.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    cmhi        v14.16b,  v8.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    SUB         v16.16b,  v14.16b ,  v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         x11,x8                      //move ht to x11 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+    ADD         x10,x10,x1                  //*pu1_src + src_strd
+    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    ADD         x6,x10,x1                   //II Iteration *pu1_src + src_strd
+
+    cmhi        v12.16b,  v10.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+    LD1         {v30.16b},[x6]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+
+    cmhi        v14.16b,  v18.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         x10,x10,x1
+
+    SUB         v20.16b,  v14.16b ,  v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    Uxtl        v26.8h, v18.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    ADD         v12.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    cmhi        v22.16b,  v18.16b ,  v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+    ADD         v12.16b,  v12.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    cmhi        v24.16b,  v30.16b ,  v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+    NEG         v16.16b, v20.16b            //sign_up = vnegq_s8(sign_down)
+    TBL         v12.8b, {v6.16b},v12.8b     //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    SUB         v20.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v22.16b,  v0.16b ,  v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+    TBL         v12.8b, {v7.16b},v12.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    NEG         v16.16b, v20.16b            //II sign_up = vnegq_s8(sign_down)
+
+    ADD         v22.16b,  v22.16b ,  v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+    Uxtl        v20.8h, v10.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    SADDW       v20.8h,  v20.8h ,  v12.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    TBL         v22.8b, {v6.16b},v22.8b     //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    TBL         v24.8b, {v7.16b},v22.8b     //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    mov         v10.16b, v30.16b            //II pu1_cur_row = pu1_next_row
+    ST1         {v20.8b},[x10],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    xtn         v30.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SUBS        x11,x11,#2                  //Decrement the ht loop count by 2
+    ST1         {v30.8b},[x10],x1           //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BEQ         END_LOOPS
+    CMP         x11,#1
+    BGT         PU1_SRC_LOOP_RESIDUE        //If more than one row remains jump to PU1_SRC_LOOP_RESIDUE
+
+
+    ADD         x10,x10,x1                  //*pu1_src + src_strd
+    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    cmhi        v12.16b,  v10.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+    cmhi        v14.16b,  v18.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         v20.16b,  v14.16b ,  v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUB         x10,x10,x1
+
+    ADD         v22.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v22.16b,  v22.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v22.8b, {v6.16b},v22.8b     //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+    TBL         v24.8b, {v7.16b},v22.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    Uxtl        v26.8h, v10.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SADDW       v26.8h,  v26.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    xtn         v30.8b,  v26.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    ST1         {v30.8b},[x10],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+END_LOOPS:
+    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class1_chroma.s b/common/arm64/ihevc_sao_edge_offset_class1_chroma.s
new file mode 100644
index 0000000..4baa5bf
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class1_chroma.s
@@ -0,0 +1,467 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class1_chroma.s
+//*
+//* @brief
+//*  Contains function definitions for the sample adaptive offset (SAO) edge
+//* offset filter of class 1 (vertical edges) for interleaved chroma.
+//* Functions are coded using NEON instructions and can be compiled using ARM RVCT
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class1_chroma(UWORD8 *pu1_src,
+//                              WORD32 src_strd,
+//                              UWORD8 *pu1_src_left,
+//                              UWORD8 *pu1_src_top,
+//                              UWORD8 *pu1_src_top_left,
+//                              UWORD8 *pu1_src_top_right,
+//                              UWORD8 *pu1_src_bot_left,
+//                              UWORD8 *pu1_avail,
+//                              WORD8 *pi1_sao_offset_u,
+//                              WORD8 *pi1_sao_offset_v,
+//                              WORD32 wd,
+//                              WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 =>    *pu1_src
+//x1 =>    src_strd
+//x2 =>    *pu1_src_left
+//x3 =>    *pu1_src_top
+//x4    =>    *pu1_src_top_left
+//x5    =>    *pu1_avail
+//x6    =>    *pi1_sao_offset_u
+//x7    =>    *pi1_sao_offset_v
+//x8    =>    wd
+//x9 =>    ht
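+//
+//The chroma variant processes interleaved UV samples: per pixel it is the
+//same vertical (class 1) computation sketched in
+//ihevc_sao_edge_offset_class1.s, except that U and V each use their own
+//offset table. Roughly (illustrative only; sign_up/sign_down are taken
+//against the same column in the rows above and below, helpers as before):
+//
+//    edge_idx = gi1_table_edge_idx[2 + sign_up + sign_down];
+//    offset   = (col & 1) ? pi1_sao_offset_v[edge_idx]
+//                         : pi1_sao_offset_u[edge_idx];
+//    pu1_src[col] = CLIP3(pu1_src[col] + offset, 0, 255);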
+
+.text
+.p2align 2
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class1_chroma_av8
+
+ihevc_sao_edge_offset_class1_chroma_av8:
+
+
+    ldr         x8,[sp,#0]
+    ldr         x9,[sp,#8]
+    ldr         w10,[sp,#16]
+    ldr         w11,[sp,#24]
+
+
+    push_v_regs
+    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+    stp         x23, x24,[sp,#-16]!
+    stp         x25, x26,[sp,#-16]!
+
+    mov         x15,x4 // *pu1_src_top_left 40
+    mov         x16,x5 // *pu1_src_top_right 44
+    mov         x17,x6 // *pu1_src_bot_left 48
+    mov         x21,x7 // *pu1_avail 52
+    mov         x22,x8 // *pi1_sao_offset_u 56
+    mov         x23,x9 // *pi1_sao_offset_v 60
+    mov         x24,x10 // wd 64
+    mov         x25,x11 // ht 68
+
+    mov         x4,x15
+    mov         x5,x21
+    mov         x6,x22
+    mov         x7,x23
+    mov         x8,x24
+    mov         x9,x25
+
+    SUB         x10,x8,#2                   //wd - 2
+    LDRH        w11,[x3,x10]                //pu1_src_top[wd - 2]
+    STRH        w11,[x4]                    //*pu1_src_top_left = pu1_src_top[wd - 2]
+    ADD         x11,x0,x10                  //pu1_src[row * src_strd + wd - 2]
+    MOV         x12,x2                      //Move pu1_src_left pointer to x12
+    MOV         x14,x9                      //Move ht to x14 for loop count
+SRC_LEFT_LOOP:
+    LDRH        w10,[x11]                   //Load pu1_src[row * src_strd + wd - 2]
+    ADD         x11,x11,x1
+    STRH        w10,[x12],#2                //pu1_src_left[row]
+    SUBS        x14, x14,#1                 //Decrement the loop count
+    BNE         SRC_LEFT_LOOP               //If not equal to 0 jump to the src_left_loop
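+    //pu1_src_left now caches the last UV pair of every row, saved before the
+    //rows are filtered in place so the block to the right still sees the
+    //unfiltered left-neighbour samples.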
+
+    SUB         x12,x9,#1                   //ht - 1
+    mul         x12, x12, x1                //(ht - 1) * src_strd
+    ADD         x12,x12,x0                  //pu1_src[(ht - 1) * src_strd]
+
+    LDRB        w4,[x5,#2]                  //pu1_avail[2]
+    CMP         x4,#0                       //0 == pu1_avail[2]
+    ADD         x20,x0,x1                   //pu1_src += src_strd
+    csel        x0, x20, x0,EQ
+    SUB         x20,x9,#1                   //ht--
+    csel        x9, x20, x9,EQ
+
+    LDRB        w4,[x5,#3]                  //pu1_avail[3]
+    CMP         x4,#0                       //0 == pu1_avail[3]
+    SUB         x20,x9,#1                   //ht--
+    csel        x9, x20, x9,EQ
+
+    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
+    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
+    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+    LD1         {v6.8b},[x14]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    LD1         {v7.8b},[x6]                //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+    LD1         {v8.8b},[x7]                //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+
+    CMP         x8,#16                      //Compare wd with 16
+    BLT         WIDTH_RESIDUE               //If wd < 16 jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+
+WIDTH_LOOP_16:
+    LDRB        w4,[x5,#2]                  //pu1_avail[2]
+    CMP         x4,#0                       //0 == pu1_avail[2]
+    SUB         x20,x0,x1                   //pu1_src -= src_strd
+    csel        x11, x20, x11,EQ
+    csel        x11, x3, x11,NE             //*pu1_src_top
+
+    MOV         x10,x0                      //*pu1_src
+
+    LD1         {v28.16b},[x11],#16         //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    //LD1 {v29.8b},[x11],#8                    //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    LD1         {v10.16b},[x0],#16          //pu1_cur_row = vld1q_u8(pu1_src)
+    //LD1 {v11.8b},[x0],#8                    //pu1_cur_row = vld1q_u8(pu1_src)
+
+    LD1         {v30.16b},[x12],#16         //vld1q_u8(pu1_src[(ht - 1) * src_strd])
+    //LD1 {v31.8b},[x12],#8                    //vld1q_u8(pu1_src[(ht - 1) * src_strd])
+    cmhi        v12.16b,  v10.16b ,  v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+    ST1         { v30.16b},[x3],#16         //vst1q_u8(pu1_src_top[col])
+    cmhi        v14.16b,  v28.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+
+    SUB         v16.16b,  v14.16b ,  v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         x11,x9                      //move ht to x11 for loop count
+
+PU1_SRC_LOOP:
+    ADD         x10,x10,x1                  //*pu1_src + src_strd
+    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v19.8b},[x10]                    //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x10, x10,#8
+    ADD         x6,x10,x1                   //II Iteration *pu1_src + src_strd
+
+    //mov   v19.d[0],v18.d[1]
+    cmhi        v12.16b,  v10.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+    LD1         {v30.16b},[x6]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v31.8b},[x6]                    //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x6, x6,#8
+
+    cmhi        v14.16b,  v18.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         x10,x10,x1
+
+    SUB         v20.16b,  v14.16b ,  v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    Uxtl        v26.8h, v18.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    ADD         v12.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    Uxtl2       v28.8h, v18.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    ADD         v12.16b,  v12.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    cmhi        v22.16b,  v18.16b ,  v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+    mov         v16.d[1],v16.d[0]
+    NEG         v16.16b, v20.16b            //sign_up = vnegq_s8(sign_down)
+    TBL         v12.16b, {v6.16b},v12.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    cmhi        v24.16b,  v30.16b ,  v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+    SUB         v28.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    //TBL v13.8b, {v6.16b},v13.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    ADD         v22.16b,  v0.16b ,  v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+
+    mov         v13.d[0], v12.d[1]
+    UZP1        v27.8b, v12.8b, v13.8b
+    UZP2        v13.8b, v12.8b, v13.8b
+    mov         v12.8b,v27.8b
+    NEG         v16.16b, v28.16b            //II sign_up = vnegq_s8(sign_down)
+    TBL         v12.8b, {v7.16b},v12.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    ADD         v22.16b,  v22.16b ,  v28.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    Uxtl        v20.8h, v10.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    TBL         v13.8b, {v8.16b},v13.8b
+    ZIP1        v27.8b, v12.8b, v13.8b
+    ZIP2        v13.8b, v12.8b, v13.8b
+    mov         v12.8b,v27.8b
+
+    SADDW       v20.8h,  v20.8h ,  v12.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    TBL         v22.16b, {v6.16b},v22.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    //TBL v23.8b, {v6.16b},v23.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    mov         v23.d[0], v22.d[1]
+    UZP1        v27.8b, v22.8b, v23.8b
+    UZP2        v23.8b, v22.8b, v23.8b
+    mov         v22.8b,v27.8b
+
+    Uxtl2       v28.8h, v10.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    //VTBL.8        D13,D7,D13                    @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    mov         v10.16b, v30.16b            //II pu1_cur_row = pu1_next_row
+
+    SADDW       v28.8h,  v28.8h ,  v13.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    TBL         v24.8b, {v7.16b},v22.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    TBL         v25.8b, {v8.16b},v23.8b
+    ZIP1        v27.8b, v24.8b, v25.8b
+    ZIP2        v25.8b, v24.8b, v25.8b
+    mov         v24.8b,v27.8b
+    //VTBL.8        D24,D7,D22                    @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    //VTBL.8        D25,D7,D23                    @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    xtn2        v20.16b,  v28.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    Uxtl2       v28.8h, v18.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    SADDW       v28.8h,  v28.8h ,  v25.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    ST1         { v20.16b},[x10],x1         //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    xtn         v30.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUBS        x11,x11,#2                  //II Decrement the ht loop count by 2
+    xtn2        v30.16b,  v28.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    ST1         { v30.16b},[x10],x1         //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BEQ         PU1_SRC_LOOP_END            //ht was decremented when pu1_avail[2] or pu1_avail[3] is 0, so one row may remain
+    CMP         x11,#1                      //checking any residue remains
+    BGT         PU1_SRC_LOOP                //If more than one row remains jump to PU1_SRC_LOOP
+
+    ADD         x10,x10,x1                  //*pu1_src + src_strd
+    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v19.8b},[x10]                    //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x10, x10,#8
+    cmhi        v12.16b,  v10.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+    cmhi        v14.16b,  v18.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         v20.16b,  v14.16b ,  v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUB         x10,x10,x1
+
+    ADD         v22.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v22.16b,  v22.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v22.16b, {v6.16b},v22.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    //TBL v23.8b, {v6.16b},v23.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    mov         v23.d[0],v22.d[1]
+    UZP1        v27.8b, v22.8b, v23.8b
+    UZP2        v23.8b, v22.8b, v23.8b
+    mov         v22.8b,v27.8b
+    TBL         v24.8b, {v7.16b},v22.8b
+    TBL         v25.8b, {v8.16b},v23.8b
+    ZIP1        v27.8b, v24.8b, v25.8b
+    ZIP2        v25.8b, v24.8b, v25.8b
+    mov         v24.8b,v27.8b
+
+    //VTBL.8        D24,D7,D22                    @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    Uxtl        v26.8h, v10.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SADDW       v26.8h,  v26.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    //VTBL.8        D25,D7,D23                    @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    Uxtl2       v28.8h, v10.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    SADDW       v28.8h,  v28.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    xtn         v30.8b,  v26.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    xtn2        v30.16b,  v28.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    ST1         { v30.16b},[x10],x1         //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+PU1_SRC_LOOP_END:
+    mov         v10.16b, v18.16b            //pu1_cur_row = pu1_next_row
+    SUBS        x8,x8,#16                   //Decrement the wd loop count by 16
+    CMP         x8,#8                       //Check whether residue remains
+    BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
+    BGT         WIDTH_LOOP_16               //If more than 8 columns remain jump to width_loop
+    BLT         END_LOOPS                   //Jump to end function
+
+
+WIDTH_RESIDUE:
+    LDRB        w4,[x5,#2]                  //pu1_avail[2]
+    CMP         x4,#0                       //0 == pu1_avail[2]
+    SUB         x20,x0,x1                   //pu1_src -= src_strd
+    csel        x11, x20, x11,EQ
+    csel        x11, x3, x11,NE             //*pu1_src_top
+    MOV         x10,x0
+
+    LD1         {v28.16b},[x11]             //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    //LD1 {v29.8b},[x11],#8                    //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+    LD1         {v10.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+    //LD1 {v11.8b},[x0],#8                    //pu1_cur_row = vld1q_u8(pu1_src)
+
+    LD1         {v30.8b},[x12]              //vld1_u8(pu1_src[(ht - 1) * src_strd])
+    ST1         {v30.8b},[x3]               //vst1_u8(pu1_src_top[col])
+
+    cmhi        v12.16b,  v10.16b ,  v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    cmhi        v14.16b,  v28.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    SUB         v16.16b,  v14.16b ,  v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         x11,x9                      //move ht to x11 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+    ADD         x10,x10,x1                  //*pu1_src + src_strd
+    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v19.8b},[x10]                    //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x10, x10,#8
+    ADD         x6,x10,x1                   //II Iteration *pu1_src + src_strd
+
+    cmhi        v12.16b,  v10.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+    LD1         {v30.16b},[x6]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v31.8b},[x6]                    //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x6, x6,#8
+
+    cmhi        v14.16b,  v18.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         x10,x10,x1
+
+    SUB         v20.16b,  v14.16b ,  v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    Uxtl        v26.8h, v18.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    ADD         v12.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    cmhi        v22.16b,  v18.16b ,  v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+    ADD         v12.16b,  v12.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    cmhi        v24.16b,  v30.16b ,  v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+    NEG         v16.16b, v20.16b            //sign_up = vnegq_s8(sign_down)
+    TBL         v12.8b, {v6.16b},v12.8b     //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    SUB         v20.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    UZP1        v27.8b, v12.8b, v13.8b
+    UZP2        v13.8b, v12.8b, v13.8b
+    mov         v12.8b,v27.8b
+
+    ADD         v22.16b,  v0.16b ,  v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+    TBL         v12.8b, {v7.16b},v12.8b
+    NEG         v16.16b, v20.16b            //II sign_up = vnegq_s8(sign_down)
+
+    TBL         v13.8b, {v8.16b},v13.8b
+    ZIP1        v27.8b, v12.8b, v13.8b
+    ZIP2        v13.8b, v12.8b, v13.8b
+    mov         v12.8b,v27.8b
+
+    //VTBL.8        D12,D7,D12                    @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+    ADD         v22.16b,  v22.16b ,  v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+    Uxtl        v20.8h, v10.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    SADDW       v20.8h,  v20.8h ,  v12.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    TBL         v22.8b, {v6.16b},v22.8b     //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    UZP1        v27.8b, v22.8b, v23.8b
+    UZP2        v23.8b, v22.8b, v23.8b
+    mov         v22.8b,v27.8b
+
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    TBL         v24.8b, {v7.16b},v22.8b
+    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    TBL         v25.8b, {v8.16b},v23.8b
+    ZIP1        v27.8b, v24.8b, v25.8b
+    ZIP2        v25.8b, v24.8b, v25.8b
+    mov         v24.8b,v27.8b
+    //VTBL.8        D24,D7,D22                    @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    mov         v10.16b, v30.16b            //II pu1_cur_row = pu1_next_row
+    ST1         {v20.8b},[x10],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    xtn         v30.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SUBS        x11,x11,#2                  //Decrement the ht loop count by 2 (two rows per iteration)
+    ST1         {v30.8b},[x10],x1           //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BEQ         END_LOOPS
+    CMP         x11,#1
+    BGT         PU1_SRC_LOOP_RESIDUE        //If more than one row remains, jump to PU1_SRC_LOOP_RESIDUE
+
+
+    ADD         x10,x10,x1                  //*pu1_src + src_strd
+    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v19.8b},[x10]                    //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x10, x10,#8
+    cmhi        v12.16b,  v10.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+    cmhi        v14.16b,  v18.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+    SUB         v20.16b,  v14.16b ,  v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUB         x10,x10,x1
+
+    ADD         v22.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v22.16b,  v22.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v22.8b, {v6.16b},v22.8b     //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+    UZP1        v27.8b, v22.8b, v23.8b
+    UZP2        v23.8b, v22.8b, v23.8b
+    mov         v22.8b,v27.8b
+
+    TBL         v24.8b, {v7.16b},v22.8b
+    TBL         v25.8b, {v8.16b},v23.8b
+    ZIP1        v27.8b, v24.8b, v25.8b
+    ZIP2        v25.8b, v24.8b, v25.8b
+    mov         v24.8b,v27.8b
+
+    //VTBL.8        D24,D7,D22                    @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    Uxtl        v26.8h, v10.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SADDW       v26.8h,  v26.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    xtn         v30.8b,  v26.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    ST1         {v30.8b},[x10],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+END_LOOPS:
+    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
+    ldp         x25, x26,[sp],#16
+    ldp         x23, x24,[sp],#16
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+
+    ret
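+
+    //A minimal scalar sketch of the two-row residue loop above, assuming
+    //a vertical-neighbour SAO pass over one plane. The helper names
+    //(sign3, CLIP3, sao_edge_vert_col) are illustrative only, and the
+    //real code above additionally de-interleaves the two chroma planes
+    //and applies separate offset tables (v7/v8):
+    //
+    //  static inline int sign3(int v) { return (v > 0) - (v < 0); }
+    //  #define CLIP3(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
+    //
+    //  static void sao_edge_vert_col(UWORD8 *px, int strd, int rows, int su,
+    //                                const WORD8 *edge_tbl, const WORD8 *off_tbl)
+    //  {
+    //      while (rows >= 2)
+    //      {
+    //          int c0 = px[0], c1 = px[strd], c2 = px[2 * strd];
+    //          int sd0 = sign3(c0 - c1);              //sign_down for row 0
+    //          px[0]    = (UWORD8)CLIP3(c0 + off_tbl[edge_tbl[2 + su + sd0]], 0, 255);
+    //          int sd1 = sign3(c1 - c2);              //sign_down for row 1
+    //          px[strd] = (UWORD8)CLIP3(c1 + off_tbl[edge_tbl[2 - sd0 + sd1]], 0, 255);
+    //          su = -sd1;                             //carry sign_up to the next pair
+    //          px += 2 * strd;
+    //          rows -= 2;
+    //      }
+    //  }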
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class2.s b/common/arm64/ihevc_sao_edge_offset_class2.s
new file mode 100644
index 0000000..3350e5c
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class2.s
@@ -0,0 +1,846 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class2.s
+//*
+//* @brief
+//*  Contains function definitions for SAO edge offset class 2 (135-degree
+//*  diagonal). Functions are coded in ARMv8 NEON assembly.
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*  ihevc_sao_edge_offset_class2_av8()
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
+//                              WORD32 src_strd,
+//                              UWORD8 *pu1_src_left,
+//                              UWORD8 *pu1_src_top,
+//                              UWORD8 *pu1_src_top_left,
+//                              UWORD8 *pu1_src_top_right,
+//                              UWORD8 *pu1_src_bot_left,
+//                              UWORD8 *pu1_avail,
+//                              WORD8 *pi1_sao_offset,
+//                              WORD32 wd,
+//                              WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 =>  *pu1_src
+//x1 =>  src_strd
+//x2 =>  *pu1_src_left
+//x3 =>  *pu1_src_top
+//x4 =>  *pu1_src_top_left
+//x5 =>  *pu1_avail
+//x6 =>  *pi1_sao_offset
+//x7 =>  wd
+//x8 =>  ht
+
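+//A rough C reference for the per-pixel work the NEON loops below perform
+//(class 2 uses the 135-degree diagonal neighbours). sign3 and CLIP3 are
+//illustrative helpers, boundary/availability handling (pu1_avail,
+//au1_mask, the two corner pixels) is omitted, and a separate pu1_dst is
+//used for clarity; the real code works in place by carrying sign_up
+//between rows so every comparison sees pre-filter pixels:
+//
+//  static inline int sign3(int v) { return (v > 0) - (v < 0); }
+//  #define CLIP3(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
+//
+//  for (row = 0; row < ht; row++)
+//      for (col = 0; col < wd; col++)
+//      {
+//          int c  = pu1_src[row * src_strd + col];
+//          int su = sign3(c - pu1_src[(row - 1) * src_strd + (col - 1)]);
+//          int sd = sign3(c - pu1_src[(row + 1) * src_strd + (col + 1)]);
+//          int e  = gi1_table_edge_idx[2 + su + sd];
+//          pu1_dst[row * src_strd + col] =
+//              (UWORD8)CLIP3(c + pi1_sao_offset[e], 0, 255);
+//      }
+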
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class2_av8
+
+ihevc_sao_edge_offset_class2_av8:
+
+
+    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
+    MOV         x5,x7                       //Loads pu1_avail
+
+    LDR         x6,[sp]                     //Loads pi1_sao_offset
+    LDR         w7,[sp,#8]                  //Loads wd
+    LDR         w8,[sp,#16]                 //Loads ht
+
+    MOV         x16,x7 // wd
+    MOV         x17,x8 // ht
+
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+    stp         x23, x24,[sp,#-16]!
+
+    SUB         x9,x7,#1                    //wd - 1
+
+    LDRB        w10,[x3,x9]                 //pu1_src_top[wd - 1]
+
+    MOV         x19,x0                      //Store pu1_src in x19
+    MOV         x21,x2                      //Store pu1_src_left in x21
+    MOV         x22,x3                      //Store pu1_src_top in x22
+    MOV         x23,x5                      //Store pu1_avail in x23
+    MOV         x24,x4                      //Store pu1_src_top_left in x24
+
+
+    MOV         x9,x7                       //Move width to x9 for loop count
+
+    SUB         sp,sp,#0xA0                 //Decrement the stack pointer to store some temp arr values
+
+    STRB        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 1]
+    SUB         x10,x8,#1                   //ht-1
+    madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
+    ADD         x12,sp,#0x02                //temp array
+
+AU1_SRC_TOP_LOOP:
+    LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
+    SUBS        x9,x9,#8                    //Decrement the loop count by 8
+    ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+    BNE         AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_4_LOOP:
+    LDRB        w10,[x5,#4]                 //pu1_avail[4]
+    CMP         x10,#0
+    LDRB        w9,[x0]                     //u1_pos_0_0_tmp = pu1_src[0]
+    BEQ         PU1_AVAIL_7_LOOP
+
+    LDRB        w11,[x4]                    //pu1_src_top_left[0]
+    ADD         x14,x0,x1                   //pu1_src + src_strd
+
+    SUBS        x12,x9,x11                  //pu1_src[0] - pu1_src_top_left[0]
+    LDRB        w4,[x14,#1]                 //pu1_src[1 + src_strd]
+
+    movn        x20,#0
+    csel        x12, x20, x12,LT
+    MOV         x20,#1
+    csel        x12, x20, x12,GT            //SIGN(pu1_src[0] - pu1_src_top_left[0])
+
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
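+    //(the ADRP/:got: + LDR/#:got_lo12: pair loads the address of
+    //gi1_table_edge_idx from the GOT, keeping the code position
+    //independent; the same pattern recurs for every table load below)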
+    SUBS        x11,x9,x4                   //pu1_src[0] - pu1_src[1 + src_strd]
+
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[0] - pu1_src[1 + src_strd])
+    ADD         x4,x12,x11                  //SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[1 + src_strd])
+    ADD         x4,x4,#2                    //edge_idx
+
+    LDRSB       x12,[x14,x4]                //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x12,#0                      //0 != edge_idx
+    BEQ         PU1_AVAIL_7_LOOP
+    LDRSB       x10,[x6,x12]                //pi1_sao_offset[edge_idx]
+    ADD         x9,x9,x10                   //pu1_src[0] + pi1_sao_offset[edge_idx]
+    mov         x20,#255
+    cmp         x9,x20
+    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP:
+    LDRB        w14,[x5,#7]                 //pu1_avail[7]
+    CMP         x14,#0
+    SUB         x10,x7,#1                   //wd - 1
+    SUB         x11,x8,#1                   //ht - 1
+    madd        x12, x11, x1, x10           //wd - 1 + (ht - 1) * src_strd
+    ADD         x12,x12,x0                  //pu1_src[wd - 1 + (ht - 1) * src_strd]
+    LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
+    BEQ         PU1_AVAIL
+
+    SUB         x4,x12,x1                   //pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
+    SUB         x4,x4,#1
+    LDRB        w11,[x4]                    //Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
+    ADD         x4,x4,#1
+    ADD         x14,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
+
+    SUBS        x11,x10,x11                 //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd]
+    LDRB        w4,[x14,#1]                 //Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
+
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd])
+
+    SUBS        x4,x10,x4                   //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
+    movn        x20,#0
+    csel        x4, x20, x4,LT
+    MOV         x20,#1
+    csel        x4, x20, x4,GT              //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
+
+    ADD         x11,x11,x4                  //Add 2 sign value
+    ADD         x11,x11,#2                  //edge_idx
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x12,#0
+    BEQ         PU1_AVAIL
+    LDRSB       x11,[x6,x12]                //pi1_sao_offset[edge_idx]
+    ADD         x10,x10,x11                 //pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+    mov         x20,#255
+    cmp         x10,x20
+    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL:
+    MOV         x12,x8                      //Move ht
+    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
+    LDRB        w11,[x5,#3]                 //pu1_avail[3]
+
+    MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
+    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
+    CMP         x11,#0
+
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    SUB         x20,x12,#1                  //ht_tmp--
+    csel        x12, x20, x12,EQ
+
+    CMP         x5,#0
+    LD1         {v7.8b},[x6]                //offset_tbl = vld1_s8(pi1_sao_offset)
+    ADRP        x11, :got:gi1_table_edge_idx //table pointer
+    LDR         x11, [x11, #:got_lo12:gi1_table_edge_idx]
+
+
+    ADD         x20,x0,x1                   //pu1_src += src_strd
+    csel        x0, x20, x0,EQ
+    LD1         {v6.8b},[x11]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    SUB         x20,x12,#1                  //ht_tmp--
+    csel        x12, x20, x12,EQ
+
+    MOV         x6,x7                       //move wd to x6 loop_count
+    movi        v8.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
+    ADD         x20,x14,#1                  //pu1_src_left_cpy += 1
+    csel        x14, x20, x14,EQ
+
+    MOV         x15,x0
+    CMP         x7,#16                      //Compare wd with 16
+
+    BLT         WIDTH_RESIDUE               //If wd < 16, jump to WIDTH_RESIDUE where the loop handles the 8-pixel case
+    CMP         x8,#4                       //Compare ht with 4
+    BLE         WD_16_HT_4_LOOP             //If ht <= 4, jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+    MOV         x7,x16                      //Loads wd
+
+    MOV         x5,x23                      //Loads pu1_avail
+    CMP         x6,x7                       //col == wd
+    LDRb        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE              //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+    CMP         x6,#16                      //if(col == 16)
+    BNE         SKIP_AU1_MASK_VAL
+    LDRB        w8,[x5,#1]                  //pu1_avail[1]
+    mov         v8.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+    LDRB        w11,[x5,#2]                 //pu1_avail[2]
+    CMP         x11,#0
+
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x8, x20, x8,EQ
+    csel        x8, x3, x8,NE               //pu1_src_top_cpy
+    SUB         x8,x8,#1                    //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
+
+    MOV         x7,x16                      //Loads wd
+    LD1         {v10.16b},[x8]              //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+    ADD         x3,x3,#16
+
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+    MOV         x4,x17                      //Loads ht
+
+    SUB         x7,x7,x6                    //(wd - col)
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    MOV         x8,x19                      //Loads *pu1_src
+
+    ADD         x7,x7,#15                   //15 + (wd - col)
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
+
+    SUB         x5,x5,#1
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+AU1_SRC_LEFT_LOOP:
+    LDRB        w8,[x7]                     //load the value and increment by src_strd
+    ADD         x7,x7,x1
+    STRB        w8,[x5,#1]!                 //store it in the stack pointer
+    SUBS        x4,x4,#1                    //decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP
+
+    ADD         x8,x0,x1                    //I Iteration *pu1_src + src_strd
+    movi        v18.16b, #0
+    MOV         x4,x23                      //I Loads pu1_avail
+
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+    LD1         {v16.16b},[x8]              //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    LDRB        w4,[x4,#2]                  //I pu1_avail[2]
+
+    LDRB        w5,[x8,#16]                 //I pu1_src_cpy[src_strd + 16]
+    mov         v18.8b[0], w5               //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+
+    EXT         v18.16b,  v16.16b ,  v18.16b,#1 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+    CMP         x4,#0                       //I
+    BNE         SIGN_UP_CHANGE_DONE         //I
+
+SIGN_UP_CHANGE:
+    SUB         x2,x12,x7                   //I ht_tmp - row
+    LDRB        w11,[x0]                    //I pu1_src_cpy[0]
+    ADD         x2,x14,x2                   //I pu1_src_left_cpy[ht_tmp - row]
+    SUB         x2,x2,#1
+    LDRB        w5,[x2]                     //I load the value
+    ADD         x2,x2,#1
+    SUBS        x4,x11,x5                   //I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+    movn        x20,#0
+    csel        x4, x20, x4,LT              //I
+    MOV         x20,#1
+    csel        x4, x20, x4,GT              //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    mov         v14.8b[0], w4               //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE:
+    cmhi        v10.16b,  v12.16b ,  v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    ADD         v24.16b,  v0.16b ,  v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+
+    cmhi        v18.16b,  v18.16b ,  v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v10.16b,  v18.16b ,  v10.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v24.16b,  v24.16b ,  v10.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v18.16b, {v6.16b},v24.16b   //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+//  TBL v19.8b, {v6.16b},v25.8b                //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    AND         v18.16b,  v18.16b ,  v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    NEG         v14.16b, v10.16b            //I sign_up = vnegq_s8(sign_down)
+    TBL         v10.16b, {v7.16b},v18.16b   //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    EXT         v14.16b,  v14.16b ,  v14.16b,#15 //I sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    Uxtl        v20.8h, v12.8b              //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+//  TBL v11.8b, {v7.16b},v19.8b                    //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    SADDW       v20.8h,  v20.8h ,  v10.8b   //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    Uxtl2       v22.8h, v12.16b             //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    mov         v12.16b, v16.16b            //I pu1_cur_row = pu1_next_row
+
+    SADDW2      v22.8h,  v22.8h ,  v10.16b  //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SMAX        v22.8h,  v22.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1
+
+    UMIN        v22.8h,  v22.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    xtn2        v20.16b,  v22.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])
+
+PU1_SRC_LOOP:
+
+    ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    ADD         x8,x0,x1                    //II iteration *pu1_src + src_strd
+
+    LD1         {v16.16b},[x8]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    ADD         x11,x8,x1                   //III iteration *pu1_src + src_strd
+
+    LDRB        w5,[x8,#16]                 //II pu1_src_cpy[src_strd + 16]
+    LD1         {v30.16b},[x11]             //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    LDRB        w4,[x0]                     //II pu1_src_cpy[0]
+
+    LDRB        w8,[x11,#16]                //III pu1_src_cpy[src_strd + 16]
+    mov         v28.8b[0], w5               //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+
+    SUB         x5,x12,x7                   //II ht_tmp - row
+    EXT         v22.16b,  v16.16b ,  v28.16b,#1 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+    ADD         x5,x14,x5                   //II pu1_src_left_cpy[ht_tmp - row]
+
+    SUB         x5,x5,#1
+    LDRB        w5,[x5]                     //II load the value
+    mov         v18.8b[0], w8               //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1
+
+    SUBS        x4,x4,x5                    //II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+    EXT         v18.16b,  v30.16b ,  v18.16b,#1 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+    LDRB        w2,[x0,x1]                  //III pu1_src_cpy[0]
+
+    cmhi        v24.16b,  v12.16b ,  v22.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         x5,x12,x7                   //III ht_tmp - row
+
+    movn        x20,#0
+    csel        x4, x20, x4,LT              //II
+    cmhi        v22.16b,  v22.16b ,  v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    ADD         x5,x14,x5                   //III pu1_src_left_cpy[ht_tmp - row]
+
+    MOV         x20,#1
+    csel        x4, x20, x4,GT              //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    SUB         v24.16b,  v22.16b ,  v24.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    SUB         x5,x5,#1
+    LDRB        w5,[x5]                     //III load the value
+
+    SUBS        x2,x2,x5                    //III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+    mov         v14.8b[0], w4               //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+    movn        x20,#0
+    csel        x2, x20, x2,LT              //III
+    cmhi        v10.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    MOV         x20,#1
+    csel        x2, x20, x2,GT              //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+
+    ADD         v22.16b,  v0.16b ,  v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v22.16b,  v22.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    cmhi        v18.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    TBL         v22.16b, {v6.16b},v22.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    NEG         v14.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
+
+    SUB         v10.16b,  v18.16b ,  v10.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+//  TBL v23.8b, {v6.16b},v23.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    EXT         v14.16b,  v14.16b ,  v14.16b,#15 //II sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    AND         v22.16b,  v22.16b ,  v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v14.8b[0], w2               //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+    ADD         v18.16b,  v0.16b ,  v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+    TBL         v24.16b, {v7.16b},v22.16b   //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    ADD         v18.16b,  v18.16b ,  v10.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    Uxtl        v26.8h, v12.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    TBL         v18.16b, {v6.16b},v18.16b   //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    NEG         v14.16b, v10.16b            //III sign_up = vnegq_s8(sign_down)
+
+    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+//  TBL v19.8b, {v6.16b},v19.8b                //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    EXT         v14.16b,  v14.16b ,  v14.16b,#15 //III sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    AND         v18.16b,  v18.16b ,  v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+    Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    TBL         v10.16b, {v7.16b},v18.16b   //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    SADDW       v20.8h,  v20.8h ,  v10.8b   //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+//  TBL v25.8b, {v7.16b},v23.8b                    //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    Uxtl2       v28.8h, v12.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    SADDW2      v28.8h,  v28.8h ,  v24.16b  //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+//  TBL v11.8b, {v7.16b},v19.8b                    //III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    Uxtl2       v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    mov         v12.16b, v30.16b            //III pu1_cur_row = pu1_next_row
+    xtn         v26.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    xtn2        v26.16b,  v28.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
+    SADDW2      v18.8h,  v18.8h ,  v10.16b  //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    SMAX        v18.8h,  v18.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
+    UMIN        v18.8h,  v18.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    CMP         x7,#1                       //III
+
+    ST1         { v26.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    BGT         PU1_SRC_LOOP                //III If more than one row remains, jump to PU1_SRC_LOOP
+    BLT         INNER_LOOP_DONE
+
+    ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    ADD         x8,x0,x1                    //*pu1_src + src_strd
+
+    LDRB        w2,[x0]                     //pu1_src_cpy[0]
+    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    LDRB        w5,[x8,#16]                 //pu1_src_cpy[src_strd + 16]
+
+    SUB         x11,x12,x7                  //ht_tmp - row
+    mov         v18.8b[0], w5               //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    ADD         x11,x14,x11                 //pu1_src_left_cpy[ht_tmp - row]
+
+    SUB         x11,x11,#1
+    LDRB        w5,[x11]                    //load the value
+    ADD         x11,x11,#1
+    EXT         v18.16b,  v16.16b ,  v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+    SUBS        x4,x2,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+
+    cmhi        v10.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    movn        x20,#0
+    csel        x4, x20, x4,LT
+
+    MOV         x20,#1
+    csel        x4, x20, x4,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    cmhi        v18.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    mov         v14.8b[0], w4               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+    SUB         v10.16b,  v18.16b ,  v10.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v18.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v18.16b,  v18.16b ,  v10.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    TBL         v18.16b, {v6.16b},v18.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    NEG         v14.16b, v10.16b            //sign_up = vnegq_s8(sign_down)
+
+//  TBL v19.8b, {v6.16b},v19.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    EXT         v14.16b,  v14.16b ,  v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    AND         v18.16b,  v18.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    TBL         v10.16b, {v7.16b},v18.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+    Uxtl        v20.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+//  TBL v11.8b, {v7.16b},v19.8b                    //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    SADDW       v20.8h,  v20.8h ,  v10.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    Uxtl2       v12.8h, v12.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    SADDW2      v12.8h,  v12.8h ,  v10.16b  //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    SMAX        v12.8h,  v12.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    UMIN        v12.8h,  v12.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    xtn2        v20.16b,  v12.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+
+INNER_LOOP_DONE:
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+    ST1         { v20.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    MOV         x2,x21                      //Loads *pu1_src_left
+
+    MOV         x8,x17                      //Loads ht
+    SUB         x5,x5,#1
+
+    SUB         x2,x2,#1
+SRC_LEFT_LOOP:
+    LDRB        w7,[x5,#1]!                 //au1_src_left_tmp[row]
+    SUBS        x8,x8,#1
+    STRB        w7,[x2,#1]!                 //pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP
+
+    SUB         x6,x6,#16                   //Decrement the wd loop count by 16
+    CMP         x6,#8                       //Check whether residue remains
+    BLT         RE_ASSINING_LOOP            //Jump to re-assigning loop
+    MOV         x7,x16                      //Loads wd
+    MOV         x0,x15                      //Loads *pu1_src
+    SUB         x7,x7,x6
+    ADD         x0,x0,x7
+    BGT         WIDTH_LOOP_16               //If more than 8 columns remain, jump to WIDTH_LOOP_16
+    BEQ         WIDTH_RESIDUE               //If exactly 8 columns remain, jump to WIDTH_RESIDUE
+
+
+WD_16_HT_4_LOOP:
+    MOV         x7,x16                      //Loads wd
+    MOV         x5,x23                      //Loads pu1_avail
+    CMP         x6,x7                       //col == wd
+    LDRb        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE              //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+    CMP         x6,#16                      //if(col == 16)
+    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
+    LDRB        w8,[x5,#1]                  //pu1_avail[1]
+    mov         v8.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+    LDRB        w8,[x5,#2]                  //pu1_avail[2]
+    CMP         x8,#0
+
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x8, x20, x8,EQ
+    csel        x8, x3, x8,NE
+    SUB         x8,x8,#1                    //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
+
+    MOV         x7,x16                      //Loads wd
+    LD1         {v10.16b},[x8]              //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+    ADD         x3,x3,#16
+
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+    MOV         x4,x17                      //Loads ht
+
+    SUB         x7,x7,x6                    //(wd - col)
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    MOV         x8,x19                      //Loads *pu1_src
+
+    ADD         x7,x7,#15                   //15 + (wd - col)
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
+
+    SUB         x5,x5,#1
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+    LDRB        w8,[x7]                     //load the value and increment by src_strd
+    ADD         x7,x7,x1
+    SUBS        x4,x4,#1                    //decrement the loop count
+    STRB        w8,[x5,#1]!                 //store it in the stack pointer
+    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+    movi        v18.16b, #0
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+    ADD         x8,x0,x1                    //*pu1_src + src_strd
+    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+
+    LDRB        w5,[x8,#16]                 //pu1_src_cpy[src_strd + 16]
+    mov         v18.8b[0], w5               //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    EXT         v18.16b,  v16.16b ,  v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+
+    CMP         x7,x12
+    BLT         SIGN_UP_CHANGE_WD_16_HT_4
+    MOV         x5,x23                      //Loads pu1_avail
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+    CMP         x5,#0
+    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+    LDRB        w8,[x0]                     //pu1_src_cpy[0]
+    SUB         x5,x12,x7                   //ht_tmp - row
+    ADD         x5,x14,x5                   //pu1_src_left_cpy[ht_tmp - row]
+    SUB         x5,x5,#1
+    LDRB        w5,[x5]                     //load the value
+    SUBS        x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    mov         v14.8b[0], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+    cmhi        v20.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    cmhi        v22.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v24.16b,  v22.16b ,  v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v26.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v26.16b, {v6.16b},v26.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+//  TBL v27.8b, {v6.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    AND         v26.16b,  v26.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    NEG         v14.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
+    EXT         v14.16b,  v14.16b ,  v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    TBL         v24.16b, {v7.16b},v26.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    Uxtl        v28.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+//  TBL v25.8b, {v7.16b},v27.8b                    //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    Uxtl2       v30.8h, v12.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    SADDW2      v30.8h,  v30.8h ,  v24.16b  //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    SMAX        v30.8h,  v30.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v30.8h,  v30.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    xtn2        v28.16b,  v30.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    mov         v12.16b, v16.16b            //pu1_cur_row = pu1_next_row
+    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
+    BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+    MOV         x8,x17                      //Loads ht
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+    MOV         x2,x21                      //Loads *pu1_src_left
+    SUB         x5,x5,#1
+    SUB         x2,x2,#1
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+    LDRB        w7,[x5,#1]!                 //au1_src_left_tmp[row]
+    STRB        w7,[x2,#1]!                 //pu1_src_left[row] = au1_src_left_tmp[row]
+    SUBS        x8,x8,#1
+    BNE         SRC_LEFT_LOOP_WD_16_HT_4
+
+    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
+    BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
+
+
+WIDTH_RESIDUE:
+    MOV         x7,x16                      //Loads wd
+    MOV         x5,x23                      //Loads pu1_avail
+    CMP         x6,x7                       //wd_residue == wd
+    LDRb        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    LDRB        w8,[x5,#1]                  //pu1_avail[1]
+    mov         v8.8b[7], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+PU1_AVAIL_2_RESIDUE:
+    LDRB        w11,[x5,#2]                 //pu1_avail[2]
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+    CMP         x11,#0
+
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x8, x20, x8,EQ
+    csel        x8, x3, x8,NE
+
+    SUB         x8,x8,#1
+
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+    LD1         {v10.16b},[x8],#16          //pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
+    MOV         x7,x16                      //Loads wd
+
+    MOV         x4,x17                      //Loads ht
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    SUB         x7,x7,#1                    //(wd - 1)
+
+    MOV         x8,x19                      //Loads *pu1_src
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    SUB         x5,x5,#1
+
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 1)]
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+    LDRB        w8,[x7]                     //load the value and increment by src_strd
+    ADD         x7,x7,x1
+    SUBS        x4,x4,#1                    //decrement the loop count
+    STRB        w8,[x5,#1]!                 //store it in the stack pointer
+    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
+
+
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_RESIDUE:
+    movi        v18.16b, #0
+    ADD         x8,x0,x1                    //*pu1_src + src_strd
+    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+
+    LDRB        w8,[x8,#16]                 //pu1_src_cpy[src_strd + 16]
+    mov         v18.8b[0], w8               //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    EXT         v18.16b,  v16.16b ,  v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+
+    CMP         x7,x12
+    BLT         SIGN_UP_CHANGE_RESIDUE
+    MOV         x5,x23                      //Loads pu1_avail
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+    CMP         x5,#0
+    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+    LDRB        w8,[x0]                     //pu1_src_cpy[0]
+    SUB         x5,x12,x7                   //ht_tmp - row
+
+    ADD         x5,x14,x5
+    SUB         x5,x5,#1
+    LDRB        w5,[x5]                     //load the value
+    SUBS        x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    mov         v14.8b[0], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+    cmhi        v20.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    cmhi        v22.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v24.16b,  v22.16b ,  v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v26.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v26.16b, {v6.16b},v26.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+//  TBL v27.8b, {v6.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    AND         v26.16b,  v26.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    NEG         v14.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
+    EXT         v14.16b,  v14.16b ,  v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
+
+    TBL         v24.8b, {v7.16b},v26.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    Uxtl        v28.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    xtn         v30.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    ST1         {v30.8b},[x0],x1            //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    mov         v12.16b, v16.16b            //pu1_cur_row = pu1_next_row
+    SUBS        x7,x7,#1
+    BNE         PU1_SRC_LOOP_RESIDUE
+
+    MOV         x8,x17                      //Loads ht
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+
+    MOV         x2,x21                      //Loads *pu1_src_left
+    SUB         x5,x5,#1
+
+    SUB         x2,x2,#1
+
+SRC_LEFT_LOOP_RESIDUE:
+    LDRB        w7,[x5,#1]!                 //au1_src_left_tmp[row]
+    SUBS        x8,x8,#1
+    STRB        w7,[x2,#1]!                 //pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+    MOV         x8,x17                      //Loads ht
+    MOV         x7,x16                      //Loads wd
+
+    MOV         x0,x19                      //Loads *pu1_src
+    SUB         x8,x8,#1                    //ht - 1
+
+    madd        x6, x8, x1, x7              //wd + (ht - 1) * src_strd (decremented to wd - 1 below)
+    STRB        w9,[x0]                     //pu1_src_org[0] = u1_pos_0_0_tmp
+
+    MOV         x4,x24                      //Loads pu1_src_top_left
+    ADD         x6,x0,x6                    //pu1_src[wd - 1 + (ht - 1) * src_strd]
+
+    ADD         x12,sp,#0x02
+    SUB         x6,x6,#1
+    STRB        w10,[x6]                    //pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
+    ADD         x6,x6,#1
+
+    LDRB        w11,[sp]                    //load u1_src_top_left_tmp from stack pointer
+    MOV         x3,x22                      //Loads pu1_src_top
+
+    STRB        w11,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp
+
+SRC_TOP_LOOP:
+    LD1         {v0.8b},[x12],#8            //load au1_src_top_tmp[col]
+    SUBS        x7,x7,#8                    //Decrement the width
+    ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
+    BNE         SRC_TOP_LOOP
+
+END_LOOPS:
+    ADD         sp,sp,#0xA0
+    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
+    ldp         x23, x24,[sp],#16
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
new file mode 100644
index 0000000..2fa7c22
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
@@ -0,0 +1,1120 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class2_chroma.s
+//*
+//* @brief
+//*  Contains function definitions for SAO edge offset class 2 (135-degree
+//*  diagonal) for interleaved chroma. Functions are coded in ARMv8 NEON
+//*  assembly.
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*  ihevc_sao_edge_offset_class2_chroma_av8()
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
+//                              WORD32 src_strd,
+//                              UWORD8 *pu1_src_left,
+//                              UWORD8 *pu1_src_top,
+//                              UWORD8 *pu1_src_top_left,
+//                              UWORD8 *pu1_src_top_right,
+//                              UWORD8 *pu1_src_bot_left,
+//                              UWORD8 *pu1_avail,
+//                              WORD8 *pi1_sao_offset_u,
+//                              WORD8 *pi1_sao_offset_v,
+//                              WORD32 wd,
+//                              WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 =>  *pu1_src
+//x1 =>  src_strd
+//x2 =>  *pu1_src_left
+//x3 =>  *pu1_src_top
+//x4 =>  *pu1_src_top_left
+//x5 =>  *pu1_avail
+//x6 =>  *pi1_sao_offset_u
+//x9 =>  *pi1_sao_offset_v
+//x7 =>  wd
+//x8 =>  ht
+
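+//As in the luma version, a rough C reference for the per-pixel work,
+//here on interleaved Cb/Cr samples: the diagonal neighbours of a chroma
+//sample sit 2 bytes away, and even/odd bytes use the U/V offset tables
+//respectively. sign3, CLIP3 and pu1_dst are the same illustrative
+//helpers as before; boundary handling is omitted:
+//
+//  for (row = 0; row < ht; row++)
+//      for (col = 0; col < wd; col += 2)       //Cb at col, Cr at col + 1
+//          for (p = 0; p < 2; p++)
+//          {
+//              const WORD8 *off = p ? pi1_sao_offset_v : pi1_sao_offset_u;
+//              int c  = pu1_src[row * src_strd + col + p];
+//              int su = sign3(c - pu1_src[(row - 1) * src_strd + col - 2 + p]);
+//              int sd = sign3(c - pu1_src[(row + 1) * src_strd + col + 2 + p]);
+//              int e  = gi1_table_edge_idx[2 + su + sd];
+//              pu1_dst[row * src_strd + col + p] = (UWORD8)CLIP3(c + off[e], 0, 255);
+//          }
+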
+.text
+.p2align 2
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class2_chroma_av8
+
+ihevc_sao_edge_offset_class2_chroma_av8:
+
+
+    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
+
+    ldr         x8,[sp,#0]
+    ldr         x9,[sp,#8]
+    ldr         w10,[sp,#16]
+    ldr         w11,[sp,#24]
+    push_v_regs
+
+
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+    stp         x23, x24,[sp,#-16]!
+    stp         x25, x26,[sp,#-16]!
+    stp         x27, x28,[sp,#-16]!
+
+    mov         x15,x4 // *pu1_src_top_left 0x28
+    //mov x16,x5    // *pu1_src_top_right 0x2c
+    mov         x17,x6 // *pu1_src_bot_left 0x30
+    mov         x21,x7 // *pu1_avail 0x34
+    mov         x22,x8 // *pi1_sao_offset_u 0x38
+    mov         x23,x9 // *pi1_sao_offset_v 0x3c
+    mov         x24,x10 // wd 0x40
+    mov         x25,x11 // ht 0x44
+
+
+    mov         w7, w24                     //Loads wd
+    mov         w8, w25                     //Loads ht
+    SUB         x9,x7,#2                    //wd - 2
+
+    mov         x4, x15                     //Loads pu1_src_top_left
+    LDRH        w10,[x3,x9]                 //pu1_src_top[wd - 2]
+
+    mov         x26, x0                     //Store pu1_src in x26
+    MOV         x9,x7                       //Move width to x9 for loop count
+
+    mov         x17, x2                     //Store pu1_src_left in x17
+    mov         x5, x21                     //Loads pu1_avail
+    mov         x6, x22                     //Loads pi1_sao_offset_u
+
+    mov         x22, x3                     //Store pu1_src_top in x22
+    SUB         sp,sp,#0xE0                 //Decrement the stack pointer to store some temp arr values
+
+    STRH        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 2]
+    SUB         x10,x8,#1                   //ht-1
+    madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
+    ADD         x12,sp,#10                  //temp array
+
+AU1_SRC_TOP_LOOP:
+    LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
+    SUBS        x9,x9,#8                    //Decrement the loop count by 8
+    ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+    BNE         AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_4_LOOP_U:
+    LDRB        w9,[x5,#4]                  //pu1_avail[4]
+    CMP         x9,#0
+    LDRB        w9,[x0]                     //u1_pos_0_0_tmp_u = pu1_src[0]
+    LDRB        w10,[x0,#1]                 //u1_pos_0_0_tmp_v = pu1_src[1]
+    BEQ         PU1_AVAIL_7_LOOP_U
+
+    LDRB        w11,[x4]                    //pu1_src_top_left[0]
+    ADD         x14,x0,x1                   //pu1_src + src_strd
+
+    SUB         x12,x9,x11                  //pu1_src[0] - pu1_src_top_left[0]
+
+    LDRB        w14,[x14,#2]                //pu1_src[2 + src_strd]
+    CMP         x12,#0
+
+    movn        x20,#0
+    csel        x12, x20, x12,LT
+    SUB         x11,x9,x14                  //pu1_src[0] - pu1_src[2 + src_strd]
+
+    MOV         x20,#1
+    csel        x12, x20, x12,GT            //SIGN(pu1_src[0] - pu1_src_top_left[0])
+
+    CMP         x11,#0
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[0] - pu1_src[2 + src_strd])
+
+    ADD         x11,x12,x11                 //SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[2 + src_strd])
+    ADD         x11,x11,#2                  //edge_idx
+
+    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x12,#0                      //0 != edge_idx
+    BEQ         PU1_AVAIL_4_LOOP_V
+    LDRSB       x11,[x6,x12]                //pi1_sao_offset_u[edge_idx]
+    ADD         x9,x9,x11                   //pu1_src[0] + pi1_sao_offset_u[edge_idx]
+    mov         x20,#255
+    cmp         x9,x20
+    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_4_LOOP_V:
+
+    LDRB        w11,[x4,#1]                 //pu1_src_top_left[1]
+    ADD         x14,x0,x1                   //pu1_src + src_strd
+
+    SUB         x12,x10,x11                 //pu1_src[1] - pu1_src_top_left[1]
+    LDRB        w14,[x14,#3]                //pu1_src[3 + src_strd]
+
+    CMP         x12,#0
+    movn        x20,#0
+    csel        x12, x20, x12,LT
+    SUB         x11,x10,x14                 //pu1_src[1] - pu1_src[3 + src_strd]
+    MOV         x20,#1
+    csel        x12, x20, x12,GT            //SIGN(pu1_src[1] - pu1_src_top_left[1])
+
+    CMP         x11,#0
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[1] - pu1_src[3 + src_strd])
+
+    ADD         x11,x12,x11                 //SIGN(pu1_src[1] - pu1_src_top_left[1]) +  SIGN(pu1_src[1] - pu1_src[3 + src_strd])
+    ADD         x11,x11,#2                  //edge_idx
+
+    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x12,#0                      //0 != edge_idx
+    BEQ         PU1_AVAIL_7_LOOP_U
+    mov         x11, x23                    //Loads pi1_sao_offset_v
+    LDRSB       x11,[x11,x12]               //pi1_sao_offset_v[edge_idx]
+    ADD         x10,x10,x11                 //pu1_src[1] + pi1_sao_offset_v[edge_idx]
+    mov         x20,#255
+    cmp         x10,x20
+    csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP_U:
+    STRB        w10,[sp,#7]
+    STRB        w9,[sp,#6]
+
+    LDRB        w10,[x5,#7]                 //pu1_avail[7]
+    CMP         x10,#0
+    SUB         x10,x7,#2                   //wd - 2
+    SUB         x11,x8,#1                   //ht - 1
+    madd        x12, x11, x1, x10           //wd - 2 + (ht - 1) * src_strd
+    ADD         x12,x12,x0                  //pu1_src[wd - 2 + (ht - 1) * src_strd]
+    LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
+    LDRB        w9,[x12,#1]                 //u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd]
+    BEQ         PU1_AVAIL_3_LOOP
+
+    SUB         x11,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
+    SUB         x11,x11,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
+    LDRB        w11,[x11]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
+    SUB         x11,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
+    CMP         x11,#0
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])
+
+    ADD         x14,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
+    ADD         x14,x14,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+    LDRB        w14,[x14]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+    SUB         x14,x10,x14                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+    CMP         x14,#0
+    movn        x20,#0
+    csel        x14, x20, x14,LT
+    MOV         x20,#1
+    csel        x14, x20, x14,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])
+
+    ADD         x11,x11,x14                 //Add 2 sign value
+    ADD         x11,x11,#2                  //edge_idx
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+    LDRSB       x14,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x14,#0
+    BEQ         PU1_AVAIL_7_LOOP_V
+    LDRSB       x11,[x6,x14]                //pi1_sao_offset_u[edge_idx]
+    ADD         x10,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx]
+    mov         x20,#255
+    cmp         x10,x20
+    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP_V:
+    ADD         x12,x12,#1
+    SUB         x11,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
+    SUB         x11,x11,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
+    LDRB        w11,[x11]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
+    SUB         x11,x9,x11                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd]
+    CMP         x11,#0
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])
+
+    ADD         x14,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
+    ADD         x14,x14,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+    LDRB        w14,[x14]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+    SUB         x14,x9,x14                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+    CMP         x14,#0
+    movn        x20,#0
+    csel        x14, x20, x14,LT
+    MOV         x20,#1
+    csel        x14, x20, x14,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd])
+
+    ADD         x11,x11,x14                 //Add 2 sign value
+    ADD         x11,x11,#2                  //edge_idx
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x12,#0
+    BEQ         PU1_AVAIL_3_LOOP
+    mov         x14, x23                    //Loads pi1_sao_offset_v
+    LDRSB       x11,[x14,x12]               //pi1_sao_offset_v[edge_idx]
+    ADD         x9,x9,x11                   //pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx]
+    mov         x20,#255
+    cmp         x9,x20
+    csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_3_LOOP:
+    STRB        w10,[sp,#8]
+    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
+    STRB        w9,[sp,#9]
+
+    MOV         x12,x8                      //Move ht
+    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
+    MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
+
+    LDRB        w11,[x5,#3]                 //pu1_avail[3]
+    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    CMP         x11,#0
+
+    SUB         x20,x12,#1                  //ht_tmp--
+    csel        x12, x20, x12,EQ
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+
+    CMP         x5,#0
+
+    ADD         x20,x0,x1                   //pu1_src += src_strd
+    csel        x0, x20, x0,EQ
+    LD1         {v6.8b},[x6]                //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+    SUB         x20,x12,#1                  //ht_tmp--
+    csel        x12, x20, x12,EQ
+
+    mov         x6, x23                     //Loads pi1_sao_offset_v
+    ADD         x20,x14,#2                  //pu1_src_left_cpy += 2
+    csel        x14, x20, x14,EQ
+
+    mov         x27, x0                     //Store pu1_src in sp
+    LD1         {v7.8b},[x6]                //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+    ADRP        x2, :got:gi1_table_edge_idx //table pointer
+    LDR         x2, [x2, #:got_lo12:gi1_table_edge_idx]
+
+    MOV         x6,x7                       //move wd to x6 loop_count
+    movi        v8.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
+    CMP         x7,#16                      //Compare wd with 16
+
+    BLT         WIDTH_RESIDUE               //If wd < 16, jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+    CMP         x8,#4                       //Compare ht with 4
+    BLE         WD_16_HT_4_LOOP             //If ht <= 4, jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+    mov         x5, x21                     //Loads pu1_avail
+    mov         w7, w24                     //Loads wd
+    CMP         x6,x7                       //col == wd
+    LDRB        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    CMP         x6,#16                      //if(col == 16)
+    mov         v8.8b[1], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+    BNE         SKIP_AU1_MASK_VAL
+    LDRB        w8,[x5,#1]                  //pu1_avail[1]
+    mov         v8.16b[14], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+    mov         v8.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
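+
+    //Mask setup, as a sketch: au1_mask starts as all 0xFF. On the first strip
+    //(col == wd) bytes 0-1, the left U/V pair, take pu1_avail[0]; on the last
+    //strip (col == 16) bytes 14-15, the right U/V pair, take pu1_avail[1].
+    //edge_idx is ANDed with this mask later, so pixels whose neighbouring CTB
+    //is unavailable receive no offset:
+    //  au1_mask[0]  = au1_mask[1]  = (col == wd) ? pu1_avail[0] : 0xFF;
+    //  au1_mask[14] = au1_mask[15] = (col == 16) ? pu1_avail[1] : 0xFF;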
+
+SKIP_AU1_MASK_VAL:
+    LDRB        w9,[x5,#2]                  //pu1_avail[2]
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
+    //SUB x0, x0,#8
+    CMP         x9,#0
+
+    mov         w4, w25                     //Loads ht
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x8, x20, x8,EQ
+
+    mov         w7, w24                     //Loads wd
+    csel        x8, x3, x8,NE               //pu1_src_top_cpy
+
+    SUB         x8,x8,#2                    //pu1_src - src_strd - 2
+    ADD         x3,x3,#16
+
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+    LD1         {v10.16b},[x8]              //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+    //SUB x8, x8,#8
+    SUB         x7,x7,x6                    //(wd - col)
+
+    ADD         x7,x7,#14                   //14 + (wd - col)
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    mov         x8, x26                     //Loads *pu1_src
+
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 14 + (wd - col)]
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+
+AU1_SRC_LEFT_LOOP:
+    LDRH        w8,[x7]                     //load the value and increment by src_strd
+    SUBS        x4,x4,#1                    //decrement the loop count
+
+    STRH        w8,[x5],#2                  //store it in the stack pointer
+    ADD         x7,x7,x1
+
+    BNE         AU1_SRC_LEFT_LOOP
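+
+    //What the loop above does, as a C sketch: SAO filters in place, so the
+    //last U/V pair of every row of this 16-wide strip is backed up before it
+    //is overwritten; the backup later serves as the left-neighbour column:
+    //  for(row = 0; row < ht; row++)
+    //      memcpy(au1_src_left_tmp + 2 * row,
+    //             pu1_src + row * src_strd + 14 + (wd - col), 2);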
+
+    ADD         x8,x0,x1                    //I *pu1_src + src_strd
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+
+    LD1         {v16.16b},[x8]              //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v17.8b},[x8]                        //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x8, x8,#8
+
+    ADD         x8,x8,#16                   //I
+    movi        v18.16b, #0
+    LDRH        w5,[x8]                     //I pu1_src_cpy[src_strd + 16]
+
+    mov         x10, x21                    //I Loads pu1_avail
+    mov         v18.4h[0], w5               //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    LDRB        w10,[x10,#2]                //I pu1_avail[2]
+
+    CMP         x10,#0                      //I
+    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+    BNE         SIGN_UP_CHANGE_DONE         //I
+
+    LDRB        w11,[x0]                    //I pu1_src_cpy[0]
+    SUB         x4,x12,x7                   //I ht_tmp - row
+
+    LDRB        w10,[x0,#1]                 //I pu1_src_cpy[1]
+    LSL         x4,x4,#1                    //I (ht_tmp - row) * 2
+
+    ADD         x9,x14,x4                   //I pu1_src_left_cpy[(ht_tmp - row) * 2]
+    sub         x13,x9,#2
+    LDRB        w5,[x13]                    //I load the value
+
+    SUB         x8,x11,x5                   //I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+    sub         x13,x9,#1
+    LDRB        w5,[x13]                    //I load the value
+
+    CMP         x8,#0                       //I
+    SUB         x4,x10,x5                   //I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+
+    movn        x20,#0
+    csel        x8, x20, x8,LT              //I
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+
+    CMP         x4,#0                       //I
+    mov         v14.8b[0], w8               //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+    movn        x20,#0
+    csel        x4, x20, x4,LT              //I
+
+    MOV         x20,#1
+    csel        x4, x20, x4,GT              //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+    mov         v14.8b[1], w4               //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE:
+    LD1         {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    cmhi        v20.16b,  v12.16b ,  v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    cmhi        v22.16b,  v18.16b ,  v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v22.16b,  v22.16b ,  v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v18.16b,  v0.16b ,  v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v18.16b,  v18.16b ,  v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    TBL         v18.16b, {v30.16b},v18.16b  //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    NEG         v14.16b, v22.16b            //I sign_up = vnegq_s8(sign_down)
+
+    //TBL v19.8b, {v30.16b},v19.8b                //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    EXT         v14.16b,  v14.16b ,  v14.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14)
+
+    Uxtl        v20.8h, v12.8b              //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    AND         v22.16b,  v18.16b ,  v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v23.d[0],v22.d[1]
+
+    Uxtl2       v18.8h, v12.16b             //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    UZP1        v31.8b, v22.8b, v23.8b
+    UZP2        v23.8b, v22.8b, v23.8b      //I
+    mov         v22.8b,v31.8b
+
+    TBL         v22.8b, {v6.16b},v22.8b     //I
+    TBL         v23.8b, {v7.16b},v23.8b     //I
+    ZIP1        v31.8b, v22.8b, v23.8b
+    ZIP2        v23.8b, v22.8b, v23.8b      //I
+    mov         v22.8b,v31.8b
+
+    mov         v12.16b, v16.16b            //I pu1_cur_row = pu1_next_row
+    SADDW       v20.8h,  v20.8h ,  v22.8b   //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    SADDW       v18.8h,  v18.8h ,  v23.8b   //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    SMAX        v18.8h,  v18.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    UMIN        v18.8h,  v18.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+    SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1
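+
+    //Per-row vector step, sketched in intrinsics-style C (this is the pattern
+    //that the I/II/III stages below software-pipeline across three rows):
+    //  sign_down = (cur < next) - (cur > next);                 // two cmhi + SUB, per byte
+    //  edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down]; // TBL
+    //  edge_idx &= au1_mask;                                    // drop unavailable CTB-edge pixels
+    //  out       = clip_u8(cur + offset[edge_idx]);             // SADDW / SMAX / UMIN
+    //  sign_up   = -sign_down rotated by one U/V pair;          // NEG + EXT #14
+    //where "next" is the bottom-right same-plane neighbour row (EXT #2) and
+    //offset is offset_tbl_u or offset_tbl_v depending on byte parity.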
+
+
+PU1_SRC_LOOP:
+    ADD         x8,x0,x1,LSL #1             //II *pu1_src + src_strd
+    xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])
+    ADD         x11,x8,x1                   //III *pu1_src + src_strd
+
+    LD1         {v16.16b},[x8]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v17.8b},[x8]                        //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x8, x8,#8
+    LD1         {v30.16b},[x11]             //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v31.8b},[x11]                    //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x11, x11,#8
+
+    ADD         x8,x8,#16                   //II
+    xtn2        v20.16b,  v18.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])
+    LDRH        w5,[x8]                     //II pu1_src_cpy[src_strd + 16]
+
+    ADD         x11,x11,#16                 //III
+    mov         v28.4h[0], w5               //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    LDRH        w4,[x11]                    //III pu1_src_cpy[src_strd + 16]
+
+    LDRB        w8,[x0,x1]                  //II pu1_src_cpy[0]
+    EXT         v28.16b,  v16.16b ,  v28.16b,#2 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+    SUB         x5,x12,x7                   //II ht_tmp - row
+
+    LSL         x5,x5,#1                    //II (ht_tmp - row) * 2
+    mov         v18.4h[0], w4               //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    ADD         x9,x14,x5                   //II pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    sub         x13,x9,#2
+    LDRB        w11,[x13]                   //II load the value
+    ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    SUB         x8,x8,x11                   //II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+    CMP         x8,#0                       //II
+    EXT         v18.16b,  v30.16b ,  v18.16b,#2 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+    LDRB        w11,[x0,#1]                 //II pu1_src_cpy[1]
+
+    movn        x20,#0
+    csel        x8, x20, x8,LT              //II
+    cmhi        v22.16b,  v12.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+
+    sub         x13,x9,#1
+    LDRB        w5,[x13]                    //II load the value
+    mov         v14.8b[0], w8               //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+    SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1
+
+    SUB         x11,x11,x5                  //II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+    cmhi        v24.16b,  v28.16b ,  v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    CMP         x11,#0                      //II
+
+    movn        x20,#0
+    csel        x11, x20, x11,LT            //II
+    SUB         v24.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+
+    LDRB        w4,[x0,x1]                  //III pu1_src_cpy[0]
+    LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    SUB         x5,x12,x7                   //III ht_tmp - row
+
+    ADD         x10,x0,x1
+    mov         v14.8b[1], w11              //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+    LSL         x5,x5,#1                    //III (ht_tmp - row) * 2
+
+    ADD         x9,x14,x5                   //III pu1_src_left_cpy[(ht_tmp - row) * 2]
+    ADD         v26.16b,  v0.16b ,  v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+    LDRB        w10,[x10,#1]                //III pu1_src_cpy[1]
+
+    sub         x13,x9,#2
+    LDRB        w5,[x13]                    //III load the value
+    ADD         v26.16b,  v26.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+    SUB         x4,x4,x5                    //III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+    mov         v22.d[1],v22.d[0]
+    CMP         x4,#0                       //III
+    sub         x13,x9,#1
+    LDRB        w9,[x13]                    //III load the value
+    TBL         v26.16b, {v22.16b},v26.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    NEG         v14.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
+
+    movn        x20,#0
+    csel        x4, x20, x4,LT              //III
+    SUB         x10,x10,x9                  //III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+    //TBL v27.8b, {v22.16b},v27.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    EXT         v14.16b,  v14.16b ,  v14.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14)
+
+    MOV         x20,#1
+    csel        x4, x20, x4,GT              //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+    AND         v26.16b,  v26.16b ,  v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+    CMP         x10,#0                      //III
+
+    mov         v27.d[0],v26.d[1]
+    UZP1        v31.8b, v26.8b, v27.8b
+    UZP2        v27.8b, v26.8b, v27.8b      //II
+    mov         v26.8b,v31.8b
+    mov         v14.8b[0], w4               //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+    movn        x20,#0
+    csel        x10, x20, x10,LT            //III
+    MOV         x20,#1
+    csel        x10, x20, x10,GT            //III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+    TBL         v24.8b, {v6.16b},v26.8b     //II
+    cmhi        v20.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    cmhi        v22.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    TBL         v25.8b, {v7.16b},v27.8b     //II
+    SUB         v22.16b,  v22.16b ,  v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    mov         v14.8b[1], w10              //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+    ZIP1        v31.8b, v24.8b, v25.8b
+    ZIP2        v25.8b, v24.8b, v25.8b      //II
+    mov         v24.8b,v31.8b
+
+    Uxtl        v28.8h, v12.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    ADD         v18.16b,  v0.16b ,  v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+
+    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    SADDW       v28.8h,  v28.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    ADD         v18.16b,  v18.16b ,  v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    TBL         v18.16b, {v20.16b},v18.16b  //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    NEG         v14.16b, v22.16b            //III sign_up = vnegq_s8(sign_down)
+
+    //TBL v19.8b, {v20.16b},v19.8b                //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    EXT         v14.16b,  v14.16b ,  v14.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14)
+
+    Uxtl2       v26.8h, v12.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    AND         v18.16b,  v18.16b ,  v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    mov         v19.d[0],v18.d[1]
+    UZP1        v31.8b, v18.8b, v19.8b
+    UZP2        v19.8b, v18.8b, v19.8b      //III
+    mov         v18.8b,v31.8b
+    TBL         v22.8b, {v6.16b},v18.8b     //III
+    SADDW       v26.8h,  v26.8h ,  v25.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    mov         v12.16b, v30.16b            //III pu1_cur_row = pu1_next_row
+    TBL         v23.8b, {v7.16b},v19.8b     //III
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    ZIP1        v31.8b, v22.8b, v23.8b
+    ZIP2        v23.8b, v22.8b, v23.8b      //III
+    mov         v22.8b,v31.8b
+    xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    xtn2        v28.16b,  v26.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
+    SADDW       v20.8h,  v20.8h ,  v22.8b   //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    Uxtl2       v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+    SADDW       v18.8h,  v18.8h ,  v23.8b   //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
+    SMAX        v18.8h,  v18.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    CMP         x7,#1
+
+    ST1         { v28.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    UMIN        v18.8h,  v18.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    BGT         PU1_SRC_LOOP                //If not equal jump to PU1_SRC_LOOP
+    BLT         INNER_LOOP_DONE
+
+    ADD         x8,x0,x1,LSL #1             //*pu1_src + src_strd
+    xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    LDRB        w11,[x0,x1]                 //pu1_src_cpy[0]
+    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x8, x8,#8
+    SUB         x4,x12,x7                   //ht_tmp - row
+
+    ADD         x8,x8,#16
+    xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
+    LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
+
+    LSL         x4,x4,#1                    //(ht_tmp - row) * 2
+    mov         v18.4h[0], w5               //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    ADD         x9,x14,x4                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    sub         x13,x9,#2
+    LDRB        w5,[x13]                    //load the value
+    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+    SUB         x8,x11,x5                   //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+    CMP         x8,#0
+    ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+    LD1         {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+
+    LDRB        w11,[x0,#1]                 //pu1_src_cpy[1]
+    mov         v14.8b[0], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+    sub         x13,x9,#1
+    LDRB        w5,[x13]                    //load the value
+
+    SUB         x4,x11,x5                   //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+    cmhi        v22.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    CMP         x4,#0
+
+    movn        x20,#0
+    csel        x4, x20, x4,LT
+    cmhi        v24.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    MOV         x20,#1
+    csel        x4, x20, x4,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+
+    mov         v14.8b[1], w4               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v26.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    mov         v30.d[1],v30.d[0]
+    TBL         v26.16b, {v30.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    //TBL v27.8b, {v30.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    Uxtl        v20.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    AND         v26.16b,  v26.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v27.d[0],v26.d[1]
+
+    Uxtl2       v18.8h, v12.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    UZP1        v31.8b, v26.8b, v27.8b
+    UZP2        v27.8b, v26.8b, v27.8b
+    mov         v26.8b,v31.8b
+
+    TBL         v24.8b, {v6.16b},v26.8b
+    TBL         v25.8b, {v7.16b},v27.8b
+    ZIP1        v31.8b, v24.8b, v25.8b
+    ZIP2        v25.8b, v24.8b, v25.8b
+    mov         v24.8b,v31.8b
+
+    SADDW       v20.8h,  v20.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    SADDW       v18.8h,  v18.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    SMAX        v18.8h,  v18.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+INNER_LOOP_DONE:
+    mov         w8, w25                     //Loads ht
+    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+
+    mov         x11, x17                    //Loads *pu1_src_left
+    xtn2        v20.16b,  v18.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+
+SRC_LEFT_LOOP:
+    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
+    SUBS        x8,x8,#2
+    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP
+
+    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
+    ST1         { v20.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    CMP         x6,#8                       //Check whether residue remains
+
+    BLT         RE_ASSINING_LOOP            //Jump to re-assigning loop
+    mov         w7, w24                     //Loads wd
+    mov         x0, x27                     //Loads *pu1_src
+    SUB         x7,x7,x6
+    ADD         x0,x0,x7
+    BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
+    BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
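+
+    //Width bookkeeping, as a sketch: x6 holds the remaining column count.
+    //After each 16-wide strip, col -= 16; then
+    //  col < 8  -> RE_ASSINING_LOOP (done)
+    //  col == 8 -> WIDTH_RESIDUE (8-wide tail)
+    //  col > 8  -> rewind pu1_src to pu1_src_org + (wd - col) and repeat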
+
+
+WD_16_HT_4_LOOP:
+    mov         x5, x21                     //Loads pu1_avail
+    mov         w7, w24                     //Loads wd
+    CMP         x6,x7                       //col == wd
+    LDRB        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    mov         v8.8b[1], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+    CMP         x6,#16                      //if(col == 16)
+    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
+    LDRB        w8,[x5,#1]                  //pu1_avail[1]
+    mov         v8.16b[14], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+    mov         v8.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+    LDRB        w8,[x5,#2]                  //pu1_avail[2]
+    CMP         x8,#0
+
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x8, x20, x8,EQ
+    csel        x8, x3, x8,NE               //pu1_src_top_cpy
+    SUB         x8,x8,#2                    //pu1_src - src_strd - 2
+    LD1         {v10.16b},[x8]              //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+    //SUB x8, x8,#8
+
+    ADD         x3,x3,#16
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+    mov         w4, w25                     //Loads ht
+    mov         x7, x24                     //Loads wd
+    SUB         x7,x7,x6                    //(wd - col)
+    ADD         x7,x7,#14                   //14 + (wd - col)
+    mov         x8, x26                     //Loads *pu1_src
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 14 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+    LDRH        w8,[x7]                     //load the value and increment by src_strd
+    STRH        w8,[x5],#2                  //store it in the stack pointer
+    ADD         x7,x7,x1
+
+    SUBS        x4,x4,#1                    //decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
+    //SUB x0, x0,#8
+
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    movi        v18.16b, #0
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+    movi        v18.16b, #0
+    ADD         x8,x0,x1                    //*pu1_src + src_strd
+    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x8, x8,#8
+
+    ADD         x8,x8,#16
+    LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
+    mov         v18.4h[0], w5               //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+
+    CMP         x7,x12
+    BLT         SIGN_UP_CHANGE_WD_16_HT_4
+    mov         x5, x21                     //Loads pu1_avail
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+    CMP         x5,#0
+    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+    LDRB        w8,[x0]                     //pu1_src_cpy[0]
+    SUB         x5,x12,x7                   //ht_tmp - row
+    LSL         x5,x5,#1                    //(ht_tmp - row) * 2
+    ADD         x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
+    sub         x13,x9,#2
+    LDRB        w5,[x13]                    //load the value
+    SUB         x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+    CMP         x8,#0
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+    mov         v14.8b[0], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+    LDRB        w8,[x0,#1]                  //pu1_src_cpy[1]
+    sub         x13,x9,#1
+    LDRB        w5,[x13]                    //load the value
+    SUB         x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+    CMP         x8,#0
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+    mov         v14.8b[1], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+    cmhi        v22.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    cmhi        v24.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v26.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    TBL         v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    //TBL v27.8b, {v22.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    AND         v26.16b,  v26.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v27.d[0],v26.d[1]
+
+    NEG         v14.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
+    EXT         v14.16b,  v14.16b ,  v14.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
+
+    UZP1        v31.8b, v26.8b, v27.8b
+    UZP2        v27.8b, v26.8b, v27.8b
+    mov         v26.8b,v31.8b
+    TBL         v24.8b, {v6.16b},v26.8b
+    TBL         v25.8b, {v7.16b},v27.8b
+    ZIP1        v31.8b, v24.8b, v25.8b
+    ZIP2        v25.8b, v24.8b, v25.8b
+    mov         v24.8b,v31.8b
+
+    Uxtl        v28.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    Uxtl2       v26.8h, v12.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    SADDW       v26.8h,  v26.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    xtn2        v28.16b,  v26.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    mov         v12.16b, v16.16b            //pu1_cur_row = pu1_next_row
+    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
+    BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
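+
+    //Design note: this path runs the same per-row step as PU1_SRC_LOOP but
+    //without the II/III software pipelining, since with ht <= 4 there are too
+    //few rows to keep a three-stage pipeline filled.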
+
+    mov         w8, w25                     //Loads ht
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+    mov         x11, x17                    //Loads *pu1_src_left
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
+    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
+
+    SUBS        x8,x8,#2
+    BNE         SRC_LEFT_LOOP_WD_16_HT_4
+
+
+    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
+    BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
+    BGT         WD_16_HT_4_LOOP
+
+
+WIDTH_RESIDUE:
+    mov         w7, w24                     //Loads wd
+    mov         x5, x21                     //Loads pu1_avail
+    CMP         x6,x7                       //wd_residue == wd
+    LDRB        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    mov         v8.8b[1], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+    LDRB        w8,[x5,#1]                  //pu1_avail[1]
+    mov         v8.8b[6], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+    mov         v8.8b[7], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+    LDRB        w8,[x5,#2]                  //pu1_avail[2]
+    CMP         x8,#0
+
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x8, x20, x8,EQ
+    csel        x8, x3, x8,NE
+    SUB         x8,x8,#2                    //pu1_src - src_strd - 2
+    LD1         {v10.16b},[x8]              //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
+    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
+    //SUB x8, x8,#8
+
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+    mov         w4, w25                     //Loads ht
+    mov         w7, w24                     //Loads wd
+    mov         x8, x26                     //Loads *pu1_src
+    SUB         x7,x7,#2                    //(wd - 2)
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 2)]
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+    LDRH        w8,[x7]                     //load the value and increment by src_strd
+    STRH        w8,[x5],#2                  //store it in the stack pointer
+    ADD         x7,x7,x1
+    SUBS        x4,x4,#1                    //decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
+
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
+    //SUB x0, x0,#8
+
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_RESIDUE:
+    movi        v18.16b, #0
+    ADD         x8,x0,x1                    //*pu1_src + src_strd
+    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x8, x8,#8
+
+    ADD         x8,x8,#16
+    LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
+    mov         v18.4h[0], w5               //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+
+    CMP         x7,x12
+    BLT         SIGN_UP_CHANGE_RESIDUE
+    mov         x5, x21                     //Loads pu1_avail
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+    CMP         x5,#0
+    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+    LDRB        w8,[x0]                     //pu1_src_cpy[0]
+    SUB         x5,x12,x7                   //ht_tmp - row
+    LSL         x5,x5,#1                    //(ht_tmp - row) * 2
+    ADD         x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
+    sub         x13,x9,#2
+    LDRB        w5,[x13]                    //load the value
+    SUB         x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+    CMP         x8,#0
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+    mov         v14.8b[0], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+    LDRB        w8,[x0,#1]                  //pu1_src_cpy[1]
+    sub         x13,x9,#1
+    LDRB        w5,[x13]                    //load the value
+    SUB         x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+    CMP         x8,#0
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+    mov         v14.8b[1], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+    cmhi        v22.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    cmhi        v24.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v26.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    mov         v22.d[1],v22.d[0]
+    TBL         v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    //TBL v27.8b, {v22.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    AND         v26.16b,  v26.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v27.d[0],v26.d[1]
+
+    NEG         v14.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
+    EXT         v14.16b,  v14.16b ,  v14.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
+
+    UZP1        v31.8b, v26.8b, v27.8b
+    UZP2        v27.8b, v26.8b, v27.8b
+    mov         v26.8b,v31.8b
+    TBL         v24.8b, {v6.16b},v26.8b
+    TBL         v25.8b, {v7.16b},v27.8b
+    ZIP1        v31.8b, v24.8b, v25.8b
+    ZIP2        v25.8b, v24.8b, v25.8b
+    mov         v24.8b,v31.8b
+
+    Uxtl        v28.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    ST1         {v28.8b},[x0],x1            //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    mov         v12.16b, v16.16b            //pu1_cur_row = pu1_next_row
+    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
+    BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to PU1_SRC_LOOP
+
+    mov         w8, w25                     //Loads ht
+    mov         x11, x17                    //Loads *pu1_src_left
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+
+SRC_LEFT_LOOP_RESIDUE:
+    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
+    SUBS        x8,x8,#2
+    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
+
+    BNE         SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+    mov         w8, w25                     //Loads ht
+
+    mov         x0, x26                     //Loads *pu1_src
+    SUB         x8,x8,#1                    //ht - 1
+
+    mov         w7, w24                     //Loads wd
+
+    LDRH        w9,[sp,#6]                  //load the saved u1_pos_0_0_tmp_u/_v pair
+    madd        x6, x8, x1, x7              //wd + (ht - 1) * src_strd
+
+    STRH        w9,[x0]                     //pu1_src_org[0] = u1_pos_0_0_tmp_u, pu1_src_org[1] = u1_pos_0_0_tmp_v
+    ADD         x6,x0,x6                    //pu1_src[wd + (ht - 1) * src_strd]
+
+    LDRH        w9,[sp,#8]                  //load the saved u1_pos_wd_ht_tmp_u/_v pair
+    ADD         x12,sp,#10
+    sub         x13,x6,#2
+    STRH        w9,[x13]                    //pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u/_v
+
+    mov         x4, x15                     //Loads pu1_src_top_left
+    LDRH        w10,[sp]                    //load u1_src_top_left_tmp from stack pointer
+    STRH        w10,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp
+    mov         x3, x22                     //Loads pu1_src_top
+
+SRC_TOP_LOOP:
+    LD1         {v0.8b},[x12],#8            //load au1_src_top_tmp[col]
+    SUBS        x7,x7,#8                    //Decrement the width
+    ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
+    BNE         SRC_TOP_LOOP
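+
+    //Epilogue, as a sketch: the scalar corner results computed up front are
+    //committed only now, after the vector loops, so those loops always read
+    //unmodified neighbours:
+    //  pu1_src_org[0], [1]                             = u1_pos_0_0_tmp_u/_v;
+    //  pu1_src_org[wd - 2 + (ht - 1) * src_strd], [+1] = u1_pos_wd_ht_tmp_u/_v;
+    //  *pu1_src_top_left                               = saved top-left U/V pair;
+    //  pu1_src_top[0 .. wd - 1]                        = au1_src_top_tmp (last row);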
+
+END_LOOPS:
+    ADD         sp,sp,#0xE0
+    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
+    ldp         x27, x28,[sp],#16
+    ldp         x25, x26,[sp],#16
+    ldp         x23, x24,[sp],#16
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class3.s b/common/arm64/ihevc_sao_edge_offset_class3.s
new file mode 100644
index 0000000..6c47abe
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class3.s
@@ -0,0 +1,887 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class3.s
+//*
+//* @brief
+//*  Contains function definitions for SAO (sample adaptive offset) edge
+//* offset of class 3 (45 degree diagonal). Functions are coded using NEON
+//* intrinsics and can be compiled using ARM RVCT
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
+//                              WORD32 src_strd,
+//                              UWORD8 *pu1_src_left,
+//                              UWORD8 *pu1_src_top,
+//                              UWORD8 *pu1_src_top_left,
+//                              UWORD8 *pu1_src_top_right,
+//                              UWORD8 *pu1_src_bot_left,
+//                              UWORD8 *pu1_avail,
+//                              WORD8 *pi1_sao_offset,
+//                              WORD32 wd,
+//                              WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 =>    *pu1_src
+//x1 =>    src_strd
+//x2 =>    *pu1_src_left
+//x3 =>    *pu1_src_top
+//x4 =>    *pu1_src_top_left
+//x5 =>    *pu1_src_top_right
+//x6 =>    *pu1_src_bot_left
+//x7 =>    *pu1_avail
+//sp =>    pi1_sao_offset, wd, ht (loaded from the stack below)
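+//
+//A minimal C sketch of the class 3 (45 degree diagonal) edge offset this
+//routine vectorizes; names mirror the reference decoder and the sketch is
+//illustrative only (interior pixels, with boundary handling elided):
+//
+//  for(row = 0; row < ht; row++, pu1_src += src_strd)
+//      for(col = 0; col < wd; col++)
+//      {
+//          WORD32 sgn_tr = SIGN(pu1_src[col] - pu1_src[col + 1 - src_strd]);
+//          WORD32 sgn_bl = SIGN(pu1_src[col] - pu1_src[col - 1 + src_strd]);
+//          WORD32 edge_idx = gi1_table_edge_idx[2 + sgn_tr + sgn_bl];
+//          pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx],
+//                               0, (1 << bit_depth) - 1);
+//      }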
+
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class3_av8
+
+ihevc_sao_edge_offset_class3_av8:
+
+
+    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+    stp         x23, x24,[sp,#-16]!
+
+    MOV         x19,x0                      //Store pu1_src in sp
+    MOV         x21,x6                      //Store pu1_src_bot_left in sp
+    MOV         x22,x3                      //Store pu1_src_top in sp
+    MOV         x23,x7                      //Store pu1_avail in sp
+    MOV         x24,x4                      //Store pu1_src_top_left in sp
+    MOV         x20,x5                      //Store pu1_src_top_right in sp
+    MOV         x13,x6                      //Store pu1_src_bot_left in sp
+
+    MOV         x5,x7                       //Loads pu1_avail
+
+    LDR         x6,[sp,#112]                //Loads pi1_sao_offset
+    LDR         w7,[sp,#120]                //Loads wd
+    LDR         w8,[sp,#128]                //Loads ht
+
+    MOV         x16,x7 // wd
+    MOV         x17,x8 // ht
+
+    SUB         x9,x7,#1                    //wd - 1
+
+    LDRB        w10,[x3,x9]                 //pu1_src_top[wd - 1]
+
+    MOV         x9,x7                       //Move width to x9 for loop count
+
+    SUB         sp,sp,#0xA0                 //Decrement the stack pointer to store some temp arr values
+
+    STRB        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 1]
+    SUB         x10,x8,#1                   //ht-1
+    madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
+    ADD         x12,sp,#0x02                //temp array
+
+AU1_SRC_TOP_LOOP:
+    LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
+    SUBS        x9,x9,#8                    //Decrement the loop count by 8
+    ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+    BNE         AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_5_LOOP:
+    LDRB        w9,[x5,#5]                  //pu1_avail[5]
+    CMP         x9,#0
+    SUB         x10,x7,#1                   //[wd - 1]
+    LDRB        w9,[x0,x10]                 //u1_pos_0_0_tmp = pu1_src[wd - 1]
+    BEQ         PU1_AVAIL_6_LOOP
+
+    MOV         x11,x20                     //Load pu1_src_top_right from sp
+    SUB         x10,x10,#1                  //[wd - 1 - 1]
+
+    LDRB        w11,[x11]                   //pu1_src_top_right[0]
+    SUB         x12,x9,x11                  //pu1_src[wd - 1] - pu1_src_top_right[0]
+
+    ADD         x11,x0,x1                   //pu1_src + src_strd
+
+    LDRB        w14,[x11,x10]               //pu1_src[wd - 1 - 1 + src_strd]
+    CMP         x12,#0
+    movn        x20,#0
+    csel        x12, x20, x12,LT
+    SUB         x11,x9,x14                  //pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]
+
+    MOV         x20,#1
+    csel        x12, x20, x12,GT            //SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
+    CMP         x11,#0
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+    ADD         x11,x12,x11                 //SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
+    ADD         x11,x11,#2                  //edge_idx
+
+    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x12,#0                      //0 != edge_idx
+    BEQ         PU1_AVAIL_6_LOOP
+    LDRSB       x10,[x6,x12]                //pi1_sao_offset[edge_idx]
+    ADD         x9,x9,x10                   //pu1_src[wd - 1] + pi1_sao_offset[edge_idx]
+    mov         x20,#255
+    cmp         x9,x20
+    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP:
+    LDRB        w10,[x5,#6]                 //pu1_avail[6]
+    SUB         x11,x8,#1                   //ht - 1
+
+    CMP         x10,#0
+    madd        x12, x11, x1, x0            //pu1_src[(ht - 1) * src_strd]
+
+    LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
+    BEQ         PU1_AVAIL_3_LOOP
+
+    MOV         x14,x13                     //Load pu1_src_bot_left from sp
+    SUB         x11,x12,x1                  //pu1_src[(ht - 1) * src_strd - src_strd]
+
+    LDRB        w14,[x14]                   //Load pu1_src_bot_left[0]
+    ADD         x11,x11,#1                  //pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+
+    LDRB        w11,[x11]                   //Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+    SUB         x14,x10,x14                 //pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
+
+    SUB         x11,x10,x11                 //pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+    CMP         x11,#0
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])
+
+    CMP         x14,#0
+    movn        x20,#0
+    csel        x14, x20, x14,LT
+    MOV         x20,#1
+    csel        x14, x20, x14,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
+
+    ADD         x11,x11,x14                 //Add 2 sign value
+
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+    ADD         x11,x11,#2                  //edge_idx
+
+    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x12,#0
+    BEQ         PU1_AVAIL_3_LOOP
+    LDRSB       x11,[x6,x12]                //pi1_sao_offset[edge_idx]
+    ADD         x10,x10,x11                 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+    mov         x20,#255
+    cmp         x10,x20
+    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_3_LOOP:
+    MOV         x21,x2
+    MOV         x12,x8                      //Move ht
+
+    MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
+    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
+    LDRB        w11,[x5,#3]                 //pu1_avail[3]
+
+    CMP         x11,#0
+    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
+    SUB         x20,x12,#1                  //ht_tmp--
+    csel        x12, x20, x12,EQ
+
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    CMP         x5,#0
+
+    ADD         x20,x0,x1                   //pu1_src += src_strd
+    csel        x0, x20, x0,EQ
+    LD1         {v7.8b},[x6]                //offset_tbl = vld1_s8(pi1_sao_offset)
+    SUB         x20,x12,#1                  //ht_tmp--
+    csel        x12, x20, x12,EQ
+
+    ADRP        x6, :got:gi1_table_edge_idx //table pointer
+    LDR         x6, [x6, #:got_lo12:gi1_table_edge_idx]
+
+    movi        v8.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
+    ADD         x20,x14,#1                  //pu1_src_left_cpy += 1
+    csel        x14, x20, x14,EQ
+
+    MOV         x15,x0                      //Store pu1_src in sp
+    LD1         {v6.8b},[x6]                //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    MOV         x6,x7                       //move wd to x6 loop_count
+
+    CMP         x7,#16                      //Compare wd with 16
+    BLT         WIDTH_RESIDUE               //If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
+    CMP         x8,#4                       //Compare ht with 4
+    BLE         WD_16_HT_4_LOOP             //If ht <= 4, jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+    MOV         x7,x16                      //Loads wd
+
+    MOV         x5,x23                      //Loads pu1_avail
+    CMP         x6,x7                       //col == wd
+    LDRb        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    CMP         x6,#16                      //if(col == 16)
+    BNE         SKIP_AU1_MASK_VAL
+    LDRB        w8,[x5,#1]                  //pu1_avail[1]
+    mov         v8.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+    LDRB        w8,[x5,#2]                  //pu1_avail[2]
+    CMP         x8,#0
+
+    MOV         x4,x17                      //Loads ht
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x8, x20, x8,EQ
+
+    csel        x8, x3, x8,NE
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+
+    MOV         x7,x16                      //Loads wd
+    ADD         x8,x8,#1                    //pu1_src - src_strd + 1
+
+    SUB         x7,x7,x6                    //(wd - col)
+    LD1         {v10.16b},[x8]              //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+    ADD         x3,x3,#16
+
+    MOV         x8,x19                      //Loads *pu1_src
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+    ADD         x7,x7,#15                   //15 + (wd - col)
+
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    SUB         x5,x5,#1
+
+AU1_SRC_LEFT_LOOP:
+    LDRB        w8,[x7]                     //load the value and increment by src_strd
+    ADD         x7,x7,x1
+    SUBS        x4,x4,#1                    //decrement the loop count
+    STRB        w8,[x5,#1]!                 //store it in the temp array on the stack
+    BNE         AU1_SRC_LEFT_LOOP
+
+    movi        v18.16b, #0
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+
+    ADD         x8,x0,x1                    //I *pu1_src + src_strd
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+
+    SUB         x5,x12,x7                   //I ht_tmp - row
+    LD1         {v16.16b},[x8]              //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    ADD         x8,x14,x5                   //I pu1_src_left_cpy[ht_tmp - row]
+
+    ADD         x8,x8,#1                    //I pu1_src_left_cpy[ht_tmp - row + 1]
+    LDRB        w8,[x8]
+
+    MOV         x5,x23                      //I Loads pu1_avail
+    mov         v18.16b[15], w8             //I vsetq_lane_u8
+    LDRB        w5,[x5,#2]                  //I pu1_avail[2]
+
+    EXT         v18.16b,  v18.16b ,  v16.16b,#15 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+    CMP         x5,#0                       //I
+    BNE         SIGN_UP_CHANGE_DONE         //I
+
+SIGN_UP_CHANGE:
+    LDRB        w8,[x0,#15]                 //I pu1_src_cpy[15]
+    SUB         x5,x0,x1                    //I pu1_src_cpy[16 - src_strd]
+
+    LDRB        w5,[x5,#16]                 //I load the value
+    SUB         x8,x8,x5                    //I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+    CMP         x8,#0                       //I
+    movn        x20,#0
+    csel        x8, x20, x8,LT              //I
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+    mov         v14.16b[15], w8             //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+
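+// For class 3 the "up" neighbour of lane 15 sits one sample to the right of
+// the 16 bytes just loaded, so that lane of sign_up is recomputed in scalar
+// code above; roughly (sketch, sign() as in the corner handling):
+//
+//     sign_up[15] = sign(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]);
+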
+SIGN_UP_CHANGE_DONE:
+    cmhi        v10.16b,  v12.16b ,  v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    cmhi        v18.16b,  v18.16b ,  v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v10.16b,  v18.16b ,  v10.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v18.16b,  v0.16b ,  v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v18.16b,  v18.16b ,  v10.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v18.16b, {v6.16b},v18.16b   //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    NEG         v14.16b, v10.16b            //I sign_up = vnegq_s8(sign_down)
+
+    EXT         v14.16b,  v14.16b ,  v14.16b,#1 //I sign_up = vextq_s8(sign_up, sign_up, 1)
+//  TBL v19.8b, {v6.16b},v19.8b                //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    Uxtl        v20.8h, v12.8b              //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    AND         v18.16b,  v18.16b ,  v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    TBL         v10.16b, {v7.16b},v18.16b   //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+    Uxtl2       v22.8h, v12.16b             //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    SADDW       v20.8h,  v20.8h ,  v10.8b   //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+//  TBL v11.8b, {v7.16b},v19.8b                    //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    mov         v12.16b, v16.16b
+    SADDW2      v22.8h,  v22.8h ,  v10.16b  //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    SMAX        v22.8h,  v22.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v22.8h,  v22.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1
+
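+// The I/II/III tags in this loop mark three software-pipelined iterations of
+// one per-row step. A single iteration in NEON-intrinsics C (a sketch;
+// vqtbl1q_s8 plays the role of the TBL instruction, and only the low half of
+// the widen/add/clamp is shown):
+//
+//     uint8x16_t cmp_gt = vcgtq_u8(pu1_cur_row, pu1_next_row_tmp);
+//     uint8x16_t cmp_lt = vcltq_u8(pu1_cur_row, pu1_next_row_tmp);
+//     int8x16_t sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt));
+//     int8x16_t edge_idx  = vaddq_s8(vaddq_s8(const_2, sign_up), sign_down);
+//     edge_idx = vandq_s8(vqtbl1q_s8(edge_idx_tbl,
+//                                    vreinterpretq_u8_s8(edge_idx)), au1_mask);
+//     int8x16_t offset = vqtbl1q_s8(offset_tbl, vreinterpretq_u8_s8(edge_idx));
+//     sign_up = vextq_s8(vnegq_s8(sign_down), vnegq_s8(sign_down), 1);
+//     int16x8_t lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)));
+//     lo = vaddw_s8(lo, vget_low_s8(offset));
+//     lo = vmaxq_s16(lo, const_min_clip);                 /* clamp at 0   */
+//     lo = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(lo),
+//                                          const_max_clip)); /* clamp at 255 */
+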
+PU1_SRC_LOOP:
+    ADD         x8,x0,x1,LSL #1             //II *pu1_src + src_strd
+    xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUB         x5,x12,x7                   //II ht_tmp - row
+
+    ADD         x4,x0,x1                    //II pu1_src_cpy[16 - src_strd]
+    xtn2        v20.16b,  v22.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])
+    ADD         x2,x8,x1                    //III *pu1_src + src_strd
+
+    LDRB        w11,[x4,#15]                //II pu1_src_cpy[15]
+    LD1         {v16.16b},[x8]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1
+
+    ADD         x8,x14,x5                   //II pu1_src_left_cpy[ht_tmp - row]
+    LD1         {v30.16b},[x2]              //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    LDRB        w8,[x8,#1]
+
+    LDRB        w4,[x0,#16]                 //II load the value
+    mov         v18.16b[15], w8             //II vsetq_lane_u8
+    SUB         x11,x11,x4                  //II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+
+    CMP         x11,#0                      //II
+    ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    SUB         x5,x12,x7                   //III ht_tmp - row
+
+    movn        x20,#0
+    csel        x11, x20, x11,LT            //II
+    EXT         v18.16b,  v18.16b ,  v16.16b,#15 //II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+
+    ADD         x8,x14,x5                   //III pu1_src_left_cpy[ht_tmp - row]
+    mov         v14.8b[15], w11             //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+    CMP         x7,#1                       //III
+
+    BNE         NEXT_ROW_ELSE_2             //III
+    MOV         x5,x23                      //III Loads pu1_avail
+    LDRB        w5,[x5,#3]                  //III pu1_avail[3]
+    CMP         x5,#0                       //III
+    SUB         x20,x2,#2                   //III pu1_src_cpy[src_strd - 1]
+    csel        x8, x20, x8,NE
+
+NEXT_ROW_ELSE_2:
+    LDRB        w8,[x8,#1]                  //III
+    cmhi        v24.16b,  v12.16b ,  v18.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    ADD         x5,x0,x1
+
+    LDRB        w2,[x5,#15]                 //III pu1_src_cpy[15]
+    cmhi        v26.16b,  v18.16b ,  v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    LDRB        w5,[x0,#16]                 //III load the value
+
+    SUB         x2,x2,x5                    //III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+    SUB         v24.16b,  v26.16b ,  v24.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    CMP         x2,#0                       //III
+
+    movn        x20,#0
+    csel        x2, x20, x2,LT              //III
+    mov         v18.16b[15], w8             //III vsetq_lane_u8
+    MOV         x20,#1
+    csel        x2, x20, x2,GT              //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+
+    SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
+    ADD         v26.16b,  v0.16b ,  v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+
+    NEG         v14.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
+    EXT         v18.16b,  v18.16b ,  v30.16b,#15 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+    ADD         v26.16b,  v26.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    EXT         v14.16b,  v14.16b ,  v14.16b,#1 //II sign_up = vextq_s8(sign_up, sign_up, 1)
+    TBL         v26.16b, {v6.16b},v26.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    cmhi        v10.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    mov         v14.16b[15], w2             //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+//  TBL v27.8b, {v6.16b},v27.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    cmhi        v18.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    Uxtl        v28.8h, v12.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    AND         v26.16b,  v26.16b ,  v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    SUB         v10.16b,  v18.16b ,  v10.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    TBL         v24.16b, {v7.16b},v26.16b   //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    ADD         v18.16b,  v0.16b ,  v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+
+    ADD         v18.16b,  v18.16b ,  v10.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
+//  TBL v25.8b, {v7.16b},v27.8b                    //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    NEG         v14.16b, v10.16b            //III sign_up = vnegq_s8(sign_down)
+
+    SADDW       v28.8h,  v28.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    TBL         v18.16b, {v6.16b},v18.16b   //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    EXT         v14.16b,  v14.16b ,  v14.16b,#1 //III sign_up = vextq_s8(sign_up, sign_up, 1)
+//  TBL v19.8b, {v6.16b},v19.8b                //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    Uxtl2       v26.8h, v12.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    AND         v18.16b,  v18.16b ,  v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    SADDW2      v26.8h,  v26.8h ,  v24.16b  //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    TBL         v10.16b, {v7.16b},v18.16b   //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    SADDW       v20.8h,  v20.8h ,  v10.8b   //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+//  TBL v11.8b, {v7.16b},v19.8b                    //III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    Uxtl2       v22.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
+    SADDW2      v22.8h,  v22.8h ,  v10.16b  //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    xtn2        v28.16b,  v26.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
+    SMAX        v22.8h,  v22.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    mov         v12.16b, v30.16b            //II pu1_cur_row = pu1_next_row
+    UMIN        v22.8h,  v22.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    CMP         x7,#1                       //III
+    ST1         { v28.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    BGT         PU1_SRC_LOOP                //If rows remain, jump back to PU1_SRC_LOOP
+    BLT         INNER_LOOP_DONE
+
+    ADD         x8,x0,x1,LSL #1             //*pu1_src + src_strd
+    xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
+    MOV         x5,x23                      //Loads pu1_avail
+
+    LDRB        w5,[x5,#3]                  //pu1_avail[3]
+    xtn2        v20.16b,  v22.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
+    CMP         x5,#0
+
+    ADD         x4,x0,x1                    //pu1_src_cpy[16 - src_strd]
+    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    LDRB        w5,[x0,#16]                 //load the value
+
+    BEQ         NEXT_ROW_ELSE_3
+    SUB         x8,x8,#1
+    LDRB        w8,[x8]                     //pu1_src_cpy[src_strd - 1]
+    B           NEXT_ROW_POINTER_ASSIGNED_3
+NEXT_ROW_ELSE_3:
+    SUB         x11,x12,x7                  //ht_tmp - row
+    ADD         x8,x14,x11                  //pu1_src_left_cpy[ht_tmp - row]
+    ADD         x8,x8,#1                    //pu1_src_left_cpy[ht_tmp - row + 1]
+    LDRB        w8,[x8]
+
+NEXT_ROW_POINTER_ASSIGNED_3:
+    LDRB        w11,[x4,#15]                //pu1_src_cpy[15]
+    mov         v18.16b[15], w8             //vsetq_lane_u8
+    SUB         x8,x11,x5                   //pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+
+    CMP         x8,#0
+    EXT         v18.16b,  v18.16b ,  v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+
+    ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    cmhi        v24.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+    cmhi        v26.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    mov         v14.16b[15], w8             //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+    SUB         v24.16b,  v26.16b ,  v24.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    Uxtl        v20.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    ADD         v26.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+
+    Uxtl2       v22.8h, v12.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    TBL         v26.16b, {v6.16b},v26.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+//  TBL v27.8b, {v6.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    AND         v26.16b,  v26.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    TBL         v24.16b, {v7.16b},v26.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+    SADDW       v20.8h,  v20.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+//  TBL v25.8b, {v7.16b},v27.8b                    //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    SADDW2      v22.8h,  v22.8h ,  v24.16b  //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    SMAX        v22.8h,  v22.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v22.8h,  v22.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+INNER_LOOP_DONE:
+    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    MOV         x8,x17                      //Loads ht
+
+    xtn2        v20.16b,  v22.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+
+    ST1         { v20.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    MOV         x2,x21                      //Loads *pu1_src_left
+SRC_LEFT_LOOP:
+    LDR         w7,[x5],#4                  //au1_src_left_tmp[row]
+    SUBS        x8,x8,#4
+    STR         w7,[x2],#4                  //pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP
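+
+// au1_src_left_tmp, filled during the column pass, is flushed back to
+// pu1_src_left four rows per iteration; in C terms (sketch, memcpy standing
+// in for the word copy):
+//
+//     for (int row = 0; row < ht; row += 4)
+//         memcpy(pu1_src_left + row, au1_src_left_tmp + row, 4);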
+
+    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
+    CMP         x6,#8                       //Check whether residue remains
+    BLT         RE_ASSINING_LOOP            //If fewer than 8 columns remain, jump to the re-assigning loop
+    MOV         x7,x16                      //Loads wd
+    MOV         x0,x15                      //Loads *pu1_src
+    SUB         x7,x7,x6
+    ADD         x0,x0,x7
+    BGT         WIDTH_LOOP_16               //If more than 8 columns remain, jump back to WIDTH_LOOP_16
+    BEQ         WIDTH_RESIDUE               //If exactly 8 columns remain, jump to the residue loop
+
+
+
+WD_16_HT_4_LOOP:
+    MOV         x5,x23                      //Loads pu1_avail
+    MOV         x7,x16                      //Loads wd
+    CMP         x6,x7                       //col == wd
+    LDRb        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    CMP         x6,#16                      //if(col == 16)
+    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
+    LDRB        w8,[x5,#1]                  //pu1_avail[1]
+    mov         v8.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+    LDRB        w8,[x5,#2]                  //pu1_avail[2]
+    CMP         x8,#0
+
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x8, x20, x8,EQ
+    csel        x8, x3, x8,NE
+    ADD         x8,x8,#1                    //pu1_src - src_strd + 1
+    LD1         {v10.16b},[x8]              //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+
+    ADD         x3,x3,#16
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+    MOV         x4,x17                      //Loads ht
+    MOV         x7,x16                      //Loads wd
+    SUB         x7,x7,x6                    //(wd - col)
+    ADD         x7,x7,#15                   //15 + (wd - col)
+    MOV         x8,x19                      //Loads *pu1_src
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
+    SUB         x5,x5,#1
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+    LDRB        w8,[x7]                     //load the value and increment by src_strd
+    ADD         x7,x7,x1
+    STRB        w8,[x5,#1]!                 //store it in the temp array on the stack
+    SUBS        x4,x4,#1                    //decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    movi        v18.16b, #0
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+    ADD         x8,x0,x1                    //*pu1_src + src_strd
+    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    MOV         x5,x23                      //Loads pu1_avail
+    LDRB        w5,[x5,#3]                  //pu1_avail[3]
+    CMP         x5,#0
+    BEQ         NEXT_ROW_ELSE_WD_16_HT_4
+    CMP         x7,#1
+    SUB         x8,x8,#1
+    LDRb        w20, [x8]                   //pu1_src_cpy[src_strd - 1]
+    csel        w8,w20,w8,EQ
+    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
+NEXT_ROW_ELSE_WD_16_HT_4:
+    SUB         x5,x12,x7                   //ht_tmp - row
+    ADD         x8,x14,x5                   //pu1_src_left_cpy[ht_tmp - row]
+    ADD         x8,x8,#1                    //pu1_src_left_cpy[ht_tmp - row + 1]
+    LDRB        w8,[x8]
+
+NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
+    mov         v18.16b[15], w8             //vsetq_lane_u8
+    EXT         v18.16b,  v18.16b ,  v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+    CMP         x7,x12
+    BNE         SIGN_UP_CHANGE_WD_16_HT_4
+    MOV         x5,x23                      //Loads pu1_avail
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+    CMP         x5,#0
+    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+    LDRB        w8,[x0,#15]                 //pu1_src_cpy[15]
+    ADD         x5,x0,#16                   //pu1_src_cpy[16]
+    SUB         x5,x5,x1                    //pu1_src_cpy[16 - src_strd]
+    LDRB        w5,[x5]                     //load the value
+    SUB         x8,x8,x5                    //pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+    CMP         x8,#0
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+    mov         v14.16b[15], w8             //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+    cmhi        v20.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    cmhi        v22.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v24.16b,  v22.16b ,  v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v26.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v26.16b, {v6.16b},v26.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+//  TBL v27.8b, {v6.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    AND         v26.16b,  v26.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    NEG         v14.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
+    EXT         v14.16b,  v14.16b ,  v14.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1)
+
+    TBL         v24.16b, {v7.16b},v26.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    Uxtl        v28.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+//  TBL v25.8b, {v7.16b},v27.8b                    //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+    Uxtl2       v30.8h, v12.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    SADDW2      v30.8h,  v30.8h ,  v24.16b  //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    SMAX        v30.8h,  v30.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v30.8h,  v30.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    xtn2        v28.16b,  v30.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    mov         v12.16b, v16.16b            //pu1_cur_row = pu1_next_row
+    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
+    BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+    MOV         x8,x17                      //Loads ht
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+    MOV         x2,x21                      //Loads *pu1_src_left
+SRC_LEFT_LOOP_WD_16_HT_4:
+    LDR         w7,[x5],#4                  //au1_src_left_tmp[row]
+    STR         w7,[x2],#4                  //pu1_src_left[row] = au1_src_left_tmp[row]
+    SUBS        x8,x8,#4
+    BNE         SRC_LEFT_LOOP_WD_16_HT_4
+
+    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
+    BLE         RE_ASSINING_LOOP            //If no columns remain, jump to the re-assigning loop
+    BGT         WD_16_HT_4_LOOP             //Otherwise jump back to WD_16_HT_4_LOOP
+
+
+WIDTH_RESIDUE:
+    MOV         x7,x16                      //Loads wd
+    MOV         x5,x23                      //Loads pu1_avail
+    CMP         x6,x7                       //wd_residue == wd
+    LDRb        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    LDRB        w8,[x5,#1]                  //pu1_avail[1]
+    mov         v8.8b[7], w8                //au1_mask = vset_lane_s8(pu1_avail[1], au1_mask, 7)
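+
+// The residue path works on 8 columns, so only the low half of au1_mask is
+// consulted: lane 0 carries pu1_avail[0] for the first column (else -1) and
+// lane 7 carries pu1_avail[1] for the last. Sketch (au1_mask8 and first_col
+// are illustrative names):
+//
+//     au1_mask8 = vset_lane_s8(first_col ? pu1_avail[0] : -1, au1_mask8, 0);
+//     au1_mask8 = vset_lane_s8(pu1_avail[1], au1_mask8, 7);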
+
+PU1_AVAIL_2_RESIDUE:
+    LDRB        w8,[x5,#2]                  //pu1_avail[2]
+    CMP         x8,#0
+
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x8, x20, x8,EQ
+    csel        x8, x3, x8,NE
+    ADD         x8,x8,#1                    //pu1_src - src_strd + 1
+    LD1         {v10.16b},[x8]              //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+
+
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+    MOV         x4,x17                      //Loads ht
+    MOV         x7,x16                      //Loads wd
+    MOV         x8,x19                      //Loads *pu1_src
+    SUB         x7,x7,#1                    //(wd - 1)
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 1)]
+    SUB         x5,x5,#1
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+    LDRB        w8,[x7]                     //load the value and increment by src_strd
+    ADD         x7,x7,x1
+    STRB        w8,[x5,#1]!                 //store it in the temp array on the stack
+    SUBS        x4,x4,#1                    //decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
+
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_RESIDUE:
+    movi        v18.16b, #0
+    ADD         x8,x0,x1                    //*pu1_src + src_strd
+    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    MOV         x5,x23                      //Loads pu1_avail
+    LDRB        w5,[x5,#3]                  //pu1_avail[3]
+    CMP         x5,#0
+    BEQ         NEXT_ROW_ELSE_RESIDUE
+    CMP         x7,#1
+    SUB         x8,x8,#1
+    LDRb        w20, [x8]                   //pu1_src_cpy[src_strd - 1]
+    csel        w8,w20,w8,EQ
+    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
+NEXT_ROW_ELSE_RESIDUE:
+    SUB         x5,x12,x7                   //ht_tmp - row
+    ADD         x8,x14,x5                   //pu1_src_left_cpy[ht_tmp - row]
+    ADD         x8,x8,#1                    //pu1_src_left_cpy[ht_tmp - row + 1]
+    LDRB        w8,[x8]
+
+NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
+    mov         v18.16b[15], w8             //vsetq_lane_u8
+    EXT         v18.16b,  v18.16b ,  v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+    CMP         x7,x12
+    BNE         SIGN_UP_CHANGE_RESIDUE
+    MOV         x5,x23                      //Loads pu1_avail
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+    CMP         x5,#0
+    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+    LDRB        w8,[x0,#15]                 //pu1_src_cpy[15]
+    ADD         x5,x0,#16                   //pu1_src_cpy[16]
+    SUB         x5,x5,x1                    //pu1_src_cpy[16 - src_strd]
+    LDRB        w5,[x5]                     //load the value
+    SUB         x8,x8,x5                    //pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+    CMP         x8,#0
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+    mov         v14.16b[15], w8             //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+    cmhi        v20.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    cmhi        v22.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v24.16b,  v22.16b ,  v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v26.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v26.16b, {v6.16b},v26.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+//  TBL v27.8b, {v6.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    AND         v26.16b,  v26.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+    NEG         v14.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
+    EXT         v14.16b,  v14.16b ,  v14.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1)
+
+    TBL         v24.8b, {v7.16b},v26.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+    Uxtl        v28.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    xtn         v30.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    ST1         {v30.8b},[x0],x1            //vst1_u8(pu1_src_cpy, pu1_cur_row)
+    mov         v12.16b, v16.16b            //pu1_cur_row = pu1_next_row
+    SUBS        x7,x7,#1
+    BNE         PU1_SRC_LOOP_RESIDUE
+
+    MOV         x8,x17                      //Loads ht
+    MOV         x2,x21                      //Loads *pu1_src_left
+    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
+
+SRC_LEFT_LOOP_RESIDUE:
+    LDR         w7,[x5],#4                  //au1_src_left_tmp[row]
+    SUBS        x8,x8,#4
+    STR         w7,[x2],#4                  //pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+    MOV         x7,x16                      //Loads wd
+    MOV         x0,x19                      //Loads *pu1_src
+
+    MOV         x11,x17                     //Loads ht
+    ADD         x8,x0,x7                    //pu1_src[wd]
+
+    MOV         x4,x24                      //Loads pu1_src_top_left
+    SUB         x11,x11,#1                  //ht - 1
+
+    SUB         x8,x8,#1
+    STRB        w9,[x8]                     //pu1_src_org[wd - 1] = u1_pos_0_0_tmp
+    ADD         x8,x8,#1
+    madd        x6, x11, x1, x0             //pu1_src_org[(ht - 1) * src_strd]
+
+    LDRB        w8,[sp]                     //load u1_src_top_left_tmp from stack pointer
+    ADD         x12,sp,#0x02
+
+    STRB        w10,[x6]                    //pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
+    STRB        w8,[x4]                     //*pu1_src_top_left = u1_src_top_left_tmp
+    MOV         x3,x22                      //Loads pu1_src_top
+
+SRC_TOP_LOOP:
+    LD1         {v0.8b},[x12],#8            //load au1_src_top_tmp[col]
+    SUBS        x7,x7,#8                    //Decrement the width
+    ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
+    BNE         SRC_TOP_LOOP
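+
+// RE_ASSINING_LOOP stitches the saved borders back after filtering: the two
+// scalar corner results, the saved top-left byte, and the snapshotted bottom
+// row, which becomes pu1_src_top for the CTB row below. Roughly (sketch):
+//
+//     pu1_src[wd - 1]              = u1_pos_0_0_tmp;
+//     pu1_src[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp;
+//     *pu1_src_top_left            = u1_src_top_left_tmp;
+//     for (int col = 0; col < wd; col += 8)
+//         memcpy(pu1_src_top + col, au1_src_top_tmp + col, 8);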
+
+END_LOOPS:
+    ADD         sp,sp,#0xA0
+    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
+    ldp         x23, x24,[sp], #16
+    ldp         x21, x22,[sp], #16
+    ldp         x19, x20,[sp], #16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
new file mode 100644
index 0000000..cf25102
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
@@ -0,0 +1,1155 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class3_chroma.s
+//*
+//* @brief
+//*  Contains function definitions for SAO edge offset of class 3 for chroma.
+//* Functions are coded in ARMv8 NEON assembly and can be compiled using ARM
+//* RVCT
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
+//                              WORD32 src_strd,
+//                              UWORD8 *pu1_src_left,
+//                              UWORD8 *pu1_src_top,
+//                              UWORD8 *pu1_src_top_left,
+//                              UWORD8 *pu1_src_top_right,
+//                              UWORD8 *pu1_src_bot_left,
+//                              UWORD8 *pu1_avail,
+//                              WORD8 *pi1_sao_offset_u,
+//                              WORD8 *pi1_sao_offset_v,
+//                              WORD32 wd,
+//                              WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 =>    *pu1_src
+//x1 =>    src_strd
+//x2 =>    *pu1_src_left
+//x3 =>    *pu1_src_top
+//x4 =>    *pu1_src_top_left
+//x5 =>    *pu1_avail
+//x6 =>    *pi1_sao_offset_u
+//x9 =>    *pi1_sao_offset_v
+//x7 =>    wd
+//x8 =>    ht
+
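+//Chroma rows interleave the two planes as U0 V0 U1 V1 ..., so every scalar
+//offset from the luma version doubles here: the right edge sits at bytes
+//wd - 2 (U) and wd - 1 (V), border snapshots use 16-bit loads, and U and V
+//get separate offset tables. Indexing sketch (illustrative):
+//
+//    u = pu1_src[2 * col];        /* U sample of chroma column col */
+//    v = pu1_src[2 * col + 1];    /* V sample of chroma column col */
+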
+.text
+.p2align 2
+.include "ihevc_neon_macros.s"
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class3_chroma_av8
+
+ihevc_sao_edge_offset_class3_chroma_av8:
+
+
+    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
+
+
+    ldr         x8,[sp,#0]
+    ldr         x9,[sp,#8]
+    ldr         w10,[sp,#16]
+    ldr         w11,[sp,#24]
+
+    push_v_regs
+    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+    stp         x23, x24,[sp,#-16]!
+    stp         x25, x26,[sp,#-16]!
+    stp         x27, x28,[sp,#-16]!
+
+    mov         x15,x4 // *pu1_src_top_left 0x28
+    mov         x16,x5 // *pu1_src_top_right 0x2c
+    mov         x17,x6 // *pu1_src_bot_left 0x30
+    mov         x21,x7 // *pu1_avail 0x34
+    mov         x22,x8 // *pi1_sao_offset_u 0x38
+    mov         x23,x9 // *pi1_sao_offset_v 0x3c
+    mov         x24,x10 // wd 0x40
+    mov         x25,x11 // ht 0x44
+
+
+    mov         w7, w24                     //Loads wd
+    mov         w8, w25                     //Loads ht
+    SUB         x9,x7,#2                    //wd - 2
+
+    mov         x4, x15                     //Loads pu1_src_top_left
+    LDRH        w10,[x3,x9]                 //pu1_src_top[wd - 2]
+
+    MOV         x9,x7                       //Move width to x9 for loop count
+
+    mov         x5, x21                     //Loads pu1_avail
+    mov         x6, x22                     //Loads pi1_sao_offset_u
+
+    mov         x22, x3                     //Store pu1_src_top in sp
+    SUB         sp,sp,#0xE0                 //Decrement the stack pointer to store some temp array values
+
+    STRH        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 2]
+    SUB         x10,x8,#1                   //ht-1
+    madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
+    ADD         x12,sp,#10                  //temp array
+
+AU1_SRC_TOP_LOOP:
+    LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
+    SUBS        x9,x9,#8                    //Decrement the loop count by 8
+    ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+    BNE         AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_5_LOOP_U:
+    LDRB        w9,[x5,#5]                  //pu1_avail[5]
+    CMP         x9,#0
+    SUB         x14,x7,#2                   //[wd - 2]
+    LDRB        w9,[x0,x14]                 //u1_pos_0_0_tmp_u = pu1_src[wd - 2]
+    SUB         x11,x7,#1                   //[wd - 1]
+    LDRB        w10,[x0,x11]                //u1_pos_0_0_tmp_v = pu1_src[wd - 1]
+    BEQ         PU1_AVAIL_6_LOOP_U
+
+    mov         x11, x16                    //Load pu1_src_top_right from sp
+    LDRB        w11,[x11]                   //pu1_src_top_right[0]
+    SUB         x12,x9,x11                  //pu1_src[wd - 2] - pu1_src_top_right[0]
+    CMP         x12,#0
+    movn        x20,#0
+    csel        x12, x20, x12,LT
+    MOV         x20,#1
+    csel        x12, x20, x12,GT            //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
+    ADD         x11,x0,x1                   //pu1_src + src_strd
+    SUB         x14,x14,#2                  //[wd - 2 - 2]
+    LDRB        w14,[x11,x14]               //pu1_src[wd - 2 - 2 + src_strd]
+    SUB         x11,x9,x14                  //pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
+    CMP         x11,#0
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
+    ADD         x11,x12,x11                 //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
+    ADD         x11,x11,#2                  //edge_idx
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x12,#0                      //0 != edge_idx
+    BEQ         PU1_AVAIL_5_LOOP_V
+    LDRSB       x11,[x6,x12]                //pi1_sao_offset_u[edge_idx]
+    ADD         x9,x9,x11                   //pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
+    mov         x20,#255
+    cmp         x9,x20
+    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_5_LOOP_V:
+
+    mov         x11, x16                    //Load pu1_src_top_right from sp
+    LDRB        w11,[x11,#1]                //pu1_src_top_right[1]
+    SUB         x12,x10,x11                 //pu1_src[wd - 1] - pu1_src_top_right[1]
+    CMP         x12,#0
+    movn        x20,#0
+    csel        x12, x20, x12,LT
+    MOV         x20,#1
+    csel        x12, x20, x12,GT            //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
+    ADD         x11,x0,x1                   //pu1_src + src_strd
+    SUB         x14,x7,#3                   //[wd - 1 - 2]
+    LDRB        w14,[x11,x14]               //pu1_src[wd - 1 - 2 + src_strd]
+    SUB         x11,x10,x14                 //pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
+    CMP         x11,#0
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
+    ADD         x11,x12,x11                 //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
+    ADD         x11,x11,#2                  //edge_idx
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x12,#0                      //0 != edge_idx
+    BEQ         PU1_AVAIL_6_LOOP_U
+    mov         x11, x23                    //Loads pi1_sao_offset_v
+    LDRSB       x11,[x11,x12]               //pi1_sao_offset_v[edge_idx]
+    ADD         x10,x10,x11                 //pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
+    mov         x20,#255
+    cmp         x10,x20
+    csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP_U:
+    STRB        w9,[sp,#6]
+    STRB        w10,[sp,#7]
+    mov         x26, x0                     //Store pu1_src in sp
+
+    LDRB        w10,[x5,#6]                 //pu1_avail[6]
+    CMP         x10,#0
+    SUB         x11,x8,#1                   //ht - 1
+    madd        x12, x11, x1, x0            //pu1_src[(ht - 1) * src_strd]
+    LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
+    LDRB        w9,[x12,#1]                 //u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
+    BEQ         PU1_AVAIL_3_LOOP
+
+    SUB         x11,x12,x1                  //pu1_src[(ht - 1) * src_strd - src_strd]
+    ADD         x11,x11,#2                  //pu1_src[(ht - 1) * src_strd +  2 - src_strd]
+    LDRB        w11,[x11]                   //Load pu1_src[(ht - 1) * src_strd +  2 - src_strd]
+    SUB         x11,x10,x11                 //pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd]
+    CMP         x11,#0
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd])
+
+    mov         x14, x17                    //Load pu1_src_bot_left from sp
+    LDRB        w14,[x14]                   //Load pu1_src_bot_left[0]
+    SUB         x14,x10,x14                 //pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
+    CMP         x14,#0
+    movn        x20,#0
+    csel        x14, x20, x14,LT
+    MOV         x20,#1
+    csel        x14, x20, x14,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
+
+    ADD         x11,x11,x14                 //Add 2 sign value
+    ADD         x11,x11,#2                  //edge_idx
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+    LDRSB       x14,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x14,#0
+    BEQ         PU1_AVAIL_6_LOOP_V
+    LDRSB       x11,[x6,x14]                //pi1_sao_offset_u[edge_idx]
+    ADD         x10,x10,x11                 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+    mov         x20,#255
+    cmp         x10,x20
+    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP_V:
+    ADD         x12,x12,#1                  //pu1_src[(ht - 1) * src_strd + 1]
+    SUB         x11,x12,x1                  //pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+    ADD         x11,x11,#2                  //pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+    LDRB        w11,[x11]                   //Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+    SUB         x11,x9,x11                  //pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
+    CMP         x11,#0
+    movn        x20,#0
+    csel        x11, x20, x11,LT
+    MOV         x20,#1
+    csel        x11, x20, x11,GT            //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
+
+    mov         x14, x17                    //Load pu1_src_bot_left from sp
+    LDRB        w14,[x14,#1]                //Load pu1_src_bot_left[1]
+    SUB         x14,x9,x14                  //pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
+    CMP         x14,#0
+    movn        x20,#0
+    csel        x14, x20, x14,LT
+    MOV         x20,#1
+    csel        x14, x20, x14,GT            //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])
+
+    ADD         x11,x11,x14                 //Add 2 sign value
+    ADD         x11,x11,#2                  //edge_idx
+    ADRP        x14, :got:gi1_table_edge_idx //table pointer
+    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
+    CMP         x12,#0
+    BEQ         PU1_AVAIL_3_LOOP
+    mov         x14, x23                    //Loads pi1_sao_offset_v
+    LDRSB       x11,[x14,x12]               //pi1_sao_offset_v[edge_idx]
+    ADD         x9,x9,x11                   //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+    mov         x20,#255
+    cmp         x9,x20
+    csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_3_LOOP:
+    STRB        w10,[sp,#8]
+    STRB        w9,[sp,#9]
+    mov         x27, x2                     //Store pu1_src_left in sp
+
+    MOV         x12,x8                      //Move ht
+    MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
+    LDRB        w11,[x5,#3]                 //pu1_avail[3]
+    CMP         x11,#0
+    BNE         PU1_AVAIL_2_LOOP
+    SUB         x12,x12,#1                  //ht_tmp--
+
+PU1_AVAIL_2_LOOP:
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+    CMP         x5,#0
+    BNE         PU1_AVAIL_2_LOOP_END
+
+    ADD         x0,x0,x1                    //pu1_src += src_strd
+    SUB         x12,x12,#1                  //ht_tmp--
+    ADD         x14,x14,#2                  //pu1_src_left_cpy += 2
+
+PU1_AVAIL_2_LOOP_END:
+    mov         x28, x0                     //Store pu1_src in sp
+    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
+    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
+    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+    LD1         {v6.8b},[x6]                //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+    mov         x6, x23                     //Loads pi1_sao_offset_v
+    LD1         {v7.8b},[x6]                //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+    ADRP        x2, :got:gi1_table_edge_idx //table pointer
+    LDR         x2, [x2, #:got_lo12:gi1_table_edge_idx]
+
+    //VLD1.8        D6,[x6]                        @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    movi        v8.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
+    MOV         x6,x7                       //move wd to x6 loop_count
+
+    CMP         x7,#16                      //Compare wd with 16
+    BLT         WIDTH_RESIDUE               //If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
+    CMP         x8,#4                       //Compare ht with 4
+    BLE         WD_16_HT_4_LOOP             //If ht <= 4, jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+    mov         w7, w24                     //Loads wd
+    CMP         x6,x7                       //col == wd
+    mov         x5, x21                     //Loads pu1_avail
+
+    LDRb        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE
+
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    LDRB        w11,[x5,#2]                 //pu1_avail[2]
+
+    CMP         x6,#16                      //if(col == 16)
+    mov         v8.8b[1], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+    BNE         SKIP_AU1_MASK_VAL
+    LDRB        w8,[x5,#1]                  //pu1_avail[1]
+    mov         v8.16b[14], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+    mov         v8.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+    CMP         x11,#0
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
+    //SUB x0, x0,#8
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x8, x20, x8,EQ
+    movi        v18.16b, #0
+    csel        x8, x3, x8,NE
+
+    ADD         x8,x8,#2                    //pu1_src - src_strd + 2
+    LD1         {v10.16b},[x8]              //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    //SUB x8, x8,#8
+    ADD         x3,x3,#16
+
+    mov         w4, w25                     //Loads ht
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    mov         w7, w24                     //Loads wd
+
+    SUB         x7,x7,x6                    //(wd - col)
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    ADD         x7,x7,#14                   //15 + (wd - col)
+
+    mov         x8, x26                     //Loads *pu1_src
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP:
+    LDRH        w8,[x7]                     //load the value and increment by src_strd
+    SUBS        x4,x4,#1                    //decrement the loop count
+
+    STRH        w8,[x5],#2                  //store it in the stack buffer
+    ADD         x7,x7,x1
+    BNE         AU1_SRC_LEFT_LOOP
+
+
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+    movi        v18.16b, #0                 //I
+    ADD         x11,x0,x1                   //I *pu1_src + src_strd
+
+    SUB         x5,x12,x7                   //I ht_tmp - row
+    LD1         {v16.16b},[x11]             //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v17.8b},[x11]                    //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x11, x11,#8
+    ADD         x8,x14,x5,LSL #1            //I pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    LDRH        w5,[x8,#2]                  //I
+    mov         v18.4h[7], w5               //I vsetq_lane_u8
+    mov         x11, x21                    //I Loads pu1_avail
+
+    LDRB        w11,[x11,#2]                //I pu1_avail[2]
+    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+    CMP         x11,#0                      //I
+    BNE         SIGN_UP_CHANGE_DONE         //I
+
+    LDRB        w8,[x0,#14]                 //I pu1_src_cpy[14]
+    SUB         x5,x0,x1                    //I
+
+    LDRB        w11,[x5,#16]                //I load the value pu1_src_cpy[16 - src_strd]
+
+    LDRB        w9,[x0,#15]                 //I pu1_src_cpy[15]
+    SUB         x8,x8,x11                   //I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+    LDRB        w10,[x5,#17]                //I load the value pu1_src_cpy[17 - src_strd]
+    CMP         x8,#0                       //I
+
+    movn        x20,#0
+    csel        x8, x20, x8,LT              //I
+    SUB         x9,x9,x10                   //I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+    CMP         x9,#0                       //I
+
+    movn        x20,#0
+    csel        x9, x20, x9,LT              //I
+    mov         v14.16b[14], w8             //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+    MOV         x20,#1
+    csel        x9, x20, x9,GT              //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+
+    mov         v14.16b[15], w9             //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE:
+    LD1         {v28.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    cmhi        v20.16b,  v12.16b ,  v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    cmhi        v22.16b,  v18.16b ,  v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v22.16b,  v22.16b ,  v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v18.16b,  v0.16b ,  v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v18.16b,  v18.16b ,  v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v18.16b, {v28.16b},v18.16b  //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    NEG         v14.16b, v22.16b            //I sign_up = vnegq_s8(sign_down)
+
+    //TBL v19.8b, {v28.16b},v19.8b                //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    EXT         v14.16b,  v14.16b ,  v14.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2)
+
+    Uxtl        v20.8h, v12.8b              //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    AND         v18.16b,  v18.16b ,  v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v19.d[0],v18.d[1]
+
+    UZP1        v31.8b, v18.8b, v19.8b      //I de-interleave U and V edge indices
+    UZP2        v19.8b, v18.8b, v19.8b      //I
+    mov         v18.8b,v31.8b
+    TBL         v22.8b, {v6.16b},v18.8b     //I
+    TBL         v23.8b, {v7.16b},v19.8b     //I
+    ZIP1        v31.8b, v22.8b, v23.8b      //I re-interleave the U and V offsets
+    ZIP2        v23.8b, v22.8b, v23.8b      //I
+    mov         v22.8b,v31.8b
+
+    Uxtl2       v18.8h, v12.16b             //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    SADDW       v20.8h,  v20.8h ,  v22.8b   //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    mov         v12.16b, v16.16b            //I pu1_cur_row = pu1_next_row
+    SADDW       v18.8h,  v18.8h ,  v23.8b   //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1
+    SMAX        v18.8h,  v18.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    UMIN        v18.8h,  v18.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+PU1_SRC_LOOP:
+    ADD         x11,x0,x1,LSL #1            //II *pu1_src + src_strd
+    xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUB         x5,x12,x7                   //II ht_tmp - row
+
+    ADD         x4,x0,x1                    //III *pu1_src + src_strd
+    xtn2        v20.16b,  v18.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])
+    ADD         x8,x14,x5,LSL #1            //II pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    LDRH        w9,[x8,#2]
+    LD1         {v16.16b},[x11]             //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v17.8b},[x11]                    //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x11, x11,#8
+    LDRB        w10,[x4,#14]                //II pu1_src_cpy[14]
+
+    LDRB        w8,[x4,#15]                 //II pu1_src_cpy[15]
+    mov         v28.4h[7], w9               //II vsetq_lane_u8
+    ADD         x4,x11,x1                   //III *pu1_src + src_strd
+
+    LDRB        w5,[x0,#17]                 //II load the value pu1_src_cpy[17 - src_strd]
+    LD1         {v30.16b},[x4]              //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v31.8b},[x4]                    //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x4, x4,#8
+    LDRB        w11,[x0,#16]                //II load the value pu1_src_cpy[16 - src_strd]
+
+    SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1
+    ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    SUB         x10,x10,x11                 //II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+    CMP         x10,#0                      //II
+    EXT         v28.16b,  v28.16b ,  v16.16b,#14 //II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+    SUB         x8,x8,x5                    //II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    movn        x20,#0
+    csel        x10, x20, x10,LT            //II
+    LD1         {v21.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    MOV         x20,#1
+    csel        x10, x20, x10,GT            //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+    CMP         x8,#0                       //II
+    mov         v14.8b[14], w10             //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+    movn        x20,#0
+    csel        x8, x20, x8,LT              //II
+
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    SUB         x10,x12,x7                  //III ht_tmp - row
+    mov         v14.8b[15], w8              //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+    ADD         x11,x14,x10,LSL #1          //III pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    CMP         x7,#1                       //III
+    cmhi        v22.16b,  v12.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+    BNE         NEXT_ROW_POINTER_ASSIGNED_2 //III
+
+    mov         x5, x21                     //III Loads pu1_avail
+    LDRB        w5,[x5,#3]                  //III pu1_avail[3]
+    CMP         x5,#0                       //III
+    SUB         x20,x4,#4                   //III pu1_src[src_strd - 2]
+    csel        x11, x20, x11,NE
+
+NEXT_ROW_POINTER_ASSIGNED_2:
+    LDRH        w5,[x11,#2]                 //III
+    cmhi        v24.16b,  v28.16b ,  v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    ADD         x11,x0,x1                   //III
+
+    LDRB        w9,[x11,#14]                //III pu1_src_cpy[14]
+    mov         v18.4h[7], w5               //III vsetq_lane_u8
+    LDRB        w8,[x11,#15]                //III pu1_src_cpy[15]
+
+    LDRB        w11,[x0,#16]                //III load the value pu1_src_cpy[16 - src_strd]
+    SUB         v24.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    LDRB        w10,[x0,#17]                //III load the value pu1_src_cpy[17 - src_strd]
+
+    SUB         x9,x9,x11                   //III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+    EXT         v18.16b,  v18.16b ,  v30.16b,#14 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+    SUB         x10,x8,x10                  //III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    CMP         x9,#0                       //III
+    ADD         v26.16b,  v0.16b ,  v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+    movn        x20,#0
+    csel        x9, x20, x9,LT              //III
+
+    MOV         x20,#1
+    csel        x9, x20, x9,GT              //III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+    ADD         v26.16b,  v26.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+    CMP         x10,#0                      //III
+
+    NEG         v14.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
+    TBL         v26.16b, {v21.16b},v26.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    movn        x20,#0
+    csel        x10, x20, x10,LT            //III
+    MOV         x20,#1
+    csel        x10, x20, x10,GT            //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+
+    EXT         v14.16b,  v14.16b ,  v14.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2)
+    //TBL v27.8b, {v21.16b},v27.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    cmhi        v22.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    mov         v14.16b[14], w9             //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+    AND         v26.16b,  v26.16b ,  v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v27.d[0],v26.d[1]
+
+    mov         v14.16b[15], w10            //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+    UZP1        v31.8b, v26.8b, v27.8b
+    UZP2        v27.8b, v26.8b, v27.8b      //II
+    mov         v26.8b,v31.8b
+
+    cmhi        v20.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    TBL         v24.8b, {v6.16b},v26.8b     //II
+    SUB         v22.16b,  v20.16b ,  v22.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v18.16b,  v0.16b ,  v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+    TBL         v25.8b, {v7.16b},v27.8b     //II
+    ADD         v18.16b,  v18.16b ,  v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    ZIP1        v31.8b, v24.8b, v25.8b
+    ZIP2        v25.8b, v24.8b, v25.8b      //II
+    mov         v24.8b,v31.8b
+
+    Uxtl        v28.8h, v12.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    TBL         v18.16b, {v20.16b},v18.16b  //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    NEG         v14.16b, v22.16b            //III sign_up = vnegq_s8(sign_down)
+
+    SADDW       v28.8h,  v28.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    //TBL v19.8b, {v20.16b},v19.8b                //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    EXT         v14.16b,  v14.16b ,  v14.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2)
+
+    Uxtl2       v26.8h, v12.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    AND         v18.16b,  v18.16b ,  v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v19.d[0],v18.d[1]
+
+    Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    UZP1        v31.8b, v18.8b, v19.8b
+    UZP2        v19.8b, v18.8b, v19.8b      //III
+    mov         v18.8b,v31.8b
+
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    TBL         v22.8b, {v6.16b},v18.8b     //III
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    SADDW       v26.8h,  v26.8h ,  v25.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    TBL         v23.8b, {v7.16b},v19.8b     //III
+    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    Uxtl2       v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    ZIP1        v31.8b, v22.8b, v23.8b
+    ZIP2        v23.8b, v22.8b, v23.8b      //III
+    mov         v22.8b,v31.8b
+
+    xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
+    SADDW       v20.8h,  v20.8h ,  v22.8b   //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    mov         v12.16b, v30.16b            //III pu1_cur_row = pu1_next_row
+    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    CMP         x7,#1                       //III
+
+    xtn2        v28.16b,  v26.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    SADDW       v18.8h,  v18.8h ,  v23.8b   //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    SMAX        v18.8h,  v18.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+    ST1         { v28.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    UMIN        v18.8h,  v18.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    BGT         PU1_SRC_LOOP                //If not equal jump to PU1_SRC_LOOP
+    BLT         INNER_LOOP_DONE
+
+
+    ADD         x11,x0,x1,LSL #1            //*pu1_src + src_strd
+    xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUB         x5,x12,x7                   //ht_tmp - row
+
+    ADD         x8,x14,x5,LSL #1            //pu1_src_left_cpy[(ht_tmp - row) * 2]
+    xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
+    CMP         x7,#1
+
+    LDRB        w4,[x0,#16]                 //load the value pu1_src_cpy[16 - src_strd]
+    LD1         {v16.16b},[x11]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v17.8b},[x11]                    //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x11, x11,#8
+    LDRB        w9,[x0,#17]                 //load the value pu1_src_cpy[17 - src_strd]
+
+    BNE         NEXT_ROW_POINTER_ASSIGNED_3
+    mov         x5, x21                     //Loads pu1_avail
+    LDRB        w5,[x5,#3]                  //pu1_avail[3]
+    CMP         x5,#0
+    SUB         x20,x11,#4                  //pu1_src[src_strd - 2]
+    csel        x8, x20, x8,NE
+
+NEXT_ROW_POINTER_ASSIGNED_3:
+    LDRH        w5,[x8,#2]
+    ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
+
+    SUB         x8,x8,x4                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+    mov         v18.4h[7], w5               //vsetq_lane_u8
+    LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
+
+    CMP         x8,#0
+    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+    SUB         x10,x10,x9                  //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+    LD1         {v28.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+    CMP         x10,#0
+    mov         v14.16b[14], w8             //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+    movn        x20,#0
+    csel        x10, x20, x10,LT
+
+    MOV         x20,#1
+    csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    mov         v14.16b[15], w10            //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+    cmhi        v20.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    cmhi        v22.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v22.16b,  v22.16b ,  v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v18.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v18.16b,  v18.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    TBL         v18.16b, {v28.16b},v18.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    //TBL v19.8b, {v28.16b},v19.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+    AND         v18.16b,  v18.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v19.d[0],v18.d[1]
+
+    Uxtl        v20.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    UZP1        v31.8b, v18.8b, v19.8b
+    UZP2        v19.8b, v18.8b, v19.8b
+    mov         v18.8b,v31.8b
+
+    TBL         v22.8b, {v6.16b},v18.8b
+    TBL         v23.8b, {v7.16b},v19.8b
+
+    Uxtl2       v18.8h, v12.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    ZIP1        v31.8b, v22.8b, v23.8b
+    ZIP2        v23.8b, v22.8b, v23.8b
+    mov         v22.8b,v31.8b
+
+    SADDW       v20.8h,  v20.8h ,  v22.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    SADDW       v18.8h,  v18.8h ,  v23.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+    SMAX        v18.8h,  v18.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+INNER_LOOP_DONE:
+
+    mov         w8, w25                     //Loads ht
+    xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+
+    LSL         x8,x8,#1
+    xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
+    mov         x11, x27                    //Loads *pu1_src_left
+
+SRC_LEFT_LOOP:
+    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
+    SUBS        x8,x8,#4
+    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP
+
+    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
+    ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    CMP         x6,#8                       //Check whether residue remains
+
+    BLT         RE_ASSINING_LOOP            //Jump to re-assigning loop
+    mov         w7, w24                     //Loads wd
+    mov         x0, x28                     //Loads *pu1_src
+    SUB         x7,x7,x6
+    ADD         x0,x0,x7
+    BGT         WIDTH_LOOP_16               //If more than 8 columns remain, jump back to WIDTH_LOOP_16
+    BEQ         WIDTH_RESIDUE               //If exactly 8 columns remain, jump to the residue loop
+
+WD_16_HT_4_LOOP:
+    mov         w7, w24                     //Loads wd
+
+    mov         x5, x21                     //Loads pu1_avail
+    CMP         x6,x7                       //col == wd
+
+    LDRb        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+    CMP         x6,#16                      //if(col == 16)
+    mov         v8.8b[1], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
+    LDRB        w8,[x5,#1]                  //pu1_avail[1]
+    mov         v8.16b[14], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+    mov         v8.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+    LDRB        w11,[x5,#2]                 //pu1_avail[2]
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x8, x20, x8,EQ
+
+    CMP         x11,#0
+    csel        x8, x3, x8,NE
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
+    //SUB x0, x0,#8
+    ADD         x8,x8,#2                    //pu1_src - src_strd + 2
+
+    ADD         x3,x3,#16
+    LD1         {v10.16b},[x8]              //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    //SUB x8, x8,#8
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+
+    mov         w4, w25                     //Loads ht
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+    mov         w7, w24                     //Loads wd
+
+    SUB         x7,x7,x6                    //(wd - col)
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    ADD         x7,x7,#14                   //15 + (wd - col)
+
+    mov         x8, x26                     //Loads *pu1_src
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+    LDRH        w8,[x7]                     //load the value and increment by src_strd
+    SUBS        x4,x4,#1                    //decrement the loop count
+
+    STRH        w8,[x5],#2                  //store it in the stack buffer
+    ADD         x7,x7,x1
+    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+    movi        v18.16b, #0
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+    ADD         x9,x0,x1                    //*pu1_src + src_strd
+
+    mov         x5, x21                     //Loads pu1_avail
+    LD1         {v16.16b},[x9]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v17.8b},[x9]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x9, x9,#8
+    LDRB        w5,[x5,#3]                  //pu1_avail[3]
+
+    SUB         x11,x12,x7                  //ht_tmp - row
+    ADD         x8,x14,x11,LSL #1           //pu1_src_left_cpy[(ht_tmp - row) * 2]
+    ADD         x8,x8,#2                    //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
+
+    CMP         x5,#0
+    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
+    CMP         x7,#1
+    SUB         x20,x9,#2                   //pu1_src[src_strd - 2]
+    csel        x8, x20, x8,EQ
+
+NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
+    LDRH        w5,[x8]
+    mov         v18.8h[7], w5               //vsetq_lane_u8
+    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+
+    CMP         x7,x12
+    BLT         SIGN_UP_CHANGE_WD_16_HT_4
+    mov         x5, x21                     //Loads pu1_avail
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+    CMP         x5,#0
+    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+    LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
+    SUB         x9,x0,x1
+
+    LDRB        w5,[x9,#16]                 //load the value pu1_src_cpy[16 - src_strd]
+
+    LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
+    SUB         x8,x8,x5                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+    LDRB        w11,[x9,#17]                //load the value pu1_src_cpy[17 - src_strd]
+    CMP         x8,#0
+
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+    SUB         x10,x10,x11                 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+    CMP         x10,#0
+    mov         v14.16b[14], w8             //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+    movn        x20,#0
+    csel        x10, x20, x10,LT
+
+    MOV         x20,#1
+    csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    mov         v14.16b[15], w10            //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    cmhi        v22.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    cmhi        v24.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v26.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    mov         v20.d[1],v20.d[0]           //duplicate the 8-entry edge_idx table into the upper half for the 16-byte TBL
+    NEG         v14.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
+    TBL         v26.16b, {v20.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+    //TBL v27.8b, {v20.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    EXT         v14.16b,  v14.16b ,  v14.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2)
+
+    Uxtl        v28.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    AND         v26.16b,  v26.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v27.d[0],v26.d[1]
+
+    UZP1        v31.8b, v26.8b, v27.8b
+    UZP2        v27.8b, v26.8b, v27.8b
+    mov         v26.8b,v31.8b
+    TBL         v24.8b, {v6.16b},v26.8b
+    TBL         v25.8b, {v7.16b},v27.8b
+    ZIP1        v31.8b, v24.8b, v25.8b
+    ZIP2        v25.8b, v24.8b, v25.8b
+    mov         v24.8b,v31.8b
+
+    Uxtl2       v30.8h, v12.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    mov         v12.16b, v16.16b            //pu1_cur_row = pu1_next_row
+    SADDW       v30.8h,  v30.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+    SMAX        v30.8h,  v30.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+    UMIN        v30.8h,  v30.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+    xtn2        v28.16b,  v30.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
+    ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+    BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+    mov         w8, w25                     //Loads ht
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+    mov         x11, x27                    //Loads *pu1_src_left
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
+    SUBS        x8,x8,#2
+    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP_WD_16_HT_4
+
+    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
+    BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
+    BGT         WD_16_HT_4_LOOP             //If columns remain, jump back to WD_16_HT_4_LOOP
+
+WIDTH_RESIDUE:
+    mov         w7, w24                     //Loads wd
+
+    mov         x5, x21                     //Loads pu1_avail
+    CMP         x6,x7                       //wd_residue == wd
+
+    LDRb        w20, [x5]                   //pu1_avail[0]
+    csel        w8,w20,w8,EQ
+
+    MOV         x20,#-1
+    csel        x8, x20, x8,NE
+    LDRB        w11,[x5,#1]                 //pu1_avail[1]
+
+    LDRB        w9,[x5,#2]                  //pu1_avail[2]
+    mov         v8.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    CMP         x9,#0
+
+    SUB         x20,x0,x1                   //pu1_src - src_strd
+    csel        x10, x20, x10,EQ
+    mov         v8.8b[1], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+    csel        x10, x3, x10,NE
+
+    ADD         x10,x10,#2                  //pu1_src - src_strd + 2
+    mov         v8.8b[6], w11               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+
+    mov         w4, w25                     //Loads ht
+    mov         v8.8b[7], w11               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+    mov         w7, w24                     //Loads wd
+
+    mov         x8, x26                     //Loads *pu1_src
+    LD1         {v10.16b},[x10]             //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    //LD1 {v11.8b},[x10]                    //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+    //SUB x10, x10,#8
+    SUB         x7,x7,#2                    //(wd - 2)
+
+    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 2)]
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+    LDRH        w8,[x7]                     //load the value and increment by src_strd
+    ADD         x7,x7,x1
+    STRH        w8,[x5],#2                  //store it in the stack buffer
+    SUBS        x4,x4,#1                    //decrement the loop count
+    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
+
+    LD1         {v12.16b},[x0]              //pu1_cur_row = vld1q_u8(pu1_src)
+    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
+    //SUB x0, x0,#8
+
+    movi        v18.16b, #0
+    cmhi        v14.16b,  v12.16b ,  v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+    cmhi        v16.16b,  v10.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+    SUB         v14.16b,  v16.16b ,  v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOV         x7,x12                      //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_RESIDUE:
+    ADD         x9,x0,x1                    //*pu1_src + src_strd
+
+    SUB         x11,x12,x7                  //ht_tmp - row
+    LD1         {v16.16b},[x9]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //LD1 {v17.8b},[x9]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+    //SUB x9, x9,#8
+    mov         x5, x21                     //Loads pu1_avail
+
+    LDRB        w5,[x5,#3]                  //pu1_avail[3]
+    ADD         x8,x14,x11,LSL #1           //pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+    CMP         x5,#0
+    ADD         x8,x8,#2                    //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
+
+    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
+    CMP         x7,#1
+    SUB         x20,x9,#2                   //pu1_src[src_strd - 2]
+    csel        x8, x20, x8,EQ
+
+NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
+    LDRB        w5,[x8]
+
+    LDRB        w8,[x8,#1]
+    mov         v18.16b[14], w5             //vsetq_lane_u8
+    CMP         x7,x12
+
+    mov         v18.16b[15], w8             //vsetq_lane_u8
+    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+
+    BLT         SIGN_UP_CHANGE_RESIDUE
+    mov         x5, x21                     //Loads pu1_avail
+    LDRB        w5,[x5,#2]                  //pu1_avail[2]
+    CMP         x5,#0
+    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+    LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
+    SUB         x9,x0,x1
+
+    LDRB        w5,[x9,#16]                 //load the value pu1_src_cpy[16 - src_strd]
+
+    LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
+    SUB         x8,x8,x5                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+    LDRB        w11,[x9,#17]                //load the value pu1_src_cpy[17 - src_strd]
+    CMP         x8,#0
+
+    movn        x20,#0
+    csel        x8, x20, x8,LT
+    SUB         x10,x10,x11                 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+    MOV         x20,#1
+    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+    CMP         x10,#0
+    mov         v14.16b[14], w8             //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+    movn        x20,#0
+    csel        x10, x20, x10,LT
+
+    MOV         x20,#1
+    csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    mov         v14.16b[15], w10            //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+    cmhi        v22.16b,  v12.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    cmhi        v24.16b,  v18.16b ,  v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+    ADD         v26.16b,  v0.16b ,  v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+    mov         v20.d[1],v20.d[0]           //duplicate the 8-entry edge_idx table into the upper half for the 16-byte TBL
+    NEG         v14.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
+    TBL         v26.16b, {v20.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+    //TBL v27.8b, {v20.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    EXT         v14.16b,  v14.16b ,  v14.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2)
+
+    Uxtl        v28.8h, v12.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    AND         v26.16b,  v26.16b ,  v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov         v27.d[0],v26.d[1]
+
+    UZP1        v31.8b, v26.8b, v27.8b
+    UZP2        v27.8b, v26.8b, v27.8b
+    mov         v26.8b,v31.8b
+    TBL         v24.8b, {v6.16b},v26.8b
+    TBL         v25.8b, {v7.16b},v27.8b
+    ZIP1        v31.8b, v24.8b, v25.8b
+    ZIP2        v25.8b, v24.8b, v25.8b
+    mov         v24.8b,v31.8b
+
+    mov         v12.16b, v16.16b            //pu1_cur_row = pu1_next_row
+    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
+    xtn         v30.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    ST1         {v30.8b},[x0],x1            //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to PU1_SRC_LOOP_RESIDUE
+
+    mov         w8, w25                     //Loads ht
+    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
+
+    mov         x11, x27                    //Loads *pu1_src_left
+
+SRC_LEFT_LOOP_RESIDUE:
+    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
+    SUBS        x8,x8,#2
+    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
+    BNE         SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+    mov         w7, w24                     //Loads wd
+    mov         w8, w25                     //Loads ht
+
+    mov         x0, x26                     //Loads *pu1_src
+    SUB         x10,x7,#2                   //wd - 2
+
+    LDRH        w9,[sp,#6]
+    SUB         x8,x8,#1                    //ht - 1
+
+    STRH        w9,[x0,x10]                 //pu1_src_org[wd - 2] = u1_pos_0_0_tmp
+    madd        x6, x8, x1, x0              //pu1_src[(ht - 1) * src_strd]
+
+    mov         x4, x15                     //Loads pu1_src_top_left
+
+    LDRH        w9,[sp,#8]
+    ADD         x12,sp,#10
+
+    STRH        w9,[x6]                     //pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
+
+    LDRH        w10,[sp]                    //load u1_src_top_left_tmp from stack pointer
+    STRH        w10,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp
+    mov         x3, x22                     //Loads pu1_src_top
+
+SRC_TOP_LOOP:
+    LD1         {v0.8b},[x12],#8            //pu1_src_top[col] = au1_src_top_tmp[col]
+    SUBS        x7,x7,#8                    //Decrement the width
+    ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
+    BNE         SRC_TOP_LOOP
+
+END_LOOPS:
+    ADD         sp,sp,#0xE0
+    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
+    ldp         x27, x28,[sp],#16
+    ldp         x25, x26,[sp],#16
+    ldp         x23, x24,[sp],#16
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/arm64/ihevc_weighted_pred_bi.s b/common/arm64/ihevc_weighted_pred_bi.s
new file mode 100644
index 0000000..6851cb4
--- /dev/null
+++ b/common/arm64/ihevc_weighted_pred_bi.s
@@ -0,0 +1,316 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_weighted_pred_bi.s
+//*
+//* @brief
+//*  contains function definitions for weighted prediction used in inter
+//* prediction
+//*
+//* @author
+//*  parthiban v
+//*
+//* @par list of functions:
+//*  - ihevc_weighted_pred_bi()
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*  does bi-weighted prediction on the arrays pointed to by pi2_src1 and
+//* pi2_src2 and stores the result at the location pointed to by pu1_dst.
+//* assumption: the function is optimized assuming width and height are
+//* multiples of 2.
+//*
+//* @par description:
+//*  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
+//* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
+//*
+//* @param[in] pi2_src1
+//*  pointer to source 1
+//*
+//* @param[in] pi2_src2
+//*  pointer to source 2
+//*
+//* @param[out] pu1_dst
+//*  pointer to destination
+//*
+//* @param[in] src_strd1
+//*  source stride 1
+//*
+//* @param[in] src_strd2
+//*  source stride 2
+//*
+//* @param[in] dst_strd
+//*  destination stride
+//*
+//* @param[in] wgt0
+//*  weight to be multiplied with source 1
+//*
+//* @param[in] off0
+//*  offset 0
+//*
+//* @param[in] wgt1
+//*  weight to be multiplied with source 2
+//*
+//* @param[in] off1
+//*  offset 1
+//*
+//* @param[in] shift
+//*  (14 bit depth) + log2_weight_denominator
+//*
+//* @param[in] lvl_shift1
+//*  added before shift and offset
+//*
+//* @param[in] lvl_shift2
+//*  added before shift and offset
+//*
+//* @param[in] ht
+//*  height of the source
+//*
+//* @param[in] wd
+//*  width of the source
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_weighted_pred_bi(word16 *pi2_src1,
+//                            word16 *pi2_src2,
+//                            uword8 *pu1_dst,
+//                            word32 src_strd1,
+//                            word32 src_strd2,
+//                            word32 dst_strd,
+//                            word32 wgt0,
+//                            word32 off0,
+//                            word32 wgt1,
+//                            word32 off1,
+//                            word32 shift,
+//                            word32 lvl_shift1,
+//                            word32 lvl_shift2,
+//                            word32 ht,
+//                            word32 wd)
+
+//**************variables vs registers*****************************************
+//    x0 => *pi2_src1
+//    x1 => *pi2_src2
+//    x2 => *pu1_dst
+//    x3 =>  src_strd1
+//    x4 =>  src_strd2
+//    x5 =>  dst_strd
+//    x6 =>  wgt0
+//    x7 =>  off0
+//    x8 =>  wgt1
+//    x9 =>  off1
+//    x10 =>  shift
+//    x11 =>  lvl_shift1
+//    x12 =>  lvl_shift2
+//    x14 =>  ht
+//    x7  =>  wd
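+
+//**************reference c sketch (illustrative)******************************
+//A minimal C model of the kernel below, assuming a clip_u8() helper that
+//clamps to [0, 255]; the assembly additionally processes four rows per
+//iteration and interleaves the NEON pipeline:
+//
+//    for(row = 0; row < ht; row++)
+//    {
+//        for(col = 0; col < wd; col++)
+//        {
+//            word32 tmp = (pi2_src1[col] + lvl_shift1) * wgt0
+//                       + (pi2_src2[col] + lvl_shift2) * wgt1
+//                       + ((off0 + off1 + 1) << (shift - 1));
+//            pu1_dst[col] = clip_u8(tmp >> shift);
+//        }
+//        pi2_src1 += src_strd1; pi2_src2 += src_strd2; pu1_dst += dst_strd;
+//    }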
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_weighted_pred_bi_av8
+
+.type ihevc_weighted_pred_bi_av8, %function
+
+ihevc_weighted_pred_bi_av8:
+
+    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
+
+    ldr         w8,[sp,#0]
+    ldr         w9,[sp,#8]
+    ldr         w10,[sp,#16]
+    ldr         w11,[sp,#24]
+    ldr         w12,[sp,#32]
+    ldr         w13,[sp,#40]
+    ldr         w14,[sp,#48]
+
+    sxtw        x8,w8
+    sxtw        x9,w9
+    sxtw        x10,w10
+    sxtw        x11,w11
+    sxtw        x12,w12
+
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+    stp         x23, x24,[sp,#-16]!
+    stp         x25, x26,[sp,#-16]!
+
+    mov         x15,x4 // src_strd2 40
+    mov         x16,x5 // dst_strd 44
+    mov         x17,x6 // wgt0 48
+    mov         x19,x7 // off0 52
+    mov         x20,x8 // wgt1 56
+    mov         x21,x9 // off1 60
+    mov         x22,x10 // shift 64
+    mov         x23,x11 // lvl_shift1 68
+    mov         x24,x12 // lvl_shift2 72
+    mov         x25,x13 // ht 76
+    mov         x26,x14 // wd 80
+
+    mov         x6,x17                      //load wgt0
+    mov         x11,x23                     //load lvl_shift1
+    mov         x12,x24                     //load lvl_shift2
+    mov         v7.h[0],w6                  //moved for scalar multiplication
+    mul         x4, x11 , x6                //lvl_shift1 * wgt0
+    mov         x8,x20                      //load wgt1
+    mov         x7,x19                      //load off0
+    mov         v7.h[1],w8                  //moved for scalar multiplication
+    madd        x4,x12,x8,x4                //(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
+    mov         x9,x21                      //load off1
+    add         x5,x7,x9                    //off0 + off1
+    mov         x10,x22                     //load shift
+    add         x5,x5,#1                    //off0 + off1 + 1
+    sub         x14,x10,#1                  //shift - 1
+    mov         x7,x26                      //load wd
+    lsl         x5,x5,x14                   //((off0 + off1 + 1) << (shift - 1))
+    dup         v28.4s,w10                  //vmovq_n_s32(0-shift)
+    add         x4,x4,x5                    //tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
+    dup         v30.4s,w4                   //vmovq_n_s32(tmp_lvl_shift)
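+    //v30 now holds the folded per-pixel constant:
+    //tmp_lvl_shift = lvl_shift1*wgt0 + lvl_shift2*wgt1 + ((off0 + off1 + 1) << (shift - 1))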
+    neg         v28.4s, v28.4s
+    mov         x4,x15                      //load src_strd2
+    lsl         x9,x7,#1                    //x9 = 2*wd (byte width of the 16-bit sources)
+    mov         x5,x16                      //load dst_strd
+    lsl         x3,x3,#1                    //src_strd1 in bytes (16-bit source)
+    mov         x14,x25                     //load ht
+    lsl         x4,x4,#1                    //src_strd2 in bytes (16-bit source)
+
+    cmp         x14,#0                      //check ht == 0
+    beq         end_loops                   //if equal, then end the function
+
+outer_loop:
+    cmp         x7,#0                       //check wd == 0
+    beq         end_loops                   //if equal, then end the function
+
+core_loop:
+    add         x6,x0,x3                    //pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2x because pi2_src1 is a 16-bit pointer)
+    add         x8,x1,x4                    //pi2_src_tmp2 = pi2_src2 + 2*src_strd2 (2x because pi2_src2 is a 16-bit pointer)
+    ld1         {v0.4h},[x0],#8             //load and increment the pi2_src1
+    add         x10,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
+    ld1         {v1.4h},[x1],#8             //load and increment the pi2_src2
+    smull       v4.4s, v0.4h, v7.4h[0]      //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
+    ld1         {v2.4h},[x6],x3             //load and increment the pi2_src_tmp1 ii iteration
+    smull       v8.4s, v1.4h, v7.4h[1]      //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
+    ld1         {v3.4h},[x8],x4             //load and increment the pi2_src_tmp2 ii iteration
+    add         v4.4s,  v4.4s ,  v8.4s      //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
+
+    ld1         {v0.4h},[x6],x3             //load and increment the pi2_src1 iii iteration
+    smull       v10.4s, v2.4h, v7.4h[0]     //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
+
+    ld1         {v1.4h},[x8],x4             //load and increment the pi2_src2 iii iteration
+    add         v4.4s,  v4.4s ,  v30.4s     //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+    smull       v14.4s, v0.4h, v7.4h[0]     //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
+
+    ld1         {v2.4h},[x6],x3             //load and increment the pi2_src_tmp1 iv iteration
+    smull       v12.4s, v3.4h, v7.4h[1]     //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
+    sshl        v4.4s,v4.4s,v28.4s          //vshlq_s32(i4_tmp1_t1, tmp_shift_t)
+
+    ld1         {v3.4h},[x8],x4             //load and increment the pi2_src_tmp2 iv iteration
+    add         v10.4s,  v10.4s ,  v12.4s   //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
+
+    sqxtun      v4.4h, v4.4s                //vqmovun_s32(sto_res_tmp1)
+    smull       v16.4s, v1.4h, v7.4h[1]     //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration
+
+    add         v10.4s,  v10.4s ,  v30.4s   //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
+    //mov v5, v4                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+    add         v14.4s,  v14.4s ,  v16.4s   //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration
+
+    sshl        v10.4s,v10.4s,v28.4s
+    //vshl.s32    q5,q5,q14                    //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
+    smull       v18.4s, v2.4h, v7.4h[0]     //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
+    uqxtn       v4.8b,v4.8h
+    //vqmovn.u16    d4,q2                        //vqmovn_u16(sto_res_tmp3)
+    add         v14.4s,  v14.4s ,  v30.4s   //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+
+    sqxtun      v10.4h, v10.4s              //vqmovun_s32(sto_res_tmp1) ii iteration
+    smull       v20.4s, v3.4h, v7.4h[1]     //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration
+
+    sshl        v14.4s,v14.4s,v28.4s
+    //vshl.s32    q7,q7,q14                    //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
+    //mov v11, v10                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
+
+    add         v18.4s,  v18.4s ,  v20.4s   //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+    sqxtun      v14.4h, v14.4s              //vqmovun_s32(sto_res_tmp1) iii iteration
+
+    add         v18.4s,  v18.4s ,  v30.4s   //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
+    st1         {v4.s}[0],[x2],#4           //store pu1_dst i iteration
+
+    uqxtn       v10.8b,v10.8h
+    //vqmovn.u16    d10,q5                        //vqmovn_u16(sto_res_tmp3) ii iteration
+    sshl        v18.4s,v18.4s,v28.4s
+    //vshl.s32    q9,q9,q14                    //vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
+    st1         {v10.s}[0],[x10],x5         //store pu1_dst ii iteration
+
+
+    //mov v15, v14                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
+    uqxtn       v14.8b,v14.8h
+    //vqmovn.u16    d14,q7                        //vqmovn_u16(sto_res_tmp3) iii iteration
+    sqxtun      v18.4h, v18.4s              //vqmovun_s32(sto_res_tmp1) iv iteration
+    //mov v19, v18                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+    st1         {v14.s}[0],[x10],x5         //store pu1_dst iii iteration
+    uqxtn       v18.8b,v18.8h
+    //vqmovn.u16    d18,q9                        //vqmovn_u16(sto_res_tmp3) iv iteration
+    subs        x7,x7,#4                    //decrement wd by 4 and check for 0
+    st1         {v18.s}[0],[x10],x5         //store pu1_dst iv iteration
+
+    bgt         core_loop                   //if greater than 0 repeat the core loop again
+
+end_core_loop:
+    sub         x20,x9,x3,lsl #2            //wd - 4*src_strd1 (16-bit units, negated below)
+    neg         x11, x20                    //4*src_strd1 - wd
+    subs        x14,x14,#4                  //decrement the ht by 4
+    sub         x20,x9,x4,lsl #2            //wd - 4*src_strd2 (16-bit units, negated below)
+    neg         x12, x20                    //4*src_strd2 - wd
+    add         x0,x0,x11                   //pi2_src1 += 4*src_strd1 - wd (byte offsets are doubled since pi2_src1 is a 16-bit pointer)
+    asr         x7,x9,#1                    //restore x7 = wd
+    add         x1,x1,x12                   //pi2_src2 += 4*src_strd2 - wd
+    sub         x20,x7,x5,lsl #2            //wd - 4*dst_strd (negated below)
+    neg         x10, x20                    //4*dst_strd - wd
+    add         x2,x2,x10                   //pu1_dst += 4*dst_strd - wd
+    bgt         core_loop                   //if ht is greater than 0, jump back to core_loop
+
+end_loops:
+    // ldmfd sp!,{x4-x12,x15}              //reload the registers from sp
+    ldp         x25, x26,[sp],#16
+    ldp         x23, x24,[sp],#16
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_weighted_pred_bi_default.s b/common/arm64/ihevc_weighted_pred_bi_default.s
new file mode 100644
index 0000000..07fb4ce
--- /dev/null
+++ b/common/arm64/ihevc_weighted_pred_bi_default.s
@@ -0,0 +1,541 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_weighted_pred_bi_default.s
+//*
+//* @brief
+//*  contains function definitions for weighted prediction used in inter
+//* prediction
+//*
+//* @author
+//*  parthiban v
+//*
+//* @par list of functions:
+//*  - ihevc_weighted_pred_bi_default()
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*  does default bi-weighted prediction on the arrays pointed to by pi2_src1
+//* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
+//* assumption: the function is optimized assuming width and height are
+//* multiples of 2.
+//*
+//* @par description:
+//*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+//* >> shift,  where shift = 15 - bitdepth
+//*
+//* @param[in] pi2_src1
+//*  pointer to source 1
+//*
+//* @param[in] pi2_src2
+//*  pointer to source 2
+//*
+//* @param[out] pu1_dst
+//*  pointer to destination
+//*
+//* @param[in] src_strd1
+//*  source stride 1
+//*
+//* @param[in] src_strd2
+//*  source stride 2
+//*
+//* @param[in] dst_strd
+//*  destination stride
+//*
+//* @param[in] lvl_shift1
+//*  added before shift and offset
+//*
+//* @param[in] lvl_shift2
+//*  added before shift and offset
+//*
+//* @param[in] ht
+//*  height of the source
+//*
+//* @param[in] wd
+//*  width of the source
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
+//                                    word16 *pi2_src2,
+//                                    uword8 *pu1_dst,
+//                                    word32 src_strd1,
+//                                    word32 src_strd2,
+//                                    word32 dst_strd,
+//                                    word32 lvl_shift1,
+//                                    word32 lvl_shift2,
+//                                    word32 ht,
+//                                    word32 wd)
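+
+// a minimal scalar c sketch of the computation above (illustrative only, not
+// part of the build; clip_u8() is shorthand for clamping to [0, 255], which
+// the assembly gets from sqshrun):
+//
+//     for(row = 0; row < ht; row++)
+//         for(col = 0; col < wd; col++)
+//         {
+//             word32 tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1)
+//                        + (pi2_src2[row * src_strd2 + col] + lvl_shift2)
+//                        + (1 << 6);               // shift - 1 = 6 for 8 bit
+//             pu1_dst[row * dst_strd + col] = clip_u8(tmp >> 7);
+//         }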
+
+//**************variables vs registers*****************************************
+//    x0 => *pi2_src1
+//    x1 => *pi2_src2
+//    x2 => *pu1_dst
+//    x3 =>  src_strd1
+//    x4 =>  src_strd2
+//    x5 =>  dst_strd
+//    x6 =>  lvl_shift1
+//    x7 =>  lvl_shift2
+//    x8 =>  ht
+//    x9 =>  wd
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_weighted_pred_bi_default_av8
+
+.type ihevc_weighted_pred_bi_default_av8, %function
+
+ihevc_weighted_pred_bi_default_av8:
+
+    ldr         w8,[sp,#0]
+    ldr         w9,[sp,#8]
+
+    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+
+    mov         x15,x4 // src_strd2 40
+    mov         x16,x5 // dst_strd 44
+    mov         x17,x6 // lvl_shift1 48
+    mov         x19,x7 // lvl_shift2 52
+    mov         x20,x8 // ht 56
+    mov         x21,x9 // wd 60
+
+    mov         x4,x15                      //load src_strd2
+    lsl         x3,x3,#1
+    mov         x5,x16                      //load dst_strd
+    mov         x6,x17                      //load lvl_shift1
+    lsl         x4,x4,#1
+    mov         x7,x19                      //load lvl_shift2
+    mov         x8,x20                      //load ht
+    mov         x9,x21                      //load wd
+    dup         v4.8h,w6                    //lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1)
+    dup         v6.8h,w7                    //lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2)
+    movi        v0.8h, #0x40                //tmp_lvl_shift = 1 << (shift - 1)
+    add         v4.8h,  v4.8h,v6.8h
+    add         v0.8h,  v0.8h ,  v4.8h
+//   vmvn.i32    v2.8h,#0x6                         @vmovq_n_s32(tmp_shift)
+    lsl         x6,x9,#1
+    sub         x20,x6,x3,lsl #2            //4*src_strd1 - wd
+    neg         x7, x20
+    sub         x20,x6,x4,lsl #2            //4*src_strd2 - wd
+    neg         x10, x20
+    //asr            x6,#1
+    //rsb            x6,x6,x5,lsl #2             @4*dst_strd - wd
+
+    cmp         x8,#0                       //check ht == 0
+    beq         end_loops                   //if equal, then end the function
+
+chroma_decision:
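+    // ht and wd only take small power-of-two values here, so (ht | wd)
+    // identifies the block: 8|2 = 10 selects the chroma 8x2 path and
+    // 4|2 = 6 the chroma 4x2 path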
+    orr         x14,x8,x9
+    cmp         x14,#10
+    beq         outer_loop_chroma_8x2
+
+    cmp         x14,#6
+    beq         outer_loop_chroma_4x2
+
+
+luma_decision:
+    cmp         x9,#24
+    beq         outer_loop_8
+
+    cmp         x9,#16
+    bge         outer_loop_16
+
+    cmp         x9,#12
+    beq         outer_loop_4
+
+    cmp         x9,#8
+    bge         outer_loop_8
+
+
+
+
+
+
+outer_loop_4:
+    cmp         x9,#0                       //check wd == 0
+    beq         end_loops                   //if equal, then end the function
+
+core_loop_4:
+    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+    ld1         {v6.4h},[x0],#8             //load and increment the pi2_src1
+    add         x14,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
+    ld1         {v7.4h},[x1],#8             //load and increment the pi2_src2
+    ld1         {v8.4h},[x11],x3            //load and increment the pi2_src1 ii iteration
+    sqadd       v18.4h,v6.4h,v7.4h
+    sqadd       v18.4h,v18.4h,v0.4h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+    ld1         {v9.4h},[x12],x4            //load and increment the pi2_src2 ii iteration
+    sqadd       v20.4h,v8.4h,v9.4h          //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+    sqadd       v19.4h,v20.4h,v0.4h         //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+    mov         v18.d[1],v19.d[0]
+    sqshrun     v20.8b, v18.8h,#7
+    ld1         {v22.4h},[x11],x3           //load and increment the pi2_src1 iii iteration
+    ld1         {v23.4h},[x12],x4           //load and increment the pi2_src2 iii iteration
+    sqadd       v30.4h,v22.4h,v23.4h
+    sqadd       v30.4h,v30.4h,v0.4h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+    ld1         {v24.4h},[x11],x3           //load and increment the pi2_src1 iv iteration
+    ld1         {v25.4h},[x12],x4           //load and increment the pi2_src2 iv iteration
+    sqadd       v18.4h,v24.4h,v25.4h        //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+    sqadd       v31.4h,v18.4h,v0.4h
+    mov         v30.d[1],v31.d[0]
+    st1         {v20.s}[0],[x2],#4          //store pu1_dst i iteration
+    st1         {v20.s}[1],[x14],x5         //store pu1_dst ii iteration
+    sqshrun     v30.8b, v30.8h,#7
+    st1         {v30.s}[0],[x14],x5         //store pu1_dst iii iteration
+    subs        x9,x9,#4                    //decrement wd by 4 and check for 0
+    st1         {v30.s}[1],[x14],x5         //store pu1_dst iv iteration
+    bgt         core_loop_4                 //if greater than 0 repeat the core loop again
+
+end_core_loop_4:
+
+    subs        x8,x8,#4                    //decrement the ht by 4
+
+    add         x0,x0,x7                    //pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
+    asr         x9,x6,#1
+    add         x1,x1,x10                   //pi2_src2 + 4*src_strd2 - 2*wd
+    sub         x20,x9,x5,lsl #2            //4*dst_strd - wd
+    neg         x14, x20
+    add         x2,x2,x14                   //pu1_dst + 4*dst_strd - wd
+    bgt         core_loop_4                 //if ht is greater than 0, go back to core_loop_4
+
+    b           end_loops
+
+
+// this is only for chroma module with input 2x2
+outer_loop_chroma_4x2:
+    cmp         x9,#0                       //check wd == 0
+    beq         end_loops                   //if equal, then end the function
+    sub         x20,x6,x3,lsl #1            //2*src_strd1 - wd
+    neg         x7, x20
+    sub         x20,x6,x4,lsl #1            //2*src_strd2 - wd
+    neg         x10, x20
+core_loop_chroma_4x2:
+    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+    ld1         {v6.4h},[x0],#8             //load and increment the pi2_src1
+    add         x14,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
+    ld1         {v7.4h},[x1],#8             //load and increment the pi2_src2
+    ld1         {v8.4h},[x11],x3            //load and increment the pi2_src1 ii iteration
+    sqadd       v18.4h,v6.4h,v7.4h
+    sqadd       v18.4h,v18.4h,v0.4h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+    ld1         {v9.4h},[x12],x4            //load and increment the pi2_src2 ii iteration
+    sqadd       v20.4h,v8.4h,v9.4h          //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+    sqadd       v19.4h,v20.4h,v0.4h         //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+    mov         v18.d[1],v19.d[0]
+    sqshrun     v20.8b, v18.8h,#7
+    st1         {v20.s}[0],[x2],#4          //store pu1_dst i iteration
+    st1         {v20.s}[1],[x14],x5         //store pu1_dst ii iteration
+
+    subs        x9,x9,#4                    //decrement wd by 4 and check for 0
+
+    bgt         core_loop_chroma_4x2        //if greater than 0 repeat the core loop again
+
+end_core_loop_chroma_4x2:
+
+    subs        x8,x8,#2                    //decrement the ht by 2
+
+    add         x0,x0,x7                    //pi2_src1 + 2*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
+    asr         x9,x6,#1
+    add         x1,x1,x10                   //pi2_src2 + 2*src_strd2 - 2*wd
+    sub         x20,x9,x5,lsl #1            //2*dst_strd - wd
+    neg         x14, x20
+    add         x2,x2,x14                   //pu1_dst + 2*dst_strd - wd
+    bgt         core_loop_chroma_4x2        //if ht is greater than 0, go back to core_loop_chroma_4x2
+
+    b           end_loops
+
+
+
+outer_loop_8:
+    cmp         x9,#0                       //check wd == 0
+    beq         end_loops                   //if equal, then end the function
+    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+core_loop_8:
+
+    ld1         { v24.8h},[x0],#16          //load and increment the pi2_src1
+    add         x14,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
+    ld1         { v26.8h},[x1],#16          //load and increment the pi2_src2
+    sqadd       v24.8h,v24.8h,v26.8h
+    ld1         { v28.8h},[x11],x3          //load and increment the pi2_src1 ii iteration
+    sqadd       v24.8h,v24.8h,v0.8h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+    ld1         { v30.8h},[x12],x4          //load and increment the pi2_src2 ii iteration
+    ld1         { v16.8h},[x11],x3          //load and increment the pi2_src1 iii iteration
+    sqadd       v22.8h,v28.8h,v30.8h        //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+    ld1         { v18.8h},[x12],x4          //load and increment the pi2_src2 iii iteration
+    sqadd       v22.8h,v22.8h,v0.8h         //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+    sqshrun     v20.8b, v24.8h,#7
+    ld1         { v12.8h},[x11],x3          //load and increment the pi2_src1 iv iteration
+    sqadd       v30.8h,v16.8h,v18.8h
+    sqshrun     v21.8b, v22.8h,#7
+    ld1         { v14.8h},[x12],x4          //load and increment the pi2_src2 iv iteration
+    sqadd       v30.8h,v30.8h,v0.8h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+    st1         {v20.2s},[x2],#8            //store pu1_dst i iteration
+    sqadd       v8.8h,v12.8h,v14.8h         //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+    st1         {v21.2s},[x14],x5           //store pu1_dst ii iteration
+    sqadd       v8.8h,v8.8h,v0.8h
+    sqshrun     v30.8b, v30.8h,#7
+    sqshrun     v31.8b, v8.8h,#7
+    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+    st1         {v30.2s},[x14],x5           //store pu1_dst iii iteration
+    subs        x9,x9,#8                    //decrement wd by 8 and check for 0
+    st1         {v31.2s},[x14],x5           //store pu1_dst iv iteration
+    bgt         core_loop_8                 //if greater than 0 repeat the core loop again
+
+end_core_loop_8:
+
+    subs        x8,x8,#4                    //decrement the ht by 4
+
+    add         x0,x0,x7                    //pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
+    asr         x9,x6,#1
+    add         x1,x1,x10                   //pi2_src2 + 4*src_strd2 - 2*wd
+    sub         x20,x9,x5,lsl #2            //4*dst_strd - wd
+    neg         x14, x20
+    add         x2,x2,x14
+    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+
+    bgt         core_loop_8
+    b           end_loops
+
+
+
+// this is only for chroma module with input 4x2
+outer_loop_chroma_8x2:
+    cmp         x9,#0                       //check wd == 0
+    beq         end_loops                   //if equal, then end the function
+    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+    sub         x20,x6,x3,lsl #1            //2*src_strd1 - wd
+    neg         x7, x20
+    sub         x20,x6,x4,lsl #1            //2*src_strd2 - wd
+    neg         x10, x20
+core_loop_chroma_8x2:
+
+    ld1         { v24.8h},[x0],#16          //load and increment the pi2_src1
+    add         x14,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
+    ld1         { v26.8h},[x1],#16          //load and increment the pi2_src2
+    sqadd       v24.8h,v24.8h,v26.8h
+    ld1         { v28.8h},[x11],x3          //load and increment the pi2_src1 ii iteration
+    sqadd       v24.8h,v24.8h,v0.8h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+    ld1         { v30.8h},[x12],x4          //load and increment the pi2_src2 ii iteration
+    ld1         { v16.8h},[x11],x3          //load and increment the pi2_src1 iii iteration
+    sqadd       v22.8h,v28.8h,v30.8h        //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+    sqadd       v22.8h,v22.8h,v0.8h         //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+    sqshrun     v20.8b, v24.8h,#7
+    sqshrun     v21.8b, v22.8h,#7
+    st1         {v20.2s},[x2],#8            //store pu1_dst i iteration
+    st1         {v21.2s},[x14],x5           //store pu1_dst ii iteration
+
+    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+    subs        x9,x9,#8                    //decrement wd by 8 and check for 0
+
+    bgt         core_loop_chroma_8x2        //if greater than 0 repeat the core loop again
+
+end_core_loop_chroma_8x2:
+
+    subs        x8,x8,#2                    //decrement the ht by 2
+
+    add         x0,x0,x7                    //pi2_src1 + 2*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
+    asr         x9,x6,#1
+    add         x1,x1,x10                   //pi2_src2 + 2*src_strd2 - 2*wd
+    sub         x20,x9,x5,lsl #1            //2*dst_strd - wd
+    neg         x14, x20
+    add         x2,x2,x14                   //pu1_dst + 2*dst_strd - wd
+    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+
+    bgt         core_loop_chroma_8x2
+
+    b           end_loops
+
+
+
+
+outer_loop_16:
+    cmp         x9,#0                       //check wd == 0
+    beq         end_loops                   //if equal, then end the function
+    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+    sub         x20,x6,x3,lsl #1            //2*src_strd1 - wd
+    neg         x7, x20
+    mov         x14,#16
+    sub         x10,x14,x5
+    sub         x11,x3,x14
+    sub         x12,x14,x3
+
+    sub         x20,x9,x5,lsl #1            //2*dst_strd - wd
+    neg         x14, x20
+
+
+
+prolog_16:
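+    // the 16-wide path is software-pipelined: prolog_16 issues the first
+    // round of loads and adds, core_loop_16 stores iteration n while loading
+    // iteration n+1, and epilog_16 drains the last iteration in flight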
+
+
+    ld1         { v2.8h},[x0],#16           //load and increment the pi2_src1
+    ld1         { v4.8h},[x1],#16           //load and increment the pi2_src2
+    ld1         { v10.8h},[x0],x11          //load and increment the pi2_src1
+    ld1         { v12.8h},[x1],x11          //load and increment the pi2_src2
+    ld1         { v6.8h},[x0],#16           //load and increment the pi2_src1 ii iteration
+    subs        x9,x9,#16
+    ld1         { v8.8h},[x1],#16           //load and increment the pi2_src2 ii iteration
+    sub         x20,x8,#2
+    csel        x8, x20, x8,eq
+    sqadd       v22.8h,v2.8h,v4.8h
+    ld1         { v14.8h},[x0],x12          //load and increment the pi2_src1 ii iteration
+    sqadd       v28.8h,v10.8h,v12.8h
+    ld1         { v16.8h},[x1],x12          //load and increment the pi2_src2 ii iteration
+    add         x20,x0,x7
+    csel        x0, x20, x0,eq
+    add         x20,x1,x7
+    csel        x1, x20, x1,eq
+    sqadd       v24.8h,v6.8h,v8.8h
+    ld1         { v2.8h},[x0],#16
+    sqadd       v26.8h,v14.8h,v16.8h
+// if the input is chroma with 8x2 block size
+    cmp         x8,#0
+    beq         epilog_16
+
+    ld1         { v4.8h},[x1],#16           //load and increment the pi2_src2
+    sqadd       v22.8h,v22.8h,v0.8h
+    ld1         { v10.8h},[x0],x11          //load and increment the pi2_src1
+    sqadd       v28.8h,v28.8h,v0.8h
+    ld1         { v12.8h},[x1],x11          //load and increment the pi2_src2
+    sqadd       v24.8h,v24.8h,v0.8h
+    ld1         { v6.8h},[x0],#16           //load and increment the pi2_src1 ii iteration
+    sqadd       v30.8h,v26.8h,v0.8h
+    sqshrun     v20.8b, v22.8h,#7
+    ld1         { v8.8h},[x1],#16           //load and increment the pi2_src2 ii iteration
+    sqshrun     v21.8b, v28.8h,#7
+    ld1         { v14.8h},[x0],x12          //load and increment the pi2_src1 ii iteration
+    sqshrun     v26.8b, v24.8h,#7
+    ld1         { v16.8h},[x1],x12          //load and increment the pi2_src2 ii iteration
+    sqshrun     v27.8b, v30.8h,#7
+
+
+
+core_loop_16:
+
+    cmp         x9,#0
+    sqadd       v22.8h,v2.8h,v4.8h
+    asr         x20,x6,#1
+    csel        x9,x20,x9,eq
+    //asreq           x9,x6,#1
+    mov         v20.d[1],v21.d[0]
+    mov         v26.d[1],v27.d[0]
+    st1         { v20.4s},[x2],x5
+    sqadd       v28.8h,v10.8h,v12.8h
+    st1         { v26.4s},[x2],x10
+    add         x20,x2,x14
+    csel        x2, x20, x2,eq
+    sqadd       v24.8h,v6.8h,v8.8h
+    subs        x9,x9,#16
+    add         x20,x0,x7
+    csel        x0, x20, x0,eq
+    sqadd       v26.8h,v14.8h,v16.8h
+
+    add         x20,x1,x7
+    csel        x1, x20, x1,eq
+    sub         x20,x8,#2
+    csel        x8,x20,x8,eq
+    cmp         x8,#0
+    //subeqs           x8,x8,#2                      //decrement the ht by 2
+    beq         epilog_16
+
+
+    sqadd       v22.8h,v22.8h,v0.8h
+    ld1         { v2.8h},[x0],#16           //load and increment the pi2_src1
+    sqadd       v28.8h,v28.8h,v0.8h
+    ld1         { v4.8h},[x1],#16           //load and increment the pi2_src2
+    sqadd       v24.8h,v24.8h,v0.8h
+    ld1         { v10.8h},[x0],x11          //load and increment the pi2_src1
+    sqadd       v30.8h,v26.8h,v0.8h
+    ld1         { v12.8h},[x1],x11          //load and increment the pi2_src2
+    sqshrun     v20.8b, v22.8h,#7
+    ld1         { v6.8h},[x0],#16           //load and increment the pi2_src1 ii iteration
+    sqshrun     v21.8b, v28.8h,#7
+    ld1         { v8.8h},[x1],#16           //load and increment the pi2_src2 ii iteration
+    sqshrun     v26.8b, v24.8h,#7
+    ld1         { v14.8h},[x0],x12          //load and increment the pi2_src1 ii iteration
+    sqshrun     v27.8b, v30.8h,#7
+    ld1         { v16.8h},[x1],x12          //load and increment the pi2_src2 ii iteration
+
+
+    b           core_loop_16
+
+
+epilog_16:
+
+    sqadd       v22.8h,v22.8h,v0.8h
+    sqadd       v28.8h,v28.8h,v0.8h
+    sqadd       v24.8h,v24.8h,v0.8h
+    sqadd       v30.8h,v26.8h,v0.8h
+    sqshrun     v20.8b, v22.8h,#7
+    sqshrun     v21.8b, v28.8h,#7
+    sqshrun     v26.8b, v24.8h,#7
+    sqshrun     v27.8b, v30.8h,#7
+    mov         v20.d[1],v21.d[0]
+    mov         v26.d[1],v27.d[0]
+    st1         { v20.4s},[x2],x5
+    st1         { v26.4s},[x2]
+
+
+
+end_core_loop_16:
+
+
+
+
+
+
+
+
+end_loops:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
diff --git a/common/arm64/ihevc_weighted_pred_uni.s b/common/arm64/ihevc_weighted_pred_uni.s
new file mode 100644
index 0000000..d805230
--- /dev/null
+++ b/common/arm64/ihevc_weighted_pred_uni.s
@@ -0,0 +1,245 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_weighted_pred_uni.s
+//*
+//* @brief
+//*  contains function definitions for weighted prediction used in inter
+//* prediction
+//*
+//* @author
+//*  parthiban v
+//*
+//* @par list of functions:
+//*  - ihevc_weighted_pred_uni()
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*  does uni-weighted prediction on the array pointed by pi2_src and stores
+//* it at the location pointed by pu1_dst. assumptions : the function is
+//* optimized considering the fact that width and height are multiples of 2.
+//*
+//* @par description:
+//*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
+//* + offset
+//*
+//* @param[in] pi2_src
+//*  pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  pointer to the destination
+//*
+//* @param[in] src_strd
+//*  source stride
+//*
+//* @param[in] dst_strd
+//*  destination stride
+//*
+//* @param[in] wgt0
+//*  weight to be multiplied to the source
+//*
+//* @param[in] off0
+//*  offset to be added after rounding and shifting
+//*
+//* @param[in] shift
+//*  (14 - bit depth) + log2_weight_denominator
+//*
+//* @param[in] lvl_shift
+//*  added before shift and offset
+//*
+//* @param[in] ht
+//*  height of the source
+//*
+//* @param[in] wd
+//*  width of the source
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_weighted_pred_uni(word16 *pi2_src,
+//                             uword8 *pu1_dst,
+//                             word32 src_strd,
+//                             word32 dst_strd,
+//                             word32 wgt0,
+//                             word32 off0,
+//                             word32 shift,
+//                             word32 lvl_shift,
+//                             word32 ht,
+//                             word32 wd)
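+
+// a minimal scalar c sketch of the computation above (illustrative only, not
+// part of the build). the assembly folds lvl_shift*wgt0, off0 << shift and
+// the rounding term into a single constant so the inner loop is one
+// multiply-add; clip_u8() is shorthand for clamping to [0, 255]:
+//
+//     word32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift)
+//                          + (1 << (shift - 1));
+//     for(row = 0; row < ht; row++)
+//         for(col = 0; col < wd; col++)
+//         {
+//             word32 tmp = pi2_src[row * src_strd + col] * wgt0 + tmp_lvl_shift;
+//             pu1_dst[row * dst_strd + col] = clip_u8(tmp >> shift);
+//         }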
+
+//**************variables vs registers*****************************************
+//    x0 => *pi2_src
+//    x1 => *pu1_dst
+//    x2 =>  src_strd
+//    x3 =>  dst_strd
+//    x4 =>  wgt0
+//    x5 =>  off0
+//    x6 =>  shift
+//    x7 =>  lvl_shift
+//    x8 =>    ht
+//    x9    =>    wd
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_weighted_pred_uni_av8
+
+.type ihevc_weighted_pred_uni_av8, %function
+
+ihevc_weighted_pred_uni_av8:
+
+
+    ldr         w8,[sp,#0]
+    ldr         w9,[sp,#8]
+
+    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    stp         x21, x22,[sp,#-16]!
+
+    mov         x15,x4 // wgt0
+    mov         x16,x5 // off0
+    mov         x17,x6 // shift
+    mov         x19,x7 // lvl_shift
+    mov         x20,x8 // ht
+    mov         x21,x9 // wd
+
+    mov         x4,x15                      //load wgt0
+    mov         x7,x19                      //load lvl_shift
+    mov         x11,#1
+    mov         x5,x16                      //load off0
+    mul         x10, x7, x4                 //lvl_shift * wgt0
+    mov         x6,x17                      //load shift
+    mov         x8,x20                      //load ht
+    lsl         x22,x5,x6
+    add         x10,x10,x22                 //lvl_shift * wgt0 + (off0 << shift)
+    mov         x9,x21                      //load wd
+    sub         x12,x6,#1
+    mov         v0.4h[0], w4                //moved for scalar multiplication
+    lsl         x2,x2,#1
+    dup         v28.4s,w6                   //vmovq_n_s32(tmp_shift)
+    lsl         x22,x11,x12
+    add         x10,x10,x22                 //tmp_lvl_shift += (1 << (shift - 1))
+    dup         v30.4s,w10                  //vmovq_n_s32(tmp_lvl_shift)
+    neg         v28.4s, v28.4s
+    lsl         x4,x9,#1
+
+    cmp         x8,#0                       //check ht == 0
+    beq         end_loops                   //if equal, then end the function
+
+outer_loop:
+    cmp         x9,#0                       //check wd == 0
+    beq         end_loops                   //if equal, then end the function
+
+core_loop:
+    add         x5,x0,x2                    //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+    add         x6,x1,x3                    //pu1_dst_tmp = pu1_dst + dst_strd
+    ld1         {v1.4h},[x0],#8             //load and increment the pi2_src
+    ld1         {v2.4h},[x5],x2             //load and increment the pi2_src_tmp ii iteration
+    smull       v4.4s, v1.4h, v0.4h[0]      //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
+
+    add         v4.4s,  v4.4s ,  v30.4s     //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
+    ld1         {v8.4h},[x5],x2             //load and increment the pi2_src iii iteration
+
+    smull       v6.4s, v2.4h, v0.4h[0]      //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
+    ld1         {v9.4h},[x5],x2             //load and increment the pi2_src_tmp iv iteration
+
+    sshl        v4.4s,v4.4s,v28.4s
+    //vshl.s32    q2,q2,q14                    //vshlq_s32(i4_tmp1_t, tmp_shift_t)
+    add         v6.4s,  v6.4s ,  v30.4s     //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration
+
+    smull       v10.4s, v8.4h, v0.4h[0]     //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
+    sqxtun      v4.4h, v4.4s                //vqmovun_s32(sto_res_tmp1)
+
+    add         v10.4s,  v10.4s ,  v30.4s   //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
+    //mov v5, v4                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+
+    sshl        v6.4s,v6.4s,v28.4s
+    //vshl.s32    q3,q3,q14                    //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration
+
+    smull       v12.4s, v9.4h, v0.4h[0]     //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
+    uqxtn       v4.8b,  v4.8h               //vqmovn_u16(sto_res_tmp3)
+
+    sshl        v10.4s,v10.4s,v28.4s
+    //vshl.s32    q5,q5,q14                    //vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
+    sqxtun      v6.4h, v6.4s                //vqmovun_s32(sto_res_tmp1) ii iteration
+
+    add         v12.4s,  v12.4s ,  v30.4s   //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
+    //mov v7, v6                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
+
+    sqxtun      v10.4h, v10.4s              //vqmovun_s32(sto_res_tmp1) iii iteration
+
+    sshl        v12.4s,v12.4s,v28.4s
+    //vshl.s32    q6,q6,q14                    //vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
+    st1         {v4.s}[0],[x1],#4           //store pu1_dst i iteration
+    //mov v11, v10                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
+
+    uqxtn       v6.8b,  v6.8h               //vqmovn_u16(sto_res_tmp3) ii iteration
+    st1         {v6.s}[0],[x6],x3           //store pu1_dst ii iteration
+
+    uqxtn       v10.8b,  v10.8h             //vqmovn_u16(sto_res_tmp3) iii iteration
+    sqxtun      v12.4h, v12.4s              //vqmovun_s32(sto_res_tmp1) iv iteration
+
+    //mov v13, v12                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
+    st1         {v10.s}[0],[x6],x3          //store pu1_dst i iteration iii iteration
+    uqxtn       v12.8b,  v12.8h             //vqmovn_u16(sto_res_tmp3) iv iteration
+
+    subs        x9,x9,#4                    //decrement wd by 4 and check for 0
+    st1         {v12.s}[0],[x6],x3          //store pu1_dst iv iteration
+    bgt         core_loop                   //if greater than 0 repeat the core loop again
+
+end_core_loop:
+    sub         x22,x4,x2,lsl #2            //4*src_strd - 2*wd
+    neg         x11, x22
+    subs        x8,x8,#4                    //decrement the ht by 4
+    add         x0,x0,x11                   //pi2_src + 4*src_strd - 2*wd(since pi2_src is 16 bit pointer double the increment with double the wd decrement)
+    asr         x9,x4,#1
+    sub         x22,x9,x3,lsl #2            //4*dst_strd - wd
+    neg         x12, x22
+    add         x1,x1,x12                   //pu1_dst + 4*dst_strd - wd
+    bgt         core_loop                   //if ht is greater than 0, go back to core_loop
+
+end_loops:
+    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
+    ldp         x21, x22,[sp],#16
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
diff --git a/common/ihevc_buf_mgr.c b/common/ihevc_buf_mgr.c
new file mode 100644
index 0000000..b6e4f2a
--- /dev/null
+++ b/common/ihevc_buf_mgr.c
@@ -0,0 +1,402 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_buf_mgr.c
+*
+* @brief
+*  Contains function definitions for buffer management
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*   - ihevc_buf_mgr_init()
+*   - ihevc_buf_mgr_add()
+*   - ihevc_buf_mgr_get_next_free()
+*   - ihevc_buf_mgr_check_free()
+*   - ihevc_buf_mgr_release()
+*   - ihevc_buf_mgr_set_status()
+*   - ihevc_buf_mgr_get_status()
+*   - ihevc_buf_mgr_get_buf()
+*   - ihevc_buf_mgr_get_num_active_buf()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#include <stdlib.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_buf_mgr.h"
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*      Buffer manager initialization function.
+*
+* @par Description:
+*    Initializes the buffer manager structure
+*
+* @param[in] ps_buf_mgr
+*  Pointer to the buffer manager
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_buf_mgr_init(
+                buf_mgr_t *ps_buf_mgr)
+{
+    WORD32 id;
+
+    ps_buf_mgr->u4_max_buf_cnt = BUF_MGR_MAX_CNT;
+    ps_buf_mgr->u4_active_buf_cnt = 0;
+
+    for(id = 0; id < BUF_MGR_MAX_CNT; id++)
+    {
+        ps_buf_mgr->au4_status[id] = 0;
+        ps_buf_mgr->apv_ptr[id] = NULL;
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Adds a buffer to the buffer manager.
+*
+* @par Description:
+*     Adds a buffer to the buffer manager if it is not already  present and
+*   increments the  active buffer count
+*
+* @param[in] ps_buf_mgr
+*  Pointer to the buffer manager
+*
+* @param[in] pv_ptr
+*  Pointer to the buffer to be added
+*
+* @returns  Returns 0 on success, -1 otherwise
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+WORD32 ihevc_buf_mgr_add(
+                buf_mgr_t *ps_buf_mgr,
+                void *pv_ptr,
+                WORD32 buf_id)
+{
+
+    /* Check if buffer ID is within allowed range */
+    if(buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt)
+    {
+        return (-1);
+    }
+
+    /* Check if the current ID is being used to hold some other buffer */
+    if((ps_buf_mgr->apv_ptr[buf_id] != NULL) &&
+       (ps_buf_mgr->apv_ptr[buf_id] != pv_ptr))
+    {
+        return (-1);
+    }
+    ps_buf_mgr->apv_ptr[buf_id] = pv_ptr;
+
+    return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Gets the next free buffer.
+*
+* @par Description:
+*     Returns the next free buffer available and sets the  corresponding status
+*   to DEC
+*
+* @param[in] ps_buf_mgr
+*  Pointer to the buffer manager
+*
+* @param[in] pi4_buf_id
+*  Pointer to the id of the free buffer
+*
+* @returns  Pointer to the free buffer
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void* ihevc_buf_mgr_get_next_free(
+                buf_mgr_t *ps_buf_mgr,
+                WORD32 *pi4_buf_id)
+{
+    WORD32 id;
+    void *pv_ret_ptr;
+
+    pv_ret_ptr = NULL;
+    for(id = 0; id < (WORD32)ps_buf_mgr->u4_max_buf_cnt; id++)
+    {
+        /* Check if the buffer is non-null and status is zero */
+        if((ps_buf_mgr->au4_status[id] == 0) && (ps_buf_mgr->apv_ptr[id]))
+        {
+            *pi4_buf_id = id;
+            /* DEC is set to 1 */
+            ps_buf_mgr->au4_status[id] = 1;
+            pv_ret_ptr = ps_buf_mgr->apv_ptr[id];
+            break;
+        }
+    }
+
+    return pv_ret_ptr;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*      Checks the buffer manager for free buffers available.
+*
+* @par Description:
+*  Checks if there are any free buffers available
+*
+* @param[in] ps_buf_mgr
+*  Pointer to the buffer manager
+*
+* @returns  Returns 1 if a free buffer is available, 0 otherwise
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+WORD32 ihevc_buf_mgr_check_free(
+                buf_mgr_t *ps_buf_mgr)
+{
+    UWORD32 id;
+
+    for(id = 0; id < ps_buf_mgr->u4_max_buf_cnt; id++)
+    {
+        if((ps_buf_mgr->au4_status[id] == 0) &&
+           (ps_buf_mgr->apv_ptr[id]))
+        {
+            return 1;
+        }
+    }
+
+    return 0;
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Resets the status bits.
+*
+* @par Description:
+*     resets the status bits that the mask contains (status  corresponding to
+*    the id)
+*
+* @param[in] ps_buf_mgr
+*  Pointer to the buffer manager
+*
+* @param[in] buf_id
+*  ID of the buffer status to be released
+*
+* @param[in] mask
+*  Contains the bits that are to be reset
+*
+* @returns  0 if success, -1 otherwise
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+WORD32 ihevc_buf_mgr_release(
+                buf_mgr_t *ps_buf_mgr,
+                WORD32 buf_id,
+                UWORD32 mask)
+{
+    /* If the given id is pointing to an id which is not yet added */
+    if(buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt)
+    {
+        return (-1);
+    }
+
+    ps_buf_mgr->au4_status[buf_id] &= ~mask;
+
+    /* If both the REF and DISP are zero, DEC is set to zero */
+    if(ps_buf_mgr->au4_status[buf_id] == 1)
+    {
+        ps_buf_mgr->au4_status[buf_id] = 0;
+    }
+
+    return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*      Sets the status bit.
+*
+* @par Description:
+*     sets the status bits that the mask contains (status  corresponding to the
+*    id)
+*
+*
+* @param[in] ps_buf_mgr
+*  Pointer to the buffer manager
+*
+* @param[in] buf_id
+*  ID of the buffer whose status needs to be modified
+*
+*
+* @param[in] mask
+*  Contains the bits that are to be set
+*
+* @returns  0 if success, -1 otherwise
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+WORD32 ihevc_buf_mgr_set_status(
+                buf_mgr_t *ps_buf_mgr,
+                WORD32 buf_id,
+                UWORD32 mask)
+{
+    if(buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt)
+    {
+        return (-1);
+    }
+
+
+    if((ps_buf_mgr->au4_status[buf_id] & mask) != 0)
+    {
+        return (-1);
+    }
+
+    ps_buf_mgr->au4_status[buf_id] |= mask;
+    return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Returns the status of the buffer.
+*
+* @par Description:
+*  Returns the status of the buffer corresponding to the id
+*
+* @param[in] ps_buf_mgr
+*  Pointer to the buffer manager
+*
+* @param[in] buf_id
+*  ID of the buffer status required
+*
+* @returns  Status of the buffer corresponding to the id
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+UWORD32 ihevc_buf_mgr_get_status(
+                buf_mgr_t *ps_buf_mgr,
+                WORD32 buf_id)
+{
+    return ps_buf_mgr->au4_status[buf_id];
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*      Gets the buffer from the buffer manager
+*
+* @par Description:
+*        Returns the pointer to the buffer corresponding to the id
+*
+* @param[in] ps_buf_mgr
+*  Pointer to the buffer manager
+*
+* @param[in] buf_id
+*  ID of the buffer required
+*
+* @returns  Pointer to the buffer required
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void* ihevc_buf_mgr_get_buf(
+                buf_mgr_t *ps_buf_mgr,
+                WORD32 buf_id)
+{
+    return ps_buf_mgr->apv_ptr[buf_id];
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*        Gets the number of buffers in the buffer manager
+*
+* @par Description:
+*      Returns the number of buffers tracked by the buffer manager
+*    (u4_max_buf_cnt)
+*
+* @param[in] ps_buf_mgr
+*  Pointer to the buffer manager
+*
+* @returns  Number of buffers tracked by the buffer manager
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+UWORD32 ihevc_buf_mgr_get_num_active_buf(
+                buf_mgr_t *ps_buf_mgr)
+{
+    return ps_buf_mgr->u4_max_buf_cnt;
+}
diff --git a/common/ihevc_buf_mgr.h b/common/ihevc_buf_mgr.h
new file mode 100644
index 0000000..7801a5c
--- /dev/null
+++ b/common/ihevc_buf_mgr.h
@@ -0,0 +1,113 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_buf_mgr.h
+*
+* @brief
+*  Function declarations used for buffer management
+*
+* @author
+*  Srinivas T
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _BUF_MGR_H_
+#define _BUF_MGR_H_
+
+#define BUF_MGR_MAX_CNT 64
+
+#define BUF_MGR_DEC         1
+#define BUF_MGR_REF         (1 << 1)
+#define BUF_MGR_DISP        (1 << 2)
+
+typedef struct
+{
+    /**
+     * max_buf_cnt
+     */
+    UWORD32 u4_max_buf_cnt;
+
+    /**
+     * active_buf_cnt
+     */
+    UWORD32 u4_active_buf_cnt;
+    /**
+     *  au4_status[BUF_MGR_MAX_CNT]
+     */
+    UWORD32 au4_status[BUF_MGR_MAX_CNT];
+    /* The last three bit of status are:    */
+    /* Bit 0 - DEC                          */
+    /* Bit 1 - REF                          */
+    /* Bit 2 - DISP                         */
+
+    void    *apv_ptr[BUF_MGR_MAX_CNT];
+}buf_mgr_t;
+
+// initializes the buffer manager structure
+void ihevc_buf_mgr_init(
+                buf_mgr_t *ps_buf_mgr);
+
+// Add buffer to buffer manager. 0: success, -1: fail (buf_id out of range or already holding a different buffer)
+WORD32 ihevc_buf_mgr_add(
+                buf_mgr_t *ps_buf_mgr,
+                void *pv_ptr,
+                WORD32 buf_id);
+
+// this function will set the buffer status to DEC
+void* ihevc_buf_mgr_get_next_free(
+                buf_mgr_t *ps_buf_mgr,
+                WORD32 *pi4_id);
+
+// this function will check if there are any free buffers
+WORD32 ihevc_buf_mgr_check_free(
+                buf_mgr_t *ps_buf_mgr);
+
+// mask will have who released it: DISP:REF:DEC
+WORD32 ihevc_buf_mgr_release(
+                buf_mgr_t *ps_buf_mgr,
+                WORD32 id,
+                UWORD32 mask);
+
+// sets the status to one or all of DISP:REF:DEC
+WORD32 ihevc_buf_mgr_set_status(
+                buf_mgr_t *ps_buf_mgr,
+                WORD32 id,
+                UWORD32 mask);
+
+// Gets status of the buffer
+UWORD32 ihevc_buf_mgr_get_status(
+                buf_mgr_t *ps_buf_mgr,
+                WORD32 id);
+
+// pass the ID - buffer will be returned
+void* ihevc_buf_mgr_get_buf(
+                buf_mgr_t *ps_buf_mgr,
+                WORD32 id);
+
+// returns the number of buffers tracked by the buffer manager
+UWORD32 ihevc_buf_mgr_get_num_active_buf(
+                buf_mgr_t *ps_buf_mgr);
+
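+/* Illustrative call sequence (a sketch, not part of the library; pv_pic_mem
+ * stands for a caller-allocated picture buffer):
+ *
+ *     buf_mgr_t s_mgr;
+ *     WORD32 buf_id;
+ *     void *pv_buf;
+ *
+ *     ihevc_buf_mgr_init(&s_mgr);
+ *     ihevc_buf_mgr_add(&s_mgr, pv_pic_mem, 0);              // register a buffer
+ *     pv_buf = ihevc_buf_mgr_get_next_free(&s_mgr, &buf_id); // status -> DEC
+ *     ihevc_buf_mgr_set_status(&s_mgr, buf_id, BUF_MGR_REF); // also held as reference
+ *     ihevc_buf_mgr_release(&s_mgr, buf_id, BUF_MGR_REF);    // REF dropped; with only
+ *                                                            // DEC set, the status is
+ *                                                            // cleared and the buffer
+ *                                                            // is free again
+ */
+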
+
+
+#endif  //_BUF_MGR_H_
diff --git a/common/ihevc_cabac_tables.c b/common/ihevc_cabac_tables.c
new file mode 100644
index 0000000..fb10f3e
--- /dev/null
+++ b/common/ihevc_cabac_tables.c
@@ -0,0 +1,3523 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+
+/**
+******************************************************************************
+* @file
+*  ihevc_cabac_tables.c
+*
+* @brief
+*  This file contains HEVC cabac tables for init contexts, rlps and
+*  cabac state transitions
+*
+* @author
+*   Ittiam
+*
+* @par List of Tables
+*   - gau1_ihevc_cabac_rlps[]
+*   - gau1_ihevc_next_state[]
+*   - gau1_ihevc_cab_ctxts[]
+*
+******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include "ihevc_typedefs.h"
+#include "ihevc_cabac_tables.h"
+
+
+/**
+ ******************************************************************************
+ * @brief  Table for rangeTabLPS depending on pStateIdx and qCodIRangeIdx
+ * input   : pStateIdx(0-63) and qCodIRangeIdx(0-3) [(Range >> 6) & 0x3]
+ * output  : RLps
+ *
+ * @remarks See Table 9-40 of HEVC spec for rangeTabLPS
+ ******************************************************************************
+ */
+const UWORD8 gau1_ihevc_cabac_rlps[64][4] =
+{
+    { 128,  176,  208,  240 },
+    { 128,  167,  197,  227 },
+    { 128,  158,  187,  216 },
+    { 123,  150,  178,  205 },
+    { 116,  142,  169,  195 },
+    { 111,  135,  160,  185 },
+    { 105,  128,  152,  175 },
+    { 100,  122,  144,  166 },
+    {  95,  116,  137,  158 },
+    {  90,  110,  130,  150 },
+    {  85,  104,  123,  142 },
+    {  81,   99,  117,  135 },
+    {  77,   94,  111,  128 },
+    {  73,   89,  105,  122 },
+    {  69,   85,  100,  116 },
+    {  66,   80,   95,  110 },
+    {  62,   76,   90,  104 },
+    {  59,   72,   86,   99 },
+    {  56,   69,   81,   94 },
+    {  53,   65,   77,   89 },
+    {  51,   62,   73,   85 },
+    {  48,   59,   69,   80 },
+    {  46,   56,   66,   76 },
+    {  43,   53,   63,   72 },
+    {  41,   50,   59,   69 },
+    {  39,   48,   56,   65 },
+    {  37,   45,   54,   62 },
+    {  35,   43,   51,   59 },
+    {  33,   41,   48,   56 },
+    {  32,   39,   46,   53 },
+    {  30,   37,   43,   50 },
+    {  29,   35,   41,   48 },
+    {  27,   33,   39,   45 },
+    {  26,   31,   37,   43 },
+    {  24,   30,   35,   41 },
+    {  23,   28,   33,   39 },
+    {  22,   27,   32,   37 },
+    {  21,   26,   30,   35 },
+    {  20,   24,   29,   33 },
+    {  19,   23,   27,   31 },
+    {  18,   22,   26,   30 },
+    {  17,   21,   25,   28 },
+    {  16,   20,   23,   27 },
+    {  15,   19,   22,   25 },
+    {  14,   18,   21,   24 },
+    {  14,   17,   20,   23 },
+    {  13,   16,   19,   22 },
+    {  12,   15,   18,   21 },
+    {  12,   14,   17,   20 },
+    {  11,   14,   16,   19 },
+    {  11,   13,   15,   18 },
+    {  10,   12,   15,   17 },
+    {  10,   12,   14,   16 },
+    {   9,   11,   13,   15 },
+    {   9,   11,   12,   14 },
+    {   8,   10,   12,   14 },
+    {   8,    9,   11,   13 },
+    {   7,    9,   11,   12 },
+    {   7,    9,   10,   12 },
+    {   7,    8,   10,   11 },
+    {   6,    8,    9,   11 },
+    {   6,    7,    9,   10 },
+    {   6,    7,    8,    9 },
+    {   2,    2,    2,    2 }
+};
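+
+/*
+ * Illustrative lookup (a sketch of how a binary arithmetic decoder consumes
+ * the table, not the decoder's actual engine):
+ *
+ *     UWORD32 rlps = gau1_ihevc_cabac_rlps[pstate_idx][(u4_range >> 6) & 0x3];
+ *     // MPS decoded: u4_range -= rlps;   LPS decoded: u4_range = rlps;
+ */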
+
+
+/**
+ ******************************************************************************
+ * @brief  probability + MPS state transition tables based on current state and bin
+ * input  : curpState[bits7-2]  | curMPS[bit1] | decodedBin[bit0]
+ * output : nextpState[bits6-1] | nextMPS[bit0]
+ * @remarks Modified form of Table-9-41 State Transition table in HEVC spec
+ ******************************************************************************
+ */
+const UWORD8 gau1_ihevc_next_state[64 * 2 * 2] =
+{
+/*****************************************************************************/
+/*  m=0,b=0 | m=0,b=1 | m=1,b=0 | m=1,b=1                                    */
+/*****************************************************************************/
+      2,    1,    0,    3, /* mps reversal for m=0,b=1 / m=1,b=0 */
+      4,    0,    1,    5,
+      6,    2,    3,    7,
+      8,    4,    5,    9,
+     10,    4,    5,   11,
+     12,    8,    9,   13,
+     14,    8,    9,   15,
+     16,   10,   11,   17,
+     18,   12,   13,   19,
+     20,   14,   15,   21,
+     22,   16,   17,   23,
+     24,   18,   19,   25,
+     26,   18,   19,   27,
+     28,   22,   23,   29,
+     30,   22,   23,   31,
+     32,   24,   25,   33,
+     34,   26,   27,   35,
+     36,   26,   27,   37,
+     38,   30,   31,   39,
+     40,   30,   31,   41,
+     42,   32,   33,   43,
+     44,   32,   33,   45,
+     46,   36,   37,   47,
+     48,   36,   37,   49,
+     50,   38,   39,   51,
+     52,   38,   39,   53,
+     54,   42,   43,   55,
+     56,   42,   43,   57,
+     58,   44,   45,   59,
+     60,   44,   45,   61,
+     62,   46,   47,   63,
+     64,   48,   49,   65,
+     66,   48,   49,   67,
+     68,   50,   51,   69,
+     70,   52,   53,   71,
+     72,   52,   53,   73,
+     74,   54,   55,   75,
+     76,   54,   55,   77,
+     78,   56,   57,   79,
+     80,   58,   59,   81,
+     82,   58,   59,   83,
+     84,   60,   61,   85,
+     86,   60,   61,   87,
+     88,   60,   61,   89,
+     90,   62,   63,   91,
+     92,   64,   65,   93,
+     94,   64,   65,   95,
+     96,   66,   67,   97,
+     98,   66,   67,   99,
+    100,   66,   67,  101,
+    102,   68,   69,  103,
+    104,   68,   69,  105,
+    106,   70,   71,  107,
+    108,   70,   71,  109,
+    110,   70,   71,  111,
+    112,   72,   73,  113,
+    114,   72,   73,  115,
+    116,   72,   73,  117,
+    118,   74,   75,  119,
+    120,   74,   75,  121,
+    122,   74,   75,  123,
+    124,   76,   77,  125,
+    124,   76,   77,  125,
+    126,  126,  127,  127
+};
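+
+/*
+ * Illustrative transition (a sketch): with a context packed as
+ * (pState << 1) | MPS, the packed context after decoding a bin is
+ *
+ *     ctxt = gau1_ihevc_next_state[(ctxt << 1) | bin];
+ */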
+
+
+/*
+******************************************************************************
+* As per the HEVC standard, the CABAC init values are generated using the following logic
+* (ref: section 9.3.1.1 of JCTVC-J1003_d7_DIS)
+* From the 8 bit table entry initValue, the two 4 bit variables slopeIdx and
+* intersecIdx are derived according to the following pseudo-code process
+* slopeIdx = initValue >> 4
+* intersecIdx = initValue & 15
+*
+* The slope m and the intersection n are derived from the indices as follows:
+* m = slopeIdx*5 - 45
+* n = ( intersecIdx << 3 ) - 16
+*
+* The two values assigned to pStateIdx and valMPS for the initialization
+* are derived from SliceQPY, which is derived in Equation 7-35.
+*
+* Given the variables m and n, the initialization is specified by the
+* following pseudo-code process:
+*
+* preCtxState = Clip3( 1, 126, ( ( m * Clip3( 0, 51, SliceQPY ) ) >> 4 ) + n )
+* valMPS = ( preCtxState  <=  63) ? 0 : 1
+* pStateIdx = valMPS ? (preCtxState - 64) : (63 - preCtxState)
+******************************************************************************
+*/
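+
+/*
+ * Illustrative derivation of one packed context entry (a sketch; the decoder
+ * reads the precomputed gau1_ihevc_cab_ctxts[] below instead of running this):
+ *
+ *     WORD32 m    = (init_value >> 4) * 5 - 45;
+ *     WORD32 n    = ((init_value & 15) << 3) - 16;
+ *     WORD32 pre  = Clip3(1, 126, ((m * Clip3(0, 51, slice_qp)) >> 4) + n);
+ *     WORD32 mps  = (pre <= 63) ? 0 : 1;
+ *     WORD32 ctxt = ((mps ? (pre - 64) : (63 - pre)) << 1) | mps;
+ *     // packing: pState in bits[1-6], MPS in lsb
+ */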
+
+/**
+ ******************************************************************************
+ * @brief  Init context tables for all combinations of qp and cabac_init_idc
+ * @remarks Packing format MPS in lsb and pState in bits[1-6]
+ ******************************************************************************
+ */
+
+const UWORD8 gau1_ihevc_cab_ctxts[IHEVC_NUM_CAB_IDC][IHEVC_MAX_QP][IHEVC_CAB_CTXT_END] =
+{
+    {
+        {
+            /* Context Tables for init_idc = 0, qp =  0 */
+
+              14,   30,   17,   49,   49,    1,   81,   81,   81,    1,
+               1,   81,   30,   81,   81,   81,   30,   81,   81,   81,
+              81,   81,   81,   81,   81,   81,   81,   81,   81,   81,
+              81,   14,    1,    1,   81,   49,   65,    1,   62,    1,
+              17,   17,   65,   65,   33,   49,   33,   14,   49,   81,
+              33,   49,   81,   81,   81,   81,   81,   33,   17,   81,
+              65,   65,   33,   49,   33,   14,   49,   81,   33,   49,
+              81,   81,   81,   81,   81,   33,   17,   81,   17,   17,
+              62,   49,   81,   81,   49,   65,   65,   65,   33,   33,
+              33,   17,   49,   49,  110,   14,   49,   17,   49,   49,
+             110,   14,   49,   17,   49,   49,  110,   14,   49,   33,
+              17,   62,   62,   30,   30,   30,   30,   14,   30,   17,
+              81,   30,   17,   81,   33,   33,   14,    1,   33,   30,
+               1,   17,   14,    1,   78,   33,   17,   17,    1,   30,
+              33,  110,   62,   62,   33,  110,    1,   78,    1,   14,
+              30,   46,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp =  1 */
+
+              14,   30,   15,   47,   49,    1,   83,   83,   83,    1,
+               1,   83,   30,   83,   83,   83,   30,   77,   83,   83,
+              83,   83,   83,   83,   83,   83,   83,   83,   83,   83,
+              83,   14,    0,    0,   79,   47,   61,    0,   62,    1,
+              15,   15,   63,   63,   31,   47,   31,   14,   47,   79,
+              31,   47,   79,   79,   79,   79,   77,   31,   15,   77,
+              63,   63,   31,   47,   31,   14,   47,   79,   31,   47,
+              79,   79,   79,   79,   77,   31,   15,   77,   13,   17,
+              64,   47,   79,   79,   47,   63,   63,   61,   31,   31,
+              31,   15,   47,   47,  110,   14,   47,   15,   47,   47,
+             110,   14,   47,   15,   47,   47,  110,   14,   47,   31,
+              15,   62,   62,   30,   32,   30,   32,   14,   32,   15,
+              79,   32,   15,   79,   31,   29,   16,    0,   31,   30,
+               0,   15,   14,    2,   78,   29,   15,   15,    0,   30,
+              31,  110,   62,   62,   31,  108,    0,   78,    0,   14,
+              32,   46,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp =  2 */
+
+              14,   28,   15,   47,   49,    1,   87,   87,   87,    1,
+               1,   87,   28,   87,   87,   87,   28,   73,   87,   87,
+              87,   87,   87,   87,   87,   87,   87,   87,   87,   87,
+              87,   14,    0,    0,   77,   47,   59,    0,   60,    1,
+              15,   15,   61,   61,   29,   45,   31,   14,   45,   77,
+              31,   45,   77,   79,   77,   77,   73,   29,   13,   73,
+              61,   61,   29,   45,   31,   14,   45,   77,   31,   45,
+              77,   79,   77,   77,   73,   29,   13,   73,   11,   17,
+              64,   47,   77,   77,   45,   61,   61,   59,   29,   29,
+              29,   13,   45,   47,  108,   14,   45,   13,   45,   47,
+             108,   14,   45,   13,   45,   47,  108,   14,   45,   31,
+              15,   60,   60,   30,   32,   30,   32,   14,   32,   15,
+              77,   32,   15,   77,   31,   27,   16,    0,   31,   30,
+               0,   15,   14,    6,   78,   27,   15,   13,    2,   30,
+              31,  108,   62,   60,   31,  104,    2,   76,    0,   14,
+              32,   46,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp =  3 */
+
+              14,   26,   15,   47,   49,    1,   91,   91,   91,    1,
+               1,   91,   28,   91,   91,   91,   28,   69,   91,   91,
+              91,   91,   91,   91,   91,   91,   91,   91,   91,   91,
+              91,   14,    0,    0,   75,   47,   57,    0,   60,    1,
+              15,   15,   59,   59,   29,   45,   31,   14,   45,   77,
+              31,   43,   75,   79,   77,   75,   71,   27,   13,   69,
+              59,   59,   29,   45,   31,   14,   45,   77,   31,   43,
+              75,   79,   77,   75,   71,   27,   13,   69,    9,   17,
+              64,   47,   75,   75,   45,   59,   59,   57,   29,   27,
+              29,   11,   45,   47,  108,   14,   45,   11,   45,   47,
+             108,   14,   45,   11,   45,   47,  108,   14,   45,   31,
+              15,   60,   60,   30,   32,   30,   32,   14,   32,   15,
+              75,   32,   15,   75,   31,   25,   16,    0,   31,   30,
+               0,   15,   14,    8,   78,   25,   15,   11,    2,   30,
+              31,  108,   62,   60,   31,  102,    2,   74,    0,   14,
+              32,   46,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp =  4 */
+
+              14,   24,   13,   45,   49,    1,   95,   95,   95,    1,
+               1,   95,   26,   95,   95,   95,   26,   65,   95,   95,
+              95,   95,   95,   95,   95,   95,   95,   95,   95,   95,
+              95,   14,    2,    2,   73,   45,   55,    2,   58,    1,
+              13,   13,   57,   57,   27,   43,   29,   14,   43,   75,
+              29,   41,   73,   77,   75,   73,   67,   25,   11,   65,
+              57,   57,   27,   43,   29,   14,   43,   75,   29,   41,
+              73,   77,   75,   73,   67,   25,   11,   65,    7,   19,
+              66,   45,   73,   73,   43,   57,   57,   55,   27,   25,
+              27,    9,   43,   45,  106,   14,   43,    9,   43,   45,
+             106,   14,   43,    9,   43,   45,  106,   14,   43,   29,
+              13,   58,   58,   30,   34,   30,   34,   14,   34,   13,
+              73,   34,   13,   73,   29,   23,   18,    2,   29,   30,
+               2,   13,   14,   12,   78,   23,   13,    9,    4,   30,
+              29,  106,   60,   58,   29,   98,    4,   72,    2,   14,
+              34,   44,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp =  5 */
+
+              14,   22,   13,   45,   49,    1,   99,   99,   99,    1,
+               1,   99,   24,   99,   99,   99,   24,   61,   99,   99,
+              99,   99,   99,   99,   99,   99,   99,   99,   99,   99,
+              99,   14,    2,    2,   71,   45,   51,    2,   56,    1,
+              13,   13,   55,   55,   25,   41,   29,   14,   41,   73,
+              29,   39,   71,   77,   73,   71,   65,   23,    9,   61,
+              55,   55,   25,   41,   29,   14,   41,   73,   29,   39,
+              71,   77,   73,   71,   65,   23,    9,   61,    3,   19,
+              66,   45,   71,   71,   41,   55,   55,   51,   25,   23,
+              25,    7,   41,   45,  104,   14,   41,    7,   41,   45,
+             104,   14,   41,    7,   41,   45,  104,   14,   41,   29,
+              13,   56,   56,   30,   34,   30,   34,   14,   34,   13,
+              71,   34,   13,   71,   29,   19,   18,    2,   29,   30,
+               2,   13,   14,   14,   78,   19,   13,    7,    6,   30,
+              29,  104,   60,   56,   29,   96,    6,   70,    2,   14,
+              34,   44,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp =  6 */
+
+              14,   20,   13,   45,   49,    1,  103,  103,  103,    1,
+               1,  103,   24,  103,  103,  103,   24,   57,  103,  103,
+             103,  103,  103,  103,  103,  103,  103,  103,  103,  103,
+             103,   14,    2,    2,   69,   45,   49,    2,   56,    1,
+              13,   13,   53,   53,   25,   41,   29,   14,   41,   73,
+              29,   37,   69,   77,   73,   69,   61,   21,    9,   57,
+              53,   53,   25,   41,   29,   14,   41,   73,   29,   37,
+              69,   77,   73,   69,   61,   21,    9,   57,    1,   19,
+              66,   45,   69,   69,   41,   53,   53,   49,   25,   21,
+              25,    5,   41,   45,  104,   14,   41,    5,   41,   45,
+             104,   14,   41,    5,   41,   45,  104,   14,   41,   29,
+              13,   56,   56,   30,   34,   30,   34,   14,   34,   13,
+              69,   34,   13,   69,   29,   17,   18,    2,   29,   30,
+               2,   13,   14,   18,   78,   17,   13,    5,    6,   30,
+              29,  104,   60,   56,   29,   92,    6,   68,    2,   14,
+              34,   44,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp =  7 */
+
+              14,   18,   11,   43,   49,    1,  107,  107,  107,    1,
+               1,  107,   22,  107,  107,  107,   22,   53,  107,  107,
+             107,  107,  107,  107,  107,  107,  107,  107,  107,  107,
+             107,   14,    4,    4,   67,   43,   47,    4,   54,    1,
+              11,   11,   51,   51,   23,   39,   27,   14,   39,   71,
+              27,   35,   67,   75,   71,   67,   59,   19,    7,   53,
+              51,   51,   23,   39,   27,   14,   39,   71,   27,   35,
+              67,   75,   71,   67,   59,   19,    7,   53,    0,   21,
+              68,   43,   67,   67,   39,   51,   51,   47,   23,   19,
+              23,    3,   39,   43,  102,   14,   39,    3,   39,   43,
+             102,   14,   39,    3,   39,   43,  102,   14,   39,   27,
+              11,   54,   54,   30,   36,   30,   36,   14,   36,   11,
+              67,   36,   11,   67,   27,   15,   20,    4,   27,   30,
+               4,   11,   14,   20,   78,   15,   11,    3,    8,   30,
+              27,  102,   58,   54,   27,   90,    8,   66,    4,   14,
+              36,   42,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp =  8 */
+
+              14,   16,   11,   43,   49,    1,  111,  111,  111,    1,
+               1,  111,   20,  111,  111,  111,   20,   51,  111,  111,
+             111,  111,  111,  111,  111,  111,  111,  111,  111,  111,
+             111,   14,    4,    4,   65,   43,   45,    4,   52,    1,
+              11,   11,   49,   49,   23,   39,   27,   14,   39,   71,
+              27,   33,   65,   75,   71,   65,   55,   17,    7,   51,
+              49,   49,   23,   39,   27,   14,   39,   71,   27,   33,
+              65,   75,   71,   65,   55,   17,    7,   51,    2,   21,
+              68,   43,   65,   65,   39,   49,   49,   45,   23,   17,
+              23,    1,   39,   43,  100,   14,   39,    1,   39,   43,
+             100,   14,   39,    1,   39,   43,  100,   14,   39,   27,
+              11,   52,   52,   30,   36,   30,   36,   14,   36,   11,
+              65,   36,   11,   65,   27,   13,   20,    4,   27,   30,
+               4,   11,   14,   24,   78,   13,   11,    1,    8,   30,
+              27,  100,   58,   52,   27,   86,    8,   64,    4,   14,
+              36,   42,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp =  9 */
+
+              14,   14,   11,   43,   49,    1,  113,  113,  113,    1,
+               1,  113,   20,  113,  113,  113,   20,   47,  113,  113,
+             113,  113,  113,  113,  113,  113,  113,  113,  113,  113,
+             113,   14,    4,    4,   63,   43,   41,    4,   52,    1,
+              11,   11,   47,   47,   21,   37,   27,   14,   37,   69,
+              27,   31,   63,   75,   69,   63,   51,   15,    5,   47,
+              47,   47,   21,   37,   27,   14,   37,   69,   27,   31,
+              63,   75,   69,   63,   51,   15,    5,   47,    6,   21,
+              68,   43,   63,   63,   37,   47,   47,   41,   21,   15,
+              21,    0,   37,   43,  100,   14,   37,    0,   37,   43,
+             100,   14,   37,    0,   37,   43,  100,   14,   37,   27,
+              11,   52,   52,   30,   36,   30,   36,   14,   36,   11,
+              63,   36,   11,   63,   27,    9,   20,    4,   27,   30,
+               4,   11,   14,   28,   78,    9,   11,    0,   10,   30,
+              27,  100,   58,   52,   27,   82,   10,   62,    4,   14,
+              36,   42,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 10 */
+
+              14,   12,    9,   41,   49,    1,  117,  117,  117,    1,
+               1,  117,   18,  117,  117,  117,   18,   43,  117,  117,
+             117,  117,  117,  117,  117,  117,  117,  117,  117,  117,
+             117,   14,    6,    6,   61,   41,   39,    6,   50,    1,
+               9,    9,   45,   45,   19,   35,   25,   14,   35,   67,
+              25,   29,   61,   73,   67,   61,   49,   13,    3,   43,
+              45,   45,   19,   35,   25,   14,   35,   67,   25,   29,
+              61,   73,   67,   61,   49,   13,    3,   43,    8,   23,
+              70,   41,   61,   61,   35,   45,   45,   39,   19,   13,
+              19,    2,   35,   41,   98,   14,   35,    2,   35,   41,
+              98,   14,   35,    2,   35,   41,   98,   14,   35,   25,
+               9,   50,   50,   30,   38,   30,   38,   14,   38,    9,
+              61,   38,    9,   61,   25,    7,   22,    6,   25,   30,
+               6,    9,   14,   30,   78,    7,    9,    2,   12,   30,
+              25,   98,   56,   50,   25,   80,   12,   60,    6,   14,
+              38,   40,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 11 */
+
+              14,   10,    9,   41,   49,    1,  121,  121,  121,    1,
+               1,  121,   18,  121,  121,  121,   18,   39,  121,  121,
+             121,  121,  121,  121,  121,  121,  121,  121,  121,  121,
+             121,   14,    6,    6,   59,   41,   37,    6,   50,    1,
+               9,    9,   43,   43,   19,   35,   25,   14,   35,   67,
+              25,   27,   59,   73,   67,   59,   45,   11,    3,   39,
+              43,   43,   19,   35,   25,   14,   35,   67,   25,   27,
+              59,   73,   67,   59,   45,   11,    3,   39,   10,   23,
+              70,   41,   59,   59,   35,   43,   43,   37,   19,   11,
+              19,    4,   35,   41,   98,   14,   35,    4,   35,   41,
+              98,   14,   35,    4,   35,   41,   98,   14,   35,   25,
+               9,   50,   50,   30,   38,   30,   38,   14,   38,    9,
+              59,   38,    9,   59,   25,    5,   22,    6,   25,   30,
+               6,    9,   14,   34,   78,    5,    9,    4,   12,   30,
+              25,   98,   56,   50,   25,   76,   12,   58,    6,   14,
+              38,   40,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 12 */
+
+              14,    8,    9,   41,   49,    1,  125,  125,  125,    1,
+               1,  125,   16,  125,  125,  125,   16,   35,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,    6,    6,   57,   41,   35,    6,   48,    1,
+               9,    9,   41,   41,   17,   33,   25,   14,   33,   65,
+              25,   25,   57,   73,   65,   57,   43,    9,    1,   35,
+              41,   41,   17,   33,   25,   14,   33,   65,   25,   25,
+              57,   73,   65,   57,   43,    9,    1,   35,   12,   23,
+              70,   41,   57,   57,   33,   41,   41,   35,   17,    9,
+              17,    6,   33,   41,   96,   14,   33,    6,   33,   41,
+              96,   14,   33,    6,   33,   41,   96,   14,   33,   25,
+               9,   48,   48,   30,   38,   30,   38,   14,   38,    9,
+              57,   38,    9,   57,   25,    3,   22,    6,   25,   30,
+               6,    9,   14,   36,   78,    3,    9,    6,   14,   30,
+              25,   96,   56,   48,   25,   74,   14,   56,    6,   14,
+              38,   40,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 13 */
+
+              14,    6,    7,   39,   49,    1,  125,  125,  125,    1,
+               1,  125,   14,  125,  125,  125,   14,   31,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,    8,    8,   55,   39,   31,    8,   46,    1,
+               7,    7,   39,   39,   15,   31,   23,   14,   31,   63,
+              23,   23,   55,   71,   63,   55,   39,    7,    0,   31,
+              39,   39,   15,   31,   23,   14,   31,   63,   23,   23,
+              55,   71,   63,   55,   39,    7,    0,   31,   16,   25,
+              72,   39,   55,   55,   31,   39,   39,   31,   15,    7,
+              15,    8,   31,   39,   94,   14,   31,    8,   31,   39,
+              94,   14,   31,    8,   31,   39,   94,   14,   31,   23,
+               7,   46,   46,   30,   40,   30,   40,   14,   40,    7,
+              55,   40,    7,   55,   23,    0,   24,    8,   23,   30,
+               8,    7,   14,   40,   78,    0,    7,    8,   16,   30,
+              23,   94,   54,   46,   23,   70,   16,   54,    8,   14,
+              40,   38,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 14 */
+
+              14,    4,    7,   39,   49,    1,  125,  125,  125,    1,
+               1,  125,   14,  125,  125,  125,   14,   27,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,    8,    8,   53,   39,   29,    8,   46,    1,
+               7,    7,   37,   37,   15,   31,   23,   14,   31,   63,
+              23,   21,   53,   71,   63,   53,   37,    5,    0,   27,
+              37,   37,   15,   31,   23,   14,   31,   63,   23,   21,
+              53,   71,   63,   53,   37,    5,    0,   27,   18,   25,
+              72,   39,   53,   53,   31,   37,   37,   29,   15,    5,
+              15,   10,   31,   39,   94,   14,   31,   10,   31,   39,
+              94,   14,   31,   10,   31,   39,   94,   14,   31,   23,
+               7,   46,   46,   30,   40,   30,   40,   14,   40,    7,
+              53,   40,    7,   53,   23,    2,   24,    8,   23,   30,
+               8,    7,   14,   42,   78,    2,    7,   10,   16,   30,
+              23,   94,   54,   46,   23,   68,   16,   52,    8,   14,
+              40,   38,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 15 */
+
+              14,    2,    7,   39,   49,    1,  125,  125,  125,    1,
+               1,  125,   12,  125,  125,  125,   12,   23,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,    8,    8,   51,   39,   27,    8,   44,    1,
+               7,    7,   35,   35,   13,   29,   23,   14,   29,   61,
+              23,   19,   51,   71,   61,   51,   33,    3,    2,   23,
+              35,   35,   13,   29,   23,   14,   29,   61,   23,   19,
+              51,   71,   61,   51,   33,    3,    2,   23,   20,   25,
+              72,   39,   51,   51,   29,   35,   35,   27,   13,    3,
+              13,   12,   29,   39,   92,   14,   29,   12,   29,   39,
+              92,   14,   29,   12,   29,   39,   92,   14,   29,   23,
+               7,   44,   44,   30,   40,   30,   40,   14,   40,    7,
+              51,   40,    7,   51,   23,    4,   24,    8,   23,   30,
+               8,    7,   14,   46,   78,    4,    7,   12,   18,   30,
+              23,   92,   54,   44,   23,   64,   18,   50,    8,   14,
+              40,   38,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 16 */
+
+              14,    0,    7,   39,   49,    1,  125,  125,  125,    1,
+               1,  125,   10,  125,  125,  125,   10,   21,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,    8,    8,   51,   39,   25,    8,   42,    1,
+               7,    7,   35,   35,   13,   29,   23,   14,   29,   61,
+              23,   19,   51,   71,   61,   51,   31,    3,    2,   21,
+              35,   35,   13,   29,   23,   14,   29,   61,   23,   19,
+              51,   71,   61,   51,   31,    3,    2,   21,   22,   27,
+              72,   39,   51,   51,   29,   35,   35,   25,   13,    3,
+              13,   12,   29,   39,   90,   14,   29,   12,   29,   39,
+              90,   14,   29,   12,   29,   39,   90,   14,   29,   23,
+               7,   42,   42,   30,   40,   30,   40,   14,   40,    7,
+              51,   40,    7,   51,   23,    6,   24,    8,   23,   30,
+               8,    7,   14,   48,   78,    6,    7,   12,   18,   30,
+              23,   90,   52,   42,   23,   60,   18,   48,    8,   14,
+              40,   36,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 17 */
+
+              14,    0,    5,   37,   49,    1,  125,  125,  125,    1,
+               1,  125,   10,  125,  125,  125,   10,   17,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   10,   10,   49,   37,   21,   10,   42,    1,
+               5,    5,   33,   33,   11,   27,   21,   14,   27,   59,
+              21,   17,   49,   69,   59,   49,   27,    1,    4,   17,
+              33,   33,   11,   27,   21,   14,   27,   59,   21,   17,
+              49,   69,   59,   49,   27,    1,    4,   17,   26,   27,
+              74,   37,   49,   49,   27,   33,   33,   21,   11,    1,
+              11,   14,   27,   37,   90,   14,   27,   14,   27,   37,
+              90,   14,   27,   14,   27,   37,   90,   14,   27,   21,
+               5,   42,   42,   30,   42,   30,   42,   14,   42,    5,
+              49,   42,    5,   49,   21,   10,   26,   10,   21,   30,
+              10,    5,   14,   52,   78,   10,    5,   14,   20,   30,
+              21,   90,   52,   42,   21,   58,   20,   48,   10,   14,
+              42,   36,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 18 */
+
+              14,    1,    5,   37,   49,    1,  125,  125,  125,    1,
+               1,  125,    8,  125,  125,  125,    8,   13,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   10,   10,   47,   37,   19,   10,   40,    1,
+               5,    5,   31,   31,    9,   25,   21,   14,   25,   57,
+              21,   15,   47,   69,   57,   47,   23,    0,    6,   13,
+              31,   31,    9,   25,   21,   14,   25,   57,   21,   15,
+              47,   69,   57,   47,   23,    0,    6,   13,   28,   27,
+              74,   37,   47,   47,   25,   31,   31,   19,    9,    0,
+               9,   16,   25,   37,   88,   14,   25,   16,   25,   37,
+              88,   14,   25,   16,   25,   37,   88,   14,   25,   21,
+               5,   40,   40,   30,   42,   30,   42,   14,   42,    5,
+              47,   42,    5,   47,   21,   12,   26,   10,   21,   30,
+              10,    5,   14,   56,   78,   12,    5,   16,   22,   30,
+              21,   88,   52,   40,   21,   54,   22,   46,   10,   14,
+              42,   36,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 19 */
+
+              14,    3,    5,   37,   49,    1,  125,  125,  125,    1,
+               1,  125,    8,  125,  125,  125,    8,    9,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   10,   10,   45,   37,   17,   10,   40,    1,
+               5,    5,   29,   29,    9,   25,   21,   14,   25,   57,
+              21,   13,   45,   69,   57,   45,   21,    2,    6,    9,
+              29,   29,    9,   25,   21,   14,   25,   57,   21,   13,
+              45,   69,   57,   45,   21,    2,    6,    9,   30,   27,
+              74,   37,   45,   45,   25,   29,   29,   17,    9,    2,
+               9,   18,   25,   37,   88,   14,   25,   18,   25,   37,
+              88,   14,   25,   18,   25,   37,   88,   14,   25,   21,
+               5,   40,   40,   30,   42,   30,   42,   14,   42,    5,
+              45,   42,    5,   45,   21,   14,   26,   10,   21,   30,
+              10,    5,   14,   58,   78,   14,    5,   18,   22,   30,
+              21,   88,   52,   40,   21,   52,   22,   44,   10,   14,
+              42,   36,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 20 */
+
+              14,    5,    3,   35,   49,    1,  125,  125,  125,    1,
+               1,  125,    6,  125,  125,  125,    6,    5,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   12,   12,   43,   35,   15,   12,   38,    1,
+               3,    3,   27,   27,    7,   23,   19,   14,   23,   55,
+              19,   11,   43,   67,   55,   43,   17,    4,    8,    5,
+              27,   27,    7,   23,   19,   14,   23,   55,   19,   11,
+              43,   67,   55,   43,   17,    4,    8,    5,   32,   29,
+              76,   35,   43,   43,   23,   27,   27,   15,    7,    4,
+               7,   20,   23,   35,   86,   14,   23,   20,   23,   35,
+              86,   14,   23,   20,   23,   35,   86,   14,   23,   19,
+               3,   38,   38,   30,   44,   30,   44,   14,   44,    3,
+              43,   44,    3,   43,   19,   16,   28,   12,   19,   30,
+              12,    3,   14,   62,   78,   16,    3,   20,   24,   30,
+              19,   86,   50,   38,   19,   48,   24,   42,   12,   14,
+              44,   34,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 21 */
+
+              14,    7,    3,   35,   49,    1,  125,  125,  125,    1,
+               1,  125,    4,  125,  125,  125,    4,    1,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   12,   12,   41,   35,   11,   12,   36,    1,
+               3,    3,   25,   25,    5,   21,   19,   14,   21,   53,
+              19,    9,   41,   67,   53,   41,   15,    6,   10,    1,
+              25,   25,    5,   21,   19,   14,   21,   53,   19,    9,
+              41,   67,   53,   41,   15,    6,   10,    1,   36,   29,
+              76,   35,   41,   41,   21,   25,   25,   11,    5,    6,
+               5,   22,   21,   35,   84,   14,   21,   22,   21,   35,
+              84,   14,   21,   22,   21,   35,   84,   14,   21,   19,
+               3,   36,   36,   30,   44,   30,   44,   14,   44,    3,
+              41,   44,    3,   41,   19,   20,   28,   12,   19,   30,
+              12,    3,   14,   64,   78,   20,    3,   22,   26,   30,
+              19,   84,   50,   36,   19,   46,   26,   40,   12,   14,
+              44,   34,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 22 */
+
+              14,    9,    3,   35,   49,    1,  125,  125,  125,    1,
+               1,  125,    4,  125,  125,  125,    4,    2,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   12,   12,   39,   35,    9,   12,   36,    1,
+               3,    3,   23,   23,    5,   21,   19,   14,   21,   53,
+              19,    7,   39,   67,   53,   39,   11,    8,   10,    2,
+              23,   23,    5,   21,   19,   14,   21,   53,   19,    7,
+              39,   67,   53,   39,   11,    8,   10,    2,   38,   29,
+              76,   35,   39,   39,   21,   23,   23,    9,    5,    8,
+               5,   24,   21,   35,   84,   14,   21,   24,   21,   35,
+              84,   14,   21,   24,   21,   35,   84,   14,   21,   19,
+               3,   36,   36,   30,   44,   30,   44,   14,   44,    3,
+              39,   44,    3,   39,   19,   22,   28,   12,   19,   30,
+              12,    3,   14,   68,   78,   22,    3,   24,   26,   30,
+              19,   84,   50,   36,   19,   42,   26,   38,   12,   14,
+              44,   34,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 23 */
+
+              14,   11,    1,   33,   49,    1,  125,  125,  125,    1,
+               1,  125,    2,  125,  125,  125,    2,    6,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   14,   14,   37,   33,    7,   14,   34,    1,
+               1,    1,   21,   21,    3,   19,   17,   14,   19,   51,
+              17,    5,   37,   65,   51,   37,    9,   10,   12,    6,
+              21,   21,    3,   19,   17,   14,   19,   51,   17,    5,
+              37,   65,   51,   37,    9,   10,   12,    6,   40,   31,
+              78,   33,   37,   37,   19,   21,   21,    7,    3,   10,
+               3,   26,   19,   33,   82,   14,   19,   26,   19,   33,
+              82,   14,   19,   26,   19,   33,   82,   14,   19,   17,
+               1,   34,   34,   30,   46,   30,   46,   14,   46,    1,
+              37,   46,    1,   37,   17,   24,   30,   14,   17,   30,
+              14,    1,   14,   70,   78,   24,    1,   26,   28,   30,
+              17,   82,   48,   34,   17,   40,   28,   36,   14,   14,
+              46,   32,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 24 */
+
+              14,   13,    1,   33,   49,    1,  125,  125,  125,    1,
+               1,  125,    0,  125,  125,  125,    0,    8,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   14,   14,   35,   33,    5,   14,   32,    1,
+               1,    1,   19,   19,    3,   19,   17,   14,   19,   51,
+              17,    3,   35,   65,   51,   35,    5,   12,   12,    8,
+              19,   19,    3,   19,   17,   14,   19,   51,   17,    3,
+              35,   65,   51,   35,    5,   12,   12,    8,   42,   31,
+              78,   33,   35,   35,   19,   19,   19,    5,    3,   12,
+               3,   28,   19,   33,   80,   14,   19,   28,   19,   33,
+              80,   14,   19,   28,   19,   33,   80,   14,   19,   17,
+               1,   32,   32,   30,   46,   30,   46,   14,   46,    1,
+              35,   46,    1,   35,   17,   26,   30,   14,   17,   30,
+              14,    1,   14,   74,   78,   26,    1,   28,   28,   30,
+              17,   80,   48,   32,   17,   36,   28,   34,   14,   14,
+              46,   32,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 25 */
+
+              14,   15,    1,   33,   49,    1,  125,  125,  125,    1,
+               1,  125,    0,  125,  125,  125,    0,   12,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   14,   14,   33,   33,    1,   14,   32,    1,
+               1,    1,   17,   17,    1,   17,   17,   14,   17,   49,
+              17,    1,   33,   65,   49,   33,    1,   14,   14,   12,
+              17,   17,    1,   17,   17,   14,   17,   49,   17,    1,
+              33,   65,   49,   33,    1,   14,   14,   12,   46,   31,
+              78,   33,   33,   33,   17,   17,   17,    1,    1,   14,
+               1,   30,   17,   33,   80,   14,   17,   30,   17,   33,
+              80,   14,   17,   30,   17,   33,   80,   14,   17,   17,
+               1,   32,   32,   30,   46,   30,   46,   14,   46,    1,
+              33,   46,    1,   33,   17,   30,   30,   14,   17,   30,
+              14,    1,   14,   78,   78,   30,    1,   30,   30,   30,
+              17,   80,   48,   32,   17,   32,   30,   32,   14,   14,
+              46,   32,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 26 */
+
+              14,   17,    0,   31,   49,    1,  125,  125,  125,    1,
+               1,  125,    1,  125,  125,  125,    1,   16,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   16,   16,   31,   31,    0,   16,   30,    1,
+               0,    0,   15,   15,    0,   15,   15,   14,   15,   47,
+              15,    0,   31,   63,   47,   31,    0,   16,   16,   16,
+              15,   15,    0,   15,   15,   14,   15,   47,   15,    0,
+              31,   63,   47,   31,    0,   16,   16,   16,   48,   33,
+              80,   31,   31,   31,   15,   15,   15,    0,    0,   16,
+               0,   32,   15,   31,   78,   14,   15,   32,   15,   31,
+              78,   14,   15,   32,   15,   31,   78,   14,   15,   15,
+               0,   30,   30,   30,   48,   30,   48,   14,   48,    0,
+              31,   48,    0,   31,   15,   32,   32,   16,   15,   30,
+              16,    0,   14,   80,   78,   32,    0,   32,   32,   30,
+              15,   78,   46,   30,   15,   30,   32,   30,   16,   14,
+              48,   30,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 27 */
+
+              14,   19,    0,   31,   49,    1,  125,  125,  125,    1,
+               1,  125,    1,  125,  125,  125,    1,   20,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   16,   16,   29,   31,    2,   16,   30,    1,
+               0,    0,   13,   13,    0,   15,   15,   14,   15,   47,
+              15,    2,   29,   63,   47,   29,    4,   18,   16,   20,
+              13,   13,    0,   15,   15,   14,   15,   47,   15,    2,
+              29,   63,   47,   29,    4,   18,   16,   20,   50,   33,
+              80,   31,   29,   29,   15,   13,   13,    2,    0,   18,
+               0,   34,   15,   31,   78,   14,   15,   34,   15,   31,
+              78,   14,   15,   34,   15,   31,   78,   14,   15,   15,
+               0,   30,   30,   30,   48,   30,   48,   14,   48,    0,
+              29,   48,    0,   29,   15,   34,   32,   16,   15,   30,
+              16,    0,   14,   84,   78,   34,    0,   34,   32,   30,
+              15,   78,   46,   30,   15,   26,   32,   28,   16,   14,
+              48,   30,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 28 */
+
+              14,   21,    0,   31,   49,    1,  125,  125,  125,    1,
+               1,  125,    3,  125,  125,  125,    3,   24,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   16,   16,   27,   31,    4,   16,   28,    1,
+               0,    0,   11,   11,    2,   13,   15,   14,   13,   45,
+              15,    4,   27,   63,   45,   27,    6,   20,   18,   24,
+              11,   11,    2,   13,   15,   14,   13,   45,   15,    4,
+              27,   63,   45,   27,    6,   20,   18,   24,   52,   33,
+              80,   31,   27,   27,   13,   11,   11,    4,    2,   20,
+               2,   36,   13,   31,   76,   14,   13,   36,   13,   31,
+              76,   14,   13,   36,   13,   31,   76,   14,   13,   15,
+               0,   28,   28,   30,   48,   30,   48,   14,   48,    0,
+              27,   48,    0,   27,   15,   36,   32,   16,   15,   30,
+              16,    0,   14,   86,   78,   36,    0,   36,   34,   30,
+              15,   76,   46,   28,   15,   24,   34,   26,   16,   14,
+              48,   30,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 29 */
+
+              14,   23,    2,   29,   49,    1,  125,  125,  125,    1,
+               1,  125,    5,  125,  125,  125,    5,   28,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   18,   18,   25,   29,    8,   18,   26,    1,
+               2,    2,    9,    9,    4,   11,   13,   14,   11,   43,
+              13,    6,   25,   61,   43,   25,   10,   22,   20,   28,
+               9,    9,    4,   11,   13,   14,   11,   43,   13,    6,
+              25,   61,   43,   25,   10,   22,   20,   28,   56,   35,
+              82,   29,   25,   25,   11,    9,    9,    8,    4,   22,
+               4,   38,   11,   29,   74,   14,   11,   38,   11,   29,
+              74,   14,   11,   38,   11,   29,   74,   14,   11,   13,
+               2,   26,   26,   30,   50,   30,   50,   14,   50,    2,
+              25,   50,    2,   25,   13,   40,   34,   18,   13,   30,
+              18,    2,   14,   90,   78,   40,    2,   38,   36,   30,
+              13,   74,   44,   26,   13,   20,   36,   24,   18,   14,
+              50,   28,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 30 */
+
+              14,   25,    2,   29,   49,    1,  125,  125,  125,    1,
+               1,  125,    5,  125,  125,  125,    5,   32,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   18,   18,   23,   29,   10,   18,   26,    1,
+               2,    2,    7,    7,    4,   11,   13,   14,   11,   43,
+              13,    8,   23,   61,   43,   23,   12,   24,   20,   32,
+               7,    7,    4,   11,   13,   14,   11,   43,   13,    8,
+              23,   61,   43,   23,   12,   24,   20,   32,   58,   35,
+              82,   29,   23,   23,   11,    7,    7,   10,    4,   24,
+               4,   40,   11,   29,   74,   14,   11,   40,   11,   29,
+              74,   14,   11,   40,   11,   29,   74,   14,   11,   13,
+               2,   26,   26,   30,   50,   30,   50,   14,   50,    2,
+              23,   50,    2,   23,   13,   42,   34,   18,   13,   30,
+              18,    2,   14,   92,   78,   42,    2,   40,   36,   30,
+              13,   74,   44,   26,   13,   18,   36,   22,   18,   14,
+              50,   28,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 31 */
+
+              14,   27,    2,   29,   49,    1,  125,  125,  125,    1,
+               1,  125,    7,  125,  125,  125,    7,   36,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   18,   18,   21,   29,   12,   18,   24,    1,
+               2,    2,    5,    5,    6,    9,   13,   14,    9,   41,
+              13,   10,   21,   61,   41,   21,   16,   26,   22,   36,
+               5,    5,    6,    9,   13,   14,    9,   41,   13,   10,
+              21,   61,   41,   21,   16,   26,   22,   36,   60,   35,
+              82,   29,   21,   21,    9,    5,    5,   12,    6,   26,
+               6,   42,    9,   29,   72,   14,    9,   42,    9,   29,
+              72,   14,    9,   42,    9,   29,   72,   14,    9,   13,
+               2,   24,   24,   30,   50,   30,   50,   14,   50,    2,
+              21,   50,    2,   21,   13,   44,   34,   18,   13,   30,
+              18,    2,   14,   96,   78,   44,    2,   42,   38,   30,
+              13,   72,   44,   24,   13,   14,   38,   20,   18,   14,
+              50,   28,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 32 */
+
+              14,   29,    2,   29,   49,    1,  125,  125,  125,    1,
+               1,  125,    9,  125,  125,  125,    9,   38,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   18,   18,   21,   29,   14,   18,   22,    1,
+               2,    2,    5,    5,    6,    9,   13,   14,    9,   41,
+              13,   10,   21,   61,   41,   21,   18,   26,   22,   38,
+               5,    5,    6,    9,   13,   14,    9,   41,   13,   10,
+              21,   61,   41,   21,   18,   26,   22,   38,   62,   37,
+              82,   29,   21,   21,    9,    5,    5,   14,    6,   26,
+               6,   42,    9,   29,   70,   14,    9,   42,    9,   29,
+              70,   14,    9,   42,    9,   29,   70,   14,    9,   13,
+               2,   22,   22,   30,   50,   30,   50,   14,   50,    2,
+              21,   50,    2,   21,   13,   46,   34,   18,   13,   30,
+              18,    2,   14,   98,   78,   46,    2,   42,   38,   30,
+              13,   70,   42,   22,   13,   10,   38,   18,   18,   14,
+              50,   26,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 33 */
+
+              14,   29,    4,   27,   49,    1,  125,  125,  125,    1,
+               1,  125,    9,  125,  125,  125,    9,   42,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   20,   20,   19,   27,   18,   20,   22,    1,
+               4,    4,    3,    3,    8,    7,   11,   14,    7,   39,
+              11,   12,   19,   59,   39,   19,   22,   28,   24,   42,
+               3,    3,    8,    7,   11,   14,    7,   39,   11,   12,
+              19,   59,   39,   19,   22,   28,   24,   42,   66,   37,
+              84,   27,   19,   19,    7,    3,    3,   18,    8,   28,
+               8,   44,    7,   27,   70,   14,    7,   44,    7,   27,
+              70,   14,    7,   44,    7,   27,   70,   14,    7,   11,
+               4,   22,   22,   30,   52,   30,   52,   14,   52,    4,
+              19,   52,    4,   19,   11,   50,   36,   20,   11,   30,
+              20,    4,   14,  102,   78,   50,    4,   44,   40,   30,
+              11,   70,   42,   22,   11,    8,   40,   18,   20,   14,
+              52,   26,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 34 */
+
+              14,   31,    4,   27,   49,    1,  125,  125,  125,    1,
+               1,  125,   11,  125,  125,  125,   11,   46,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   20,   20,   17,   27,   20,   20,   20,    1,
+               4,    4,    1,    1,   10,    5,   11,   14,    5,   37,
+              11,   14,   17,   59,   37,   17,   26,   30,   26,   46,
+               1,    1,   10,    5,   11,   14,    5,   37,   11,   14,
+              17,   59,   37,   17,   26,   30,   26,   46,   68,   37,
+              84,   27,   17,   17,    5,    1,    1,   20,   10,   30,
+              10,   46,    5,   27,   68,   14,    5,   46,    5,   27,
+              68,   14,    5,   46,    5,   27,   68,   14,    5,   11,
+               4,   20,   20,   30,   52,   30,   52,   14,   52,    4,
+              17,   52,    4,   17,   11,   52,   36,   20,   11,   30,
+              20,    4,   14,  106,   78,   52,    4,   46,   42,   30,
+              11,   68,   42,   20,   11,    4,   42,   16,   20,   14,
+              52,   26,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 35 */
+
+              14,   33,    4,   27,   49,    1,  125,  125,  125,    1,
+               1,  125,   11,  125,  125,  125,   11,   50,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   20,   20,   15,   27,   22,   20,   20,    1,
+               4,    4,    0,    0,   10,    5,   11,   14,    5,   37,
+              11,   16,   15,   59,   37,   15,   28,   32,   26,   50,
+               0,    0,   10,    5,   11,   14,    5,   37,   11,   16,
+              15,   59,   37,   15,   28,   32,   26,   50,   70,   37,
+              84,   27,   15,   15,    5,    0,    0,   22,   10,   32,
+              10,   48,    5,   27,   68,   14,    5,   48,    5,   27,
+              68,   14,    5,   48,    5,   27,   68,   14,    5,   11,
+               4,   20,   20,   30,   52,   30,   52,   14,   52,    4,
+              15,   52,    4,   15,   11,   54,   36,   20,   11,   30,
+              20,    4,   14,  108,   78,   54,    4,   48,   42,   30,
+              11,   68,   42,   20,   11,    2,   42,   14,   20,   14,
+              52,   26,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 36 */
+
+              14,   35,    6,   25,   49,    1,  125,  125,  125,    1,
+               1,  125,   13,  125,  125,  125,   13,   54,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   22,   22,   13,   25,   24,   22,   18,    1,
+               6,    6,    2,    2,   12,    3,    9,   14,    3,   35,
+               9,   18,   13,   57,   35,   13,   32,   34,   28,   54,
+               2,    2,   12,    3,    9,   14,    3,   35,    9,   18,
+              13,   57,   35,   13,   32,   34,   28,   54,   72,   39,
+              86,   25,   13,   13,    3,    2,    2,   24,   12,   34,
+              12,   50,    3,   25,   66,   14,    3,   50,    3,   25,
+              66,   14,    3,   50,    3,   25,   66,   14,    3,    9,
+               6,   18,   18,   30,   54,   30,   54,   14,   54,    6,
+              13,   54,    6,   13,    9,   56,   38,   22,    9,   30,
+              22,    6,   14,  112,   78,   56,    6,   50,   44,   30,
+               9,   66,   40,   18,    9,    1,   44,   12,   22,   14,
+              54,   24,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 37 */
+
+              14,   37,    6,   25,   49,    1,  125,  125,  125,    1,
+               1,  125,   15,  125,  125,  125,   15,   58,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   22,   22,   11,   25,   28,   22,   16,    1,
+               6,    6,    4,    4,   14,    1,    9,   14,    1,   33,
+               9,   20,   11,   57,   33,   11,   34,   36,   30,   58,
+               4,    4,   14,    1,    9,   14,    1,   33,    9,   20,
+              11,   57,   33,   11,   34,   36,   30,   58,   76,   39,
+              86,   25,   11,   11,    1,    4,    4,   28,   14,   36,
+              14,   52,    1,   25,   64,   14,    1,   52,    1,   25,
+              64,   14,    1,   52,    1,   25,   64,   14,    1,    9,
+               6,   16,   16,   30,   54,   30,   54,   14,   54,    6,
+              11,   54,    6,   11,    9,   60,   38,   22,    9,   30,
+              22,    6,   14,  114,   78,   60,    6,   52,   46,   30,
+               9,   64,   40,   16,    9,    3,   46,   10,   22,   14,
+              54,   24,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 38 */
+
+              14,   39,    6,   25,   49,    1,  125,  125,  125,    1,
+               1,  125,   15,  125,  125,  125,   15,   62,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   22,   22,    9,   25,   30,   22,   16,    1,
+               6,    6,    6,    6,   14,    1,    9,   14,    1,   33,
+               9,   22,    9,   57,   33,    9,   38,   38,   30,   62,
+               6,    6,   14,    1,    9,   14,    1,   33,    9,   22,
+               9,   57,   33,    9,   38,   38,   30,   62,   78,   39,
+              86,   25,    9,    9,    1,    6,    6,   30,   14,   38,
+              14,   54,    1,   25,   64,   14,    1,   54,    1,   25,
+              64,   14,    1,   54,    1,   25,   64,   14,    1,    9,
+               6,   16,   16,   30,   54,   30,   54,   14,   54,    6,
+               9,   54,    6,    9,    9,   62,   38,   22,    9,   30,
+              22,    6,   14,  118,   78,   62,    6,   54,   46,   30,
+               9,   64,   40,   16,    9,    7,   46,    8,   22,   14,
+              54,   24,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 39 */
+
+              14,   41,    8,   23,   49,    1,  125,  125,  125,    1,
+               1,  125,   17,  125,  125,  125,   17,   66,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   24,   24,    7,   23,   32,   24,   14,    1,
+               8,    8,    8,    8,   16,    0,    7,   14,    0,   31,
+               7,   24,    7,   55,   31,    7,   40,   40,   32,   66,
+               8,    8,   16,    0,    7,   14,    0,   31,    7,   24,
+               7,   55,   31,    7,   40,   40,   32,   66,   80,   41,
+              88,   23,    7,    7,    0,    8,    8,   32,   16,   40,
+              16,   56,    0,   23,   62,   14,    0,   56,    0,   23,
+              62,   14,    0,   56,    0,   23,   62,   14,    0,    7,
+               8,   14,   14,   30,   56,   30,   56,   14,   56,    8,
+               7,   56,    8,    7,    7,   64,   40,   24,    7,   30,
+              24,    8,   14,  120,   78,   64,    8,   56,   48,   30,
+               7,   62,   38,   14,    7,    9,   48,    6,   24,   14,
+              56,   22,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 40 */
+
+              14,   43,    8,   23,   49,    1,  125,  125,  125,    1,
+               1,  125,   19,  125,  125,  125,   19,   68,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   24,   24,    5,   23,   34,   24,   12,    1,
+               8,    8,   10,   10,   16,    0,    7,   14,    0,   31,
+               7,   26,    5,   55,   31,    5,   44,   42,   32,   68,
+              10,   10,   16,    0,    7,   14,    0,   31,    7,   26,
+               5,   55,   31,    5,   44,   42,   32,   68,   82,   41,
+              88,   23,    5,    5,    0,   10,   10,   34,   16,   42,
+              16,   58,    0,   23,   60,   14,    0,   58,    0,   23,
+              60,   14,    0,   58,    0,   23,   60,   14,    0,    7,
+               8,   12,   12,   30,   56,   30,   56,   14,   56,    8,
+               5,   56,    8,    5,    7,   66,   40,   24,    7,   30,
+              24,    8,   14,  124,   78,   66,    8,   58,   48,   30,
+               7,   60,   38,   12,    7,   13,   48,    4,   24,   14,
+              56,   22,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 41 */
+
+              14,   45,    8,   23,   49,    1,  125,  125,  125,    1,
+               1,  125,   19,  125,  125,  125,   19,   72,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   24,   24,    3,   23,   38,   24,   12,    1,
+               8,    8,   12,   12,   18,    2,    7,   14,    2,   29,
+               7,   28,    3,   55,   29,    3,   48,   44,   34,   72,
+              12,   12,   18,    2,    7,   14,    2,   29,    7,   28,
+               3,   55,   29,    3,   48,   44,   34,   72,   86,   41,
+              88,   23,    3,    3,    2,   12,   12,   38,   18,   44,
+              18,   60,    2,   23,   60,   14,    2,   60,    2,   23,
+              60,   14,    2,   60,    2,   23,   60,   14,    2,    7,
+               8,   12,   12,   30,   56,   30,   56,   14,   56,    8,
+               3,   56,    8,    3,    7,   70,   40,   24,    7,   30,
+              24,    8,   14,  124,   78,   70,    8,   60,   50,   30,
+               7,   60,   38,   12,    7,   17,   50,    2,   24,   14,
+              56,   22,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 42 */
+
+              14,   47,   10,   21,   49,    1,  125,  125,  125,    1,
+               1,  125,   21,  125,  125,  125,   21,   76,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   26,   26,    1,   21,   40,   26,   10,    1,
+              10,   10,   14,   14,   20,    4,    5,   14,    4,   27,
+               5,   30,    1,   53,   27,    1,   50,   46,   36,   76,
+              14,   14,   20,    4,    5,   14,    4,   27,    5,   30,
+               1,   53,   27,    1,   50,   46,   36,   76,   88,   43,
+              90,   21,    1,    1,    4,   14,   14,   40,   20,   46,
+              20,   62,    4,   21,   58,   14,    4,   62,    4,   21,
+              58,   14,    4,   62,    4,   21,   58,   14,    4,    5,
+              10,   10,   10,   30,   58,   30,   58,   14,   58,   10,
+               1,   58,   10,    1,    5,   72,   42,   26,    5,   30,
+              26,   10,   14,  124,   78,   72,   10,   62,   52,   30,
+               5,   58,   36,   10,    5,   19,   52,    0,   26,   14,
+              58,   20,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 43 */
+
+              14,   49,   10,   21,   49,    1,  125,  125,  125,    1,
+               1,  125,   21,  125,  125,  125,   21,   80,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   26,   26,    0,   21,   42,   26,   10,    1,
+              10,   10,   16,   16,   20,    4,    5,   14,    4,   27,
+               5,   32,    0,   53,   27,    0,   54,   48,   36,   80,
+              16,   16,   20,    4,    5,   14,    4,   27,    5,   32,
+               0,   53,   27,    0,   54,   48,   36,   80,   90,   43,
+              90,   21,    0,    0,    4,   16,   16,   42,   20,   48,
+              20,   64,    4,   21,   58,   14,    4,   64,    4,   21,
+              58,   14,    4,   64,    4,   21,   58,   14,    4,    5,
+              10,   10,   10,   30,   58,   30,   58,   14,   58,   10,
+               0,   58,   10,    0,    5,   74,   42,   26,    5,   30,
+              26,   10,   14,  124,   78,   74,   10,   64,   52,   30,
+               5,   58,   36,   10,    5,   23,   52,    1,   26,   14,
+              58,   20,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 44 */
+
+              14,   51,   10,   21,   49,    1,  125,  125,  125,    1,
+               1,  125,   23,  125,  125,  125,   23,   84,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   26,   26,    2,   21,   44,   26,    8,    1,
+              10,   10,   18,   18,   22,    6,    5,   14,    6,   25,
+               5,   34,    2,   53,   25,    2,   56,   50,   38,   84,
+              18,   18,   22,    6,    5,   14,    6,   25,    5,   34,
+               2,   53,   25,    2,   56,   50,   38,   84,   92,   43,
+              90,   21,    2,    2,    6,   18,   18,   44,   22,   50,
+              22,   66,    6,   21,   56,   14,    6,   66,    6,   21,
+              56,   14,    6,   66,    6,   21,   56,   14,    6,    5,
+              10,    8,    8,   30,   58,   30,   58,   14,   58,   10,
+               2,   58,   10,    2,    5,   76,   42,   26,    5,   30,
+              26,   10,   14,  124,   78,   76,   10,   66,   54,   30,
+               5,   56,   36,    8,    5,   25,   54,    3,   26,   14,
+              58,   20,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 45 */
+
+              14,   53,   12,   19,   49,    1,  125,  125,  125,    1,
+               1,  125,   25,  125,  125,  125,   25,   88,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   28,   28,    4,   19,   48,   28,    6,    1,
+              12,   12,   20,   20,   24,    8,    3,   14,    8,   23,
+               3,   36,    4,   51,   23,    4,   60,   52,   40,   88,
+              20,   20,   24,    8,    3,   14,    8,   23,    3,   36,
+               4,   51,   23,    4,   60,   52,   40,   88,   96,   45,
+              92,   19,    4,    4,    8,   20,   20,   48,   24,   52,
+              24,   68,    8,   19,   54,   14,    8,   68,    8,   19,
+              54,   14,    8,   68,    8,   19,   54,   14,    8,    3,
+              12,    6,    6,   30,   60,   30,   60,   14,   60,   12,
+               4,   60,   12,    4,    3,   80,   44,   28,    3,   30,
+              28,   12,   14,  124,   78,   80,   12,   68,   56,   30,
+               3,   54,   34,    6,    3,   29,   56,    5,   28,   14,
+              60,   18,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 46 */
+
+              14,   55,   12,   19,   49,    1,  125,  125,  125,    1,
+               1,  125,   25,  125,  125,  125,   25,   92,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   28,   28,    6,   19,   50,   28,    6,    1,
+              12,   12,   22,   22,   24,    8,    3,   14,    8,   23,
+               3,   38,    6,   51,   23,    6,   62,   54,   40,   92,
+              22,   22,   24,    8,    3,   14,    8,   23,    3,   38,
+               6,   51,   23,    6,   62,   54,   40,   92,   98,   45,
+              92,   19,    6,    6,    8,   22,   22,   50,   24,   54,
+              24,   70,    8,   19,   54,   14,    8,   70,    8,   19,
+              54,   14,    8,   70,    8,   19,   54,   14,    8,    3,
+              12,    6,    6,   30,   60,   30,   60,   14,   60,   12,
+               6,   60,   12,    6,    3,   82,   44,   28,    3,   30,
+              28,   12,   14,  124,   78,   82,   12,   70,   56,   30,
+               3,   54,   34,    6,    3,   31,   56,    7,   28,   14,
+              60,   18,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 47 */
+
+              14,   57,   12,   19,   49,    1,  125,  125,  125,    1,
+               1,  125,   27,  125,  125,  125,   27,   96,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   28,   28,    8,   19,   52,   28,    4,    1,
+              12,   12,   24,   24,   26,   10,    3,   14,   10,   21,
+               3,   40,    8,   51,   21,    8,   66,   56,   42,   96,
+              24,   24,   26,   10,    3,   14,   10,   21,    3,   40,
+               8,   51,   21,    8,   66,   56,   42,   96,  100,   45,
+              92,   19,    8,    8,   10,   24,   24,   52,   26,   56,
+              26,   72,   10,   19,   52,   14,   10,   72,   10,   19,
+              52,   14,   10,   72,   10,   19,   52,   14,   10,    3,
+              12,    4,    4,   30,   60,   30,   60,   14,   60,   12,
+               8,   60,   12,    8,    3,   84,   44,   28,    3,   30,
+              28,   12,   14,  124,   78,   84,   12,   72,   58,   30,
+               3,   52,   34,    4,    3,   35,   58,    9,   28,   14,
+              60,   18,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 48 */
+
+              14,   59,   12,   19,   49,    1,  125,  125,  125,    1,
+               1,  125,   29,  125,  125,  125,   29,   98,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   28,   28,    8,   19,   54,   28,    2,    1,
+              12,   12,   24,   24,   26,   10,    3,   14,   10,   21,
+               3,   40,    8,   51,   21,    8,   68,   56,   42,   98,
+              24,   24,   26,   10,    3,   14,   10,   21,    3,   40,
+               8,   51,   21,    8,   68,   56,   42,   98,  102,   47,
+              92,   19,    8,    8,   10,   24,   24,   54,   26,   56,
+              26,   72,   10,   19,   50,   14,   10,   72,   10,   19,
+              50,   14,   10,   72,   10,   19,   50,   14,   10,    3,
+              12,    2,    2,   30,   60,   30,   60,   14,   60,   12,
+               8,   60,   12,    8,    3,   86,   44,   28,    3,   30,
+              28,   12,   14,  124,   78,   86,   12,   72,   58,   30,
+               3,   50,   32,    2,    3,   39,   58,   11,   28,   14,
+              60,   16,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 49 */
+
+              14,   59,   14,   17,   49,    1,  125,  125,  125,    1,
+               1,  125,   29,  125,  125,  125,   29,  102,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   30,   30,   10,   17,   58,   30,    2,    1,
+              14,   14,   26,   26,   28,   12,    1,   14,   12,   19,
+               1,   42,   10,   49,   19,   10,   72,   58,   44,  102,
+              26,   26,   28,   12,    1,   14,   12,   19,    1,   42,
+              10,   49,   19,   10,   72,   58,   44,  102,  106,   47,
+              94,   17,   10,   10,   12,   26,   26,   58,   28,   58,
+              28,   74,   12,   17,   50,   14,   12,   74,   12,   17,
+              50,   14,   12,   74,   12,   17,   50,   14,   12,    1,
+              14,    2,    2,   30,   62,   30,   62,   14,   62,   14,
+              10,   62,   14,   10,    1,   90,   46,   30,    1,   30,
+              30,   14,   14,  124,   78,   90,   14,   74,   60,   30,
+               1,   50,   32,    2,    1,   41,   60,   11,   30,   14,
+              62,   16,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 50 */
+
+              14,   61,   14,   17,   49,    1,  125,  125,  125,    1,
+               1,  125,   31,  125,  125,  125,   31,  106,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   30,   30,   12,   17,   60,   30,    0,    1,
+              14,   14,   28,   28,   30,   14,    1,   14,   14,   17,
+               1,   44,   12,   49,   17,   12,   76,   60,   46,  106,
+              28,   28,   30,   14,    1,   14,   14,   17,    1,   44,
+              12,   49,   17,   12,   76,   60,   46,  106,  108,   47,
+              94,   17,   12,   12,   14,   28,   28,   60,   30,   60,
+              30,   76,   14,   17,   48,   14,   14,   76,   14,   17,
+              48,   14,   14,   76,   14,   17,   48,   14,   14,    1,
+              14,    0,    0,   30,   62,   30,   62,   14,   62,   14,
+              12,   62,   14,   12,    1,   92,   46,   30,    1,   30,
+              30,   14,   14,  124,   78,   92,   14,   76,   62,   30,
+               1,   48,   32,    0,    1,   45,   62,   13,   30,   14,
+              62,   16,   30,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 0, qp = 51 */
+
+              14,   63,   14,   17,   49,    1,  125,  125,  125,    1,
+               1,  125,   31,  125,  125,  125,   31,  110,  125,  125,
+             125,  125,  125,  125,  125,  125,  125,  125,  125,  125,
+             125,   14,   30,   30,   14,   17,   62,   30,    0,    1,
+              14,   14,   30,   30,   30,   14,    1,   14,   14,   17,
+               1,   46,   14,   49,   17,   14,   78,   62,   46,  110,
+              30,   30,   30,   14,    1,   14,   14,   17,    1,   46,
+              14,   49,   17,   14,   78,   62,   46,  110,  110,   47,
+              94,   17,   14,   14,   14,   30,   30,   62,   30,   62,
+              30,   78,   14,   17,   48,   14,   14,   78,   14,   17,
+              48,   14,   14,   78,   14,   17,   48,   14,   14,    1,
+              14,    0,    0,   30,   62,   30,   62,   14,   62,   14,
+              14,   62,   14,   14,    1,   94,   46,   30,    1,   30,
+              30,   14,   14,  124,   78,   94,   14,   78,   62,   30,
+               1,   48,   32,    0,    1,   47,   62,   15,   30,   14,
+              62,   16,   30,   30,
+        },
+
+    },
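+
+    /*
+     * The groups above (one table per qp value) hold pre-computed CABAC
+     * context states for init_idc = 0; the groups below repeat the same
+     * layout for init_idc = 1. Each 154-entry table is indexed by context
+     * id and selected by (init_idc, slice qp). As a sketch, such tables
+     * are typically generated offline from the standard HEVC context
+     * initialisation derivation (clause 9.3.2.2 of the spec); the packing
+     * of pStateIdx and valMps into a single entry shown on the last line
+     * is an assumption inferred from the observed value range (0..125),
+     * not something stated in this file:
+     *
+     *     slope     = (init_value >> 4) * 5 - 45;
+     *     offset    = ((init_value & 15) << 3) - 16;
+     *     pre_state = CLIP3(1, 126, ((slope * CLIP3(0, 51, qp)) >> 4) + offset);
+     *     mps       = (pre_state <= 63) ? 0 : 1;
+     *     state_idx = mps ? (pre_state - 64) : (63 - pre_state);
+     *     entry     = (state_idx << 1) | mps;   / * assumed packing; max 125 * /
+     */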
+
+    {
+        {
+            /* Context Tables for init_idc = 1, qp =  0 */
+
+              14,   14,   17,   17,   65,    1,   78,   14,   14,    1,
+               1,   78,    1,   17,    1,    1,    1,   30,   65,    1,
+              81,   81,   81,   81,   81,   14,   14,   33,   62,   30,
+              81,   33,    1,   65,   14,   81,   78,   17,   46,    1,
+              17,   17,   49,   65,   65,   65,   81,   81,   49,   81,
+              65,   65,   65,   81,   81,   81,   65,   33,   17,   33,
+              49,   65,   65,   65,   81,   81,   49,   81,   65,   65,
+              65,   81,   81,   81,   65,   33,   17,   33,   14,   33,
+              49,    1,   17,    1,   17,   14,   17,   17,   17,   81,
+              14,   62,   46,   33,   30,   14,    1,   62,   46,   33,
+              30,   14,    1,   62,   46,   33,   30,   14,    1,    1,
+              14,   17,   17,   17,   14,   17,   14,   46,   46,   46,
+              33,   46,   46,   33,    1,   94,   94,   46,    1,   30,
+              46,   62,   62,   62,   78,   30,   14,   14,   30,   14,
+              14,  124,   62,   46,    1,   46,   14,   62,   17,   46,
+              17,    1,   17,   46,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp =  1 */
+
+              14,   14,   15,   15,   63,    1,   78,   14,   14,    1,
+               1,   78,    1,   15,    1,    1,    1,   30,   63,    0,
+              77,   77,   77,   75,   75,   14,   14,   31,   62,   30,
+              77,   31,    0,   61,   14,   79,   78,   15,   46,    1,
+              15,   15,   47,   63,   61,   63,   77,   77,   47,   79,
+              63,   61,   63,   79,   79,   77,   61,   31,   15,   31,
+              47,   63,   61,   63,   77,   77,   47,   79,   63,   61,
+              63,   79,   79,   77,   61,   31,   15,   31,   16,   31,
+              45,    1,   17,    1,   15,   14,   15,   15,   15,   77,
+              14,   62,   46,   31,   32,   14,    1,   62,   46,   31,
+              32,   14,    1,   62,   46,   31,   32,   14,    1,    1,
+              14,   15,   15,   15,   16,   15,   16,   46,   46,   46,
+              31,   46,   46,   31,    1,   94,   94,   46,    1,   30,
+              46,   62,   62,   64,   78,   32,   14,   16,   32,   16,
+              14,  124,   62,   46,    1,   46,   16,   62,   15,   46,
+              13,    0,   15,   46,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp =  2 */
+
+              14,   12,   13,   15,   61,    1,   76,   12,   12,    1,
+               1,   78,    1,   15,    1,    1,    1,   30,   61,    2,
+              75,   73,   73,   71,   71,   14,   14,   31,   60,   30,
+              73,   29,    0,   59,   14,   77,   78,   13,   46,    1,
+              15,   15,   45,   61,   59,   61,   75,   73,   45,   77,
+              61,   57,   61,   77,   77,   75,   59,   29,   13,   29,
+              45,   61,   59,   61,   75,   73,   45,   77,   61,   57,
+              61,   77,   77,   75,   59,   29,   13,   29,   18,   31,
+              41,    1,   17,    1,   15,   14,   15,   13,   13,   73,
+              14,   62,   44,   31,   32,   14,    1,   62,   44,   31,
+              32,   14,    1,   62,   44,   31,   32,   14,    1,    1,
+              14,   13,   13,   13,   18,   13,   18,   46,   46,   44,
+              31,   46,   44,   31,    1,   92,   92,   46,    1,   30,
+              46,   60,   60,   64,   78,   32,   14,   18,   32,   16,
+              14,  124,   62,   46,    1,   46,   16,   60,   13,   46,
+              11,    2,   13,   46,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp =  3 */
+
+              14,   12,   11,   15,   61,    1,   74,   12,   10,    1,
+               1,   78,    1,   15,    1,    1,    1,   30,   59,    2,
+              73,   71,   69,   65,   65,   14,   14,   31,   58,   30,
+              71,   29,    0,   57,   14,   75,   78,   11,   46,    1,
+              15,   15,   45,   59,   57,   59,   73,   71,   45,   75,
+              59,   55,   59,   75,   75,   73,   57,   27,   13,   27,
+              45,   59,   57,   59,   73,   71,   45,   75,   59,   55,
+              59,   75,   75,   73,   57,   27,   13,   27,   18,   31,
+              37,    1,   17,    1,   15,   14,   15,   13,   13,   69,
+              14,   62,   44,   31,   32,   14,    1,   62,   44,   31,
+              32,   14,    1,   62,   44,   31,   32,   14,    1,    1,
+              14,   13,   13,   11,   18,   11,   18,   46,   46,   44,
+              31,   46,   44,   31,    1,   90,   90,   46,    1,   30,
+              46,   60,   60,   64,   78,   32,   14,   18,   32,   16,
+              14,  122,   62,   46,    1,   46,   16,   60,   11,   46,
+               9,    2,   11,   46,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp =  4 */
+
+              14,   10,    9,   13,   59,    1,   72,   10,    8,    1,
+               1,   78,    1,   13,    1,    1,    1,   30,   57,    4,
+              71,   67,   65,   61,   61,   14,   14,   29,   56,   28,
+              67,   27,    2,   55,   14,   73,   78,    9,   44,    1,
+              13,   13,   43,   57,   55,   57,   71,   67,   43,   73,
+              57,   51,   57,   73,   73,   71,   55,   25,   11,   25,
+              43,   57,   55,   57,   71,   67,   43,   73,   57,   51,
+              57,   73,   73,   71,   55,   25,   11,   25,   20,   29,
+              33,    1,   17,    1,   13,   14,   13,   11,   11,   65,
+              14,   60,   42,   29,   34,   14,    1,   60,   42,   29,
+              34,   14,    1,   60,   42,   29,   34,   14,    1,    3,
+              14,   11,   11,    9,   20,    9,   20,   44,   46,   42,
+              29,   46,   42,   29,    1,   88,   88,   44,    1,   30,
+              44,   58,   58,   66,   78,   34,   14,   20,   34,   18,
+              12,  120,   60,   44,    1,   44,   18,   58,    9,   44,
+               7,    4,    9,   44,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp =  5 */
+
+              14,    8,    7,   13,   57,    1,   70,    8,    6,    1,
+               1,   78,    1,   13,    1,    1,    1,   30,   55,    6,
+              67,   65,   61,   55,   55,   14,   14,   29,   54,   28,
+              65,   25,    2,   51,   14,   71,   78,    7,   44,    1,
+              13,   13,   41,   55,   51,   55,   67,   65,   41,   71,
+              55,   49,   55,   71,   71,   67,   51,   23,    9,   23,
+              41,   55,   51,   55,   67,   65,   41,   71,   55,   49,
+              55,   71,   71,   67,   51,   23,    9,   23,   22,   29,
+              29,    1,   17,    1,   13,   14,   13,    9,    9,   61,
+              14,   60,   40,   29,   34,   14,    1,   60,   40,   29,
+              34,   14,    1,   60,   40,   29,   34,   14,    1,    3,
+              14,    9,    9,    7,   22,    7,   22,   44,   46,   40,
+              29,   46,   40,   29,    1,   86,   86,   44,    1,   30,
+              44,   56,   56,   66,   78,   34,   14,   22,   34,   18,
+              12,  118,   60,   44,    1,   44,   18,   56,    7,   44,
+               3,    6,    7,   44,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp =  6 */
+
+              14,    8,    5,   13,   57,    1,   68,    8,    4,    1,
+               1,   78,    1,   13,    1,    1,    1,   30,   53,    6,
+              65,   61,   57,   51,   51,   14,   14,   29,   52,   28,
+              61,   25,    2,   49,   14,   69,   78,    5,   44,    1,
+              13,   13,   41,   53,   49,   53,   65,   61,   41,   69,
+              53,   45,   53,   69,   69,   65,   49,   21,    9,   21,
+              41,   53,   49,   53,   65,   61,   41,   69,   53,   45,
+              53,   69,   69,   65,   49,   21,    9,   21,   22,   29,
+              25,    1,   17,    1,   13,   14,   13,    9,    9,   57,
+              14,   60,   40,   29,   34,   14,    1,   60,   40,   29,
+              34,   14,    1,   60,   40,   29,   34,   14,    1,    3,
+              14,    9,    9,    5,   22,    5,   22,   44,   46,   40,
+              29,   46,   40,   29,    1,   84,   84,   44,    1,   30,
+              44,   56,   56,   66,   78,   34,   14,   22,   34,   18,
+              12,  116,   60,   44,    1,   44,   18,   56,    5,   44,
+               1,    6,    5,   44,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp =  7 */
+
+              14,    6,    3,   11,   55,    1,   66,    6,    2,    1,
+               1,   78,    1,   11,    1,    1,    1,   30,   51,    8,
+              63,   59,   53,   45,   45,   14,   14,   27,   50,   26,
+              59,   23,    4,   47,   14,   67,   78,    3,   42,    1,
+              11,   11,   39,   51,   47,   51,   63,   59,   39,   67,
+              51,   43,   51,   67,   67,   63,   47,   19,    7,   19,
+              39,   51,   47,   51,   63,   59,   39,   67,   51,   43,
+              51,   67,   67,   63,   47,   19,    7,   19,   24,   27,
+              21,    1,   17,    1,   11,   14,   11,    7,    7,   53,
+              14,   58,   38,   27,   36,   14,    1,   58,   38,   27,
+              36,   14,    1,   58,   38,   27,   36,   14,    1,    5,
+              14,    7,    7,    3,   24,    3,   24,   42,   46,   38,
+              27,   46,   38,   27,    1,   82,   82,   42,    1,   30,
+              42,   54,   54,   68,   78,   36,   14,   24,   36,   20,
+              10,  114,   58,   42,    1,   42,   20,   54,    3,   42,
+               0,    8,    3,   42,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp =  8 */
+
+              14,    4,    1,   11,   55,    1,   64,    4,    0,    1,
+               1,   78,    1,   11,    1,    1,    1,   30,   49,    8,
+              61,   55,   51,   41,   41,   14,   14,   27,   48,   26,
+              55,   23,    4,   45,   14,   65,   78,    1,   42,    1,
+              11,   11,   39,   49,   45,   49,   61,   55,   39,   65,
+              49,   39,   49,   65,   65,   61,   45,   17,    7,   17,
+              39,   49,   45,   49,   61,   55,   39,   65,   49,   39,
+              49,   65,   65,   61,   45,   17,    7,   17,   24,   27,
+              19,    1,   17,    1,   11,   14,   11,    7,    7,   51,
+              14,   58,   36,   27,   36,   14,    1,   58,   36,   27,
+              36,   14,    1,   58,   36,   27,   36,   14,    1,    5,
+              14,    7,    7,    1,   24,    1,   24,   42,   46,   36,
+              27,   46,   36,   27,    1,   80,   80,   42,    1,   30,
+              42,   52,   52,   68,   78,   36,   14,   24,   36,   20,
+              10,  112,   58,   42,    1,   42,   20,   52,    1,   42,
+               2,    8,    1,   42,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp =  9 */
+
+              14,    4,    0,   11,   53,    1,   62,    4,    1,    1,
+               1,   78,    1,   11,    1,    1,    1,   30,   47,   10,
+              57,   51,   47,   35,   35,   14,   14,   27,   46,   26,
+              51,   21,    4,   41,   14,   63,   78,    0,   42,    1,
+              11,   11,   37,   47,   41,   47,   57,   51,   37,   63,
+              47,   35,   47,   63,   63,   57,   41,   15,    5,   15,
+              37,   47,   41,   47,   57,   51,   37,   63,   47,   35,
+              47,   63,   63,   57,   41,   15,    5,   15,   26,   27,
+              15,    1,   17,    1,   11,   14,   11,    5,    5,   47,
+              14,   58,   36,   27,   36,   14,    1,   58,   36,   27,
+              36,   14,    1,   58,   36,   27,   36,   14,    1,    5,
+              14,    5,    5,    0,   26,    0,   26,   42,   46,   36,
+              27,   46,   36,   27,    1,   78,   78,   42,    1,   30,
+              42,   52,   52,   68,   78,   36,   14,   26,   36,   20,
+              10,  110,   58,   42,    1,   42,   20,   52,    0,   42,
+               6,   10,    0,   42,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 10 */
+
+              14,    2,    2,    9,   51,    1,   60,    2,    3,    1,
+               1,   78,    1,    9,    1,    1,    1,   30,   45,   12,
+              55,   49,   43,   31,   31,   14,   14,   25,   44,   24,
+              49,   19,    6,   39,   14,   61,   78,    2,   40,    1,
+               9,    9,   35,   45,   39,   45,   55,   49,   35,   61,
+              45,   33,   45,   61,   61,   55,   39,   13,    3,   13,
+              35,   45,   39,   45,   55,   49,   35,   61,   45,   33,
+              45,   61,   61,   55,   39,   13,    3,   13,   28,   25,
+              11,    1,   17,    1,    9,   14,    9,    3,    3,   43,
+              14,   56,   34,   25,   38,   14,    1,   56,   34,   25,
+              38,   14,    1,   56,   34,   25,   38,   14,    1,    7,
+              14,    3,    3,    2,   28,    2,   28,   40,   46,   34,
+              25,   46,   34,   25,    1,   76,   76,   40,    1,   30,
+              40,   50,   50,   70,   78,   38,   14,   28,   38,   22,
+               8,  108,   56,   40,    1,   40,   22,   50,    2,   40,
+               8,   12,    2,   40,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 11 */
+
+              14,    2,    4,    9,   51,    1,   58,    2,    5,    1,
+               1,   78,    1,    9,    1,    1,    1,   30,   43,   12,
+              53,   45,   39,   25,   25,   14,   14,   25,   42,   24,
+              45,   19,    6,   37,   14,   59,   78,    4,   40,    1,
+               9,    9,   35,   43,   37,   43,   53,   45,   35,   59,
+              43,   29,   43,   59,   59,   53,   37,   11,    3,   11,
+              35,   43,   37,   43,   53,   45,   35,   59,   43,   29,
+              43,   59,   59,   53,   37,   11,    3,   11,   28,   25,
+               7,    1,   17,    1,    9,   14,    9,    3,    3,   39,
+              14,   56,   34,   25,   38,   14,    1,   56,   34,   25,
+              38,   14,    1,   56,   34,   25,   38,   14,    1,    7,
+              14,    3,    3,    4,   28,    4,   28,   40,   46,   34,
+              25,   46,   34,   25,    1,   74,   74,   40,    1,   30,
+              40,   50,   50,   70,   78,   38,   14,   28,   38,   22,
+               8,  106,   56,   40,    1,   40,   22,   50,    4,   40,
+              10,   12,    4,   40,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 12 */
+
+              14,    0,    6,    9,   49,    1,   56,    0,    7,    1,
+               1,   78,    1,    9,    1,    1,    1,   30,   41,   14,
+              51,   43,   35,   21,   21,   14,   14,   25,   40,   24,
+              43,   17,    6,   35,   14,   57,   78,    6,   40,    1,
+               9,    9,   33,   41,   35,   41,   51,   43,   33,   57,
+              41,   27,   41,   57,   57,   51,   35,    9,    1,    9,
+              33,   41,   35,   41,   51,   43,   33,   57,   41,   27,
+              41,   57,   57,   51,   35,    9,    1,    9,   30,   25,
+               3,    1,   17,    1,    9,   14,    9,    1,    1,   35,
+              14,   56,   32,   25,   38,   14,    1,   56,   32,   25,
+              38,   14,    1,   56,   32,   25,   38,   14,    1,    7,
+              14,    1,    1,    6,   30,    6,   30,   40,   46,   32,
+              25,   46,   32,   25,    1,   72,   72,   40,    1,   30,
+              40,   48,   48,   70,   78,   38,   14,   30,   38,   22,
+               8,  104,   56,   40,    1,   40,   22,   48,    6,   40,
+              12,   14,    6,   40,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 13 */
+
+              14,    1,    8,    7,   47,    1,   54,    1,    9,    1,
+               1,   78,    1,    7,    1,    1,    1,   30,   39,   16,
+              47,   39,   31,   15,   15,   14,   14,   23,   38,   22,
+              39,   15,    8,   31,   14,   55,   78,    8,   38,    1,
+               7,    7,   31,   39,   31,   39,   47,   39,   31,   55,
+              39,   23,   39,   55,   55,   47,   31,    7,    0,    7,
+              31,   39,   31,   39,   47,   39,   31,   55,   39,   23,
+              39,   55,   55,   47,   31,    7,    0,    7,   32,   23,
+               0,    1,   17,    1,    7,   14,    7,    0,    0,   31,
+              14,   54,   30,   23,   40,   14,    1,   54,   30,   23,
+              40,   14,    1,   54,   30,   23,   40,   14,    1,    9,
+              14,    0,    0,    8,   32,    8,   32,   38,   46,   30,
+              23,   46,   30,   23,    1,   70,   70,   38,    1,   30,
+              38,   46,   46,   72,   78,   40,   14,   32,   40,   24,
+               6,  102,   54,   38,    1,   38,   24,   46,    8,   38,
+              16,   16,    8,   38,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 14 */
+
+              14,    1,   10,    7,   47,    1,   52,    1,   11,    1,
+               1,   78,    1,    7,    1,    1,    1,   30,   37,   16,
+              45,   37,   27,   11,   11,   14,   14,   23,   36,   22,
+              37,   15,    8,   29,   14,   53,   78,   10,   38,    1,
+               7,    7,   31,   37,   29,   37,   45,   37,   31,   53,
+              37,   21,   37,   53,   53,   45,   29,    5,    0,    5,
+              31,   37,   29,   37,   45,   37,   31,   53,   37,   21,
+              37,   53,   53,   45,   29,    5,    0,    5,   32,   23,
+               4,    1,   17,    1,    7,   14,    7,    0,    0,   27,
+              14,   54,   30,   23,   40,   14,    1,   54,   30,   23,
+              40,   14,    1,   54,   30,   23,   40,   14,    1,    9,
+              14,    0,    0,   10,   32,   10,   32,   38,   46,   30,
+              23,   46,   30,   23,    1,   68,   68,   38,    1,   30,
+              38,   46,   46,   72,   78,   40,   14,   32,   40,   24,
+               6,  100,   54,   38,    1,   38,   24,   46,   10,   38,
+              18,   16,   10,   38,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 15 */
+
+              14,    3,   12,    7,   45,    1,   50,    3,   13,    1,
+               1,   78,    1,    7,    1,    1,    1,   30,   35,   18,
+              43,   33,   23,    5,    5,   14,   14,   23,   34,   22,
+              33,   13,    8,   27,   14,   51,   78,   12,   38,    1,
+               7,    7,   29,   35,   27,   35,   43,   33,   29,   51,
+              35,   17,   35,   51,   51,   43,   27,    3,    2,    3,
+              29,   35,   27,   35,   43,   33,   29,   51,   35,   17,
+              35,   51,   51,   43,   27,    3,    2,    3,   34,   23,
+               8,    1,   17,    1,    7,   14,    7,    2,    2,   23,
+              14,   54,   28,   23,   40,   14,    1,   54,   28,   23,
+              40,   14,    1,   54,   28,   23,   40,   14,    1,    9,
+              14,    2,    2,   12,   34,   12,   34,   38,   46,   28,
+              23,   46,   28,   23,    1,   66,   66,   38,    1,   30,
+              38,   44,   44,   72,   78,   40,   14,   34,   40,   24,
+               6,   98,   54,   38,    1,   38,   24,   44,   12,   38,
+              20,   18,   12,   38,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 16 */
+
+              14,    5,   12,    7,   45,    1,   48,    5,   15,    1,
+               1,   78,    1,    7,    1,    1,    1,   30,   35,   18,
+              41,   31,   21,    1,    1,   14,   14,   23,   32,   20,
+              31,   13,    8,   25,   14,   51,   78,   12,   36,    1,
+               7,    7,   29,   35,   25,   35,   41,   31,   29,   51,
+              35,   15,   35,   51,   51,   41,   25,    3,    2,    3,
+              29,   35,   25,   35,   41,   31,   29,   51,   35,   15,
+              35,   51,   51,   41,   25,    3,    2,    3,   34,   23,
+              10,    1,   17,    1,    7,   14,    7,    2,    2,   21,
+              14,   52,   26,   23,   40,   14,    1,   52,   26,   23,
+              40,   14,    1,   52,   26,   23,   40,   14,    1,   11,
+              14,    2,    2,   12,   34,   12,   34,   36,   46,   26,
+              23,   46,   26,   23,    1,   64,   64,   36,    1,   30,
+              36,   42,   42,   72,   78,   40,   14,   34,   40,   24,
+               4,   96,   52,   36,    1,   36,   24,   42,   12,   36,
+              22,   18,   12,   36,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 17 */
+
+              14,    5,   14,    5,   43,    1,   48,    5,   15,    1,
+               1,   78,    1,    5,    1,    1,    1,   30,   33,   20,
+              37,   27,   17,    4,    4,   14,   14,   21,   32,   20,
+              27,   11,   10,   21,   14,   49,   78,   14,   36,    1,
+               5,    5,   27,   33,   21,   33,   37,   27,   27,   49,
+              33,   11,   33,   49,   49,   37,   21,    1,    4,    1,
+              27,   33,   21,   33,   37,   27,   27,   49,   33,   11,
+              33,   49,   49,   37,   21,    1,    4,    1,   36,   21,
+              14,    1,   17,    1,    5,   14,    5,    4,    4,   17,
+              14,   52,   26,   21,   42,   14,    1,   52,   26,   21,
+              42,   14,    1,   52,   26,   21,   42,   14,    1,   11,
+              14,    4,    4,   14,   36,   14,   36,   36,   46,   26,
+              21,   46,   26,   21,    1,   64,   64,   36,    1,   30,
+              36,   42,   42,   74,   78,   42,   14,   36,   42,   26,
+               4,   96,   52,   36,    1,   36,   26,   42,   14,   36,
+              26,   20,   14,   36,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 18 */
+
+              14,    7,   16,    5,   41,    1,   46,    7,   17,    1,
+               1,   78,    1,    5,    1,    1,    1,   30,   31,   22,
+              35,   23,   13,    8,    8,   14,   14,   21,   30,   20,
+              23,    9,   10,   19,   14,   47,   78,   16,   36,    1,
+               5,    5,   25,   31,   19,   31,   35,   23,   25,   47,
+              31,    7,   31,   47,   47,   35,   19,    0,    6,    0,
+              25,   31,   19,   31,   35,   23,   25,   47,   31,    7,
+              31,   47,   47,   35,   19,    0,    6,    0,   38,   21,
+              18,    1,   17,    1,    5,   14,    5,    6,    6,   13,
+              14,   52,   24,   21,   42,   14,    1,   52,   24,   21,
+              42,   14,    1,   52,   24,   21,   42,   14,    1,   11,
+              14,    6,    6,   16,   38,   16,   38,   36,   46,   24,
+              21,   46,   24,   21,    1,   62,   62,   36,    1,   30,
+              36,   40,   40,   74,   78,   42,   14,   38,   42,   26,
+               4,   94,   52,   36,    1,   36,   26,   40,   16,   36,
+              28,   22,   16,   36,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 19 */
+
+              14,    7,   18,    5,   41,    1,   44,    7,   19,    1,
+               1,   78,    1,    5,    1,    1,    1,   30,   29,   22,
+              33,   21,    9,   14,   14,   14,   14,   21,   28,   20,
+              21,    9,   10,   17,   14,   45,   78,   18,   36,    1,
+               5,    5,   25,   29,   17,   29,   33,   21,   25,   45,
+              29,    5,   29,   45,   45,   33,   17,    2,    6,    2,
+              25,   29,   17,   29,   33,   21,   25,   45,   29,    5,
+              29,   45,   45,   33,   17,    2,    6,    2,   38,   21,
+              22,    1,   17,    1,    5,   14,    5,    6,    6,    9,
+              14,   52,   24,   21,   42,   14,    1,   52,   24,   21,
+              42,   14,    1,   52,   24,   21,   42,   14,    1,   11,
+              14,    6,    6,   18,   38,   18,   38,   36,   46,   24,
+              21,   46,   24,   21,    1,   60,   60,   36,    1,   30,
+              36,   40,   40,   74,   78,   42,   14,   38,   42,   26,
+               4,   92,   52,   36,    1,   36,   26,   40,   18,   36,
+              30,   22,   18,   36,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 20 */
+
+              14,    9,   20,    3,   39,    1,   42,    9,   21,    1,
+               1,   78,    1,    3,    1,    1,    1,   30,   27,   24,
+              31,   17,    5,   18,   18,   14,   14,   19,   26,   18,
+              17,    7,   12,   15,   14,   43,   78,   20,   34,    1,
+               3,    3,   23,   27,   15,   27,   31,   17,   23,   43,
+              27,    1,   27,   43,   43,   31,   15,    4,    8,    4,
+              23,   27,   15,   27,   31,   17,   23,   43,   27,    1,
+              27,   43,   43,   31,   15,    4,    8,    4,   40,   19,
+              26,    1,   17,    1,    3,   14,    3,    8,    8,    5,
+              14,   50,   22,   19,   44,   14,    1,   50,   22,   19,
+              44,   14,    1,   50,   22,   19,   44,   14,    1,   13,
+              14,    8,    8,   20,   40,   20,   40,   34,   46,   22,
+              19,   46,   22,   19,    1,   58,   58,   34,    1,   30,
+              34,   38,   38,   76,   78,   44,   14,   40,   44,   28,
+               2,   90,   50,   34,    1,   34,   28,   38,   20,   34,
+              32,   24,   20,   34,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 21 */
+
+              14,   11,   22,    3,   37,    1,   40,   11,   23,    1,
+               1,   78,    1,    3,    1,    1,    1,   30,   25,   26,
+              27,   15,    1,   24,   24,   14,   14,   19,   24,   18,
+              15,    5,   12,   11,   14,   41,   78,   22,   34,    1,
+               3,    3,   21,   25,   11,   25,   27,   15,   21,   41,
+              25,    0,   25,   41,   41,   27,   11,    6,   10,    6,
+              21,   25,   11,   25,   27,   15,   21,   41,   25,    0,
+              25,   41,   41,   27,   11,    6,   10,    6,   42,   19,
+              30,    1,   17,    1,    3,   14,    3,   10,   10,    1,
+              14,   50,   20,   19,   44,   14,    1,   50,   20,   19,
+              44,   14,    1,   50,   20,   19,   44,   14,    1,   13,
+              14,   10,   10,   22,   42,   22,   42,   34,   46,   20,
+              19,   46,   20,   19,    1,   56,   56,   34,    1,   30,
+              34,   36,   36,   76,   78,   44,   14,   42,   44,   28,
+               2,   88,   50,   34,    1,   34,   28,   36,   22,   34,
+              36,   26,   22,   34,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 22 */
+
+              14,   11,   24,    3,   37,    1,   38,   11,   25,    1,
+               1,   78,    1,    3,    1,    1,    1,   30,   23,   26,
+              25,   11,    2,   28,   28,   14,   14,   19,   22,   18,
+              11,    5,   12,    9,   14,   39,   78,   24,   34,    1,
+               3,    3,   21,   23,    9,   23,   25,   11,   21,   39,
+              23,    4,   23,   39,   39,   25,    9,    8,   10,    8,
+              21,   23,    9,   23,   25,   11,   21,   39,   23,    4,
+              23,   39,   39,   25,    9,    8,   10,    8,   42,   19,
+              34,    1,   17,    1,    3,   14,    3,   10,   10,    2,
+              14,   50,   20,   19,   44,   14,    1,   50,   20,   19,
+              44,   14,    1,   50,   20,   19,   44,   14,    1,   13,
+              14,   10,   10,   24,   42,   24,   42,   34,   46,   20,
+              19,   46,   20,   19,    1,   54,   54,   34,    1,   30,
+              34,   36,   36,   76,   78,   44,   14,   42,   44,   28,
+               2,   86,   50,   34,    1,   34,   28,   36,   24,   34,
+              38,   26,   24,   34,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 23 */
+
+              14,   13,   26,    1,   35,    1,   36,   13,   27,    1,
+               1,   78,    1,    1,    1,    1,    1,   30,   21,   28,
+              23,    9,    6,   34,   34,   14,   14,   17,   20,   16,
+               9,    3,   14,    7,   14,   37,   78,   26,   32,    1,
+               1,    1,   19,   21,    7,   21,   23,    9,   19,   37,
+              21,    6,   21,   37,   37,   23,    7,   10,   12,   10,
+              19,   21,    7,   21,   23,    9,   19,   37,   21,    6,
+              21,   37,   37,   23,    7,   10,   12,   10,   44,   17,
+              38,    1,   17,    1,    1,   14,    1,   12,   12,    6,
+              14,   48,   18,   17,   46,   14,    1,   48,   18,   17,
+              46,   14,    1,   48,   18,   17,   46,   14,    1,   15,
+              14,   12,   12,   26,   44,   26,   44,   32,   46,   18,
+              17,   46,   18,   17,    1,   52,   52,   32,    1,   30,
+              32,   34,   34,   78,   78,   46,   14,   44,   46,   30,
+               0,   84,   48,   32,    1,   32,   30,   34,   26,   32,
+              40,   28,   26,   32,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 24 */
+
+              14,   15,   28,    1,   35,    1,   34,   15,   29,    1,
+               1,   78,    1,    1,    1,    1,    1,   30,   19,   28,
+              21,    5,    8,   38,   38,   14,   14,   17,   18,   16,
+               5,    3,   14,    5,   14,   35,   78,   28,   32,    1,
+               1,    1,   19,   19,    5,   19,   21,    5,   19,   35,
+              19,   10,   19,   35,   35,   21,    5,   12,   12,   12,
+              19,   19,    5,   19,   21,    5,   19,   35,   19,   10,
+              19,   35,   35,   21,    5,   12,   12,   12,   44,   17,
+              40,    1,   17,    1,    1,   14,    1,   12,   12,    8,
+              14,   48,   16,   17,   46,   14,    1,   48,   16,   17,
+              46,   14,    1,   48,   16,   17,   46,   14,    1,   15,
+              14,   12,   12,   28,   44,   28,   44,   32,   46,   16,
+              17,   46,   16,   17,    1,   50,   50,   32,    1,   30,
+              32,   32,   32,   78,   78,   46,   14,   44,   46,   30,
+               0,   82,   48,   32,    1,   32,   30,   32,   28,   32,
+              42,   28,   28,   32,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 25 */
+
+              14,   15,   30,    1,   33,    1,   32,   15,   31,    1,
+               1,   78,    1,    1,    1,    1,    1,   30,   17,   30,
+              17,    1,   12,   44,   44,   14,   14,   17,   16,   16,
+               1,    1,   14,    1,   14,   33,   78,   30,   32,    1,
+               1,    1,   17,   17,    1,   17,   17,    1,   17,   33,
+              17,   14,   17,   33,   33,   17,    1,   14,   14,   14,
+              17,   17,    1,   17,   17,    1,   17,   33,   17,   14,
+              17,   33,   33,   17,    1,   14,   14,   14,   46,   17,
+              44,    1,   17,    1,    1,   14,    1,   14,   14,   12,
+              14,   48,   16,   17,   46,   14,    1,   48,   16,   17,
+              46,   14,    1,   48,   16,   17,   46,   14,    1,   15,
+              14,   14,   14,   30,   46,   30,   46,   32,   46,   16,
+              17,   46,   16,   17,    1,   48,   48,   32,    1,   30,
+              32,   32,   32,   78,   78,   46,   14,   46,   46,   30,
+               0,   80,   48,   32,    1,   32,   30,   32,   30,   32,
+              46,   30,   30,   32,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 26 */
+
+              14,   17,   32,    0,   31,    1,   30,   17,   33,    1,
+               1,   78,    1,    0,    1,    1,    1,   30,   15,   32,
+              15,    0,   16,   48,   48,   14,   14,   15,   14,   14,
+               0,    0,   16,    0,   14,   31,   78,   32,   30,    1,
+               0,    0,   15,   15,    0,   15,   15,    0,   15,   31,
+              15,   16,   15,   31,   31,   15,    0,   16,   16,   16,
+              15,   15,    0,   15,   15,    0,   15,   31,   15,   16,
+              15,   31,   31,   15,    0,   16,   16,   16,   48,   15,
+              48,    1,   17,    1,    0,   14,    0,   16,   16,   16,
+              14,   46,   14,   15,   48,   14,    1,   46,   14,   15,
+              48,   14,    1,   46,   14,   15,   48,   14,    1,   17,
+              14,   16,   16,   32,   48,   32,   48,   30,   46,   14,
+              15,   46,   14,   15,    1,   46,   46,   30,    1,   30,
+              30,   30,   30,   80,   78,   48,   14,   48,   48,   32,
+               1,   78,   46,   30,    1,   30,   32,   30,   32,   30,
+              48,   32,   32,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 27 */
+
+              14,   17,   34,    0,   31,    1,   28,   17,   35,    1,
+               1,   78,    1,    0,    1,    1,    1,   30,   13,   32,
+              13,    4,   20,   54,   54,   14,   14,   15,   12,   14,
+               4,    0,   16,    2,   14,   29,   78,   34,   30,    1,
+               0,    0,   15,   13,    2,   13,   13,    4,   15,   29,
+              13,   20,   13,   29,   29,   13,    2,   18,   16,   18,
+              15,   13,    2,   13,   13,    4,   15,   29,   13,   20,
+              13,   29,   29,   13,    2,   18,   16,   18,   48,   15,
+              52,    1,   17,    1,    0,   14,    0,   16,   16,   20,
+              14,   46,   14,   15,   48,   14,    1,   46,   14,   15,
+              48,   14,    1,   46,   14,   15,   48,   14,    1,   17,
+              14,   16,   16,   34,   48,   34,   48,   30,   46,   14,
+              15,   46,   14,   15,    1,   44,   44,   30,    1,   30,
+              30,   30,   30,   80,   78,   48,   14,   48,   48,   32,
+               1,   76,   46,   30,    1,   30,   32,   30,   34,   30,
+              50,   32,   34,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 28 */
+
+              14,   19,   36,    0,   29,    1,   26,   19,   37,    1,
+               1,   78,    1,    0,    1,    1,    1,   30,   11,   34,
+              11,    6,   24,   58,   58,   14,   14,   15,   10,   14,
+               6,    2,   16,    4,   14,   27,   78,   36,   30,    1,
+               0,    0,   13,   11,    4,   11,   11,    6,   13,   27,
+              11,   22,   11,   27,   27,   11,    4,   20,   18,   20,
+              13,   11,    4,   11,   11,    6,   13,   27,   11,   22,
+              11,   27,   27,   11,    4,   20,   18,   20,   50,   15,
+              56,    1,   17,    1,    0,   14,    0,   18,   18,   24,
+              14,   46,   12,   15,   48,   14,    1,   46,   12,   15,
+              48,   14,    1,   46,   12,   15,   48,   14,    1,   17,
+              14,   18,   18,   36,   50,   36,   50,   30,   46,   12,
+              15,   46,   12,   15,    1,   42,   42,   30,    1,   30,
+              30,   28,   28,   80,   78,   48,   14,   50,   48,   32,
+               1,   74,   46,   30,    1,   30,   32,   28,   36,   30,
+              52,   34,   36,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 29 */
+
+              14,   21,   38,    2,   27,    1,   24,   21,   39,    1,
+               1,   78,    1,    2,    1,    1,    1,   30,    9,   36,
+               7,   10,   28,   64,   64,   14,   14,   13,    8,   12,
+              10,    4,   18,    8,   14,   25,   78,   38,   28,    1,
+               2,    2,   11,    9,    8,    9,    7,   10,   11,   25,
+               9,   26,    9,   25,   25,    7,    8,   22,   20,   22,
+              11,    9,    8,    9,    7,   10,   11,   25,    9,   26,
+               9,   25,   25,    7,    8,   22,   20,   22,   52,   13,
+              60,    1,   17,    1,    2,   14,    2,   20,   20,   28,
+              14,   44,   10,   13,   50,   14,    1,   44,   10,   13,
+              50,   14,    1,   44,   10,   13,   50,   14,    1,   19,
+              14,   20,   20,   38,   52,   38,   52,   28,   46,   10,
+              13,   46,   10,   13,    1,   40,   40,   28,    1,   30,
+              28,   26,   26,   82,   78,   50,   14,   52,   50,   34,
+               3,   72,   44,   28,    1,   28,   34,   26,   38,   28,
+              56,   36,   38,   28,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 30 */
+
+              14,   21,   40,    2,   27,    1,   22,   21,   41,    1,
+               1,   78,    1,    2,    1,    1,    1,   30,    7,   36,
+               5,   12,   32,   68,   68,   14,   14,   13,    6,   12,
+              12,    4,   18,   10,   14,   23,   78,   40,   28,    1,
+               2,    2,   11,    7,   10,    7,    5,   12,   11,   23,
+               7,   28,    7,   23,   23,    5,   10,   24,   20,   24,
+              11,    7,   10,    7,    5,   12,   11,   23,    7,   28,
+               7,   23,   23,    5,   10,   24,   20,   24,   52,   13,
+              64,    1,   17,    1,    2,   14,    2,   20,   20,   32,
+              14,   44,   10,   13,   50,   14,    1,   44,   10,   13,
+              50,   14,    1,   44,   10,   13,   50,   14,    1,   19,
+              14,   20,   20,   40,   52,   40,   52,   28,   46,   10,
+              13,   46,   10,   13,    1,   38,   38,   28,    1,   30,
+              28,   26,   26,   82,   78,   50,   14,   52,   50,   34,
+               3,   70,   44,   28,    1,   28,   34,   26,   40,   28,
+              58,   36,   40,   28,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 31 */
+
+              14,   23,   42,    2,   25,    1,   20,   23,   43,    1,
+               1,   78,    1,    2,    1,    1,    1,   30,    5,   38,
+               3,   16,   36,   74,   74,   14,   14,   13,    4,   12,
+              16,    6,   18,   12,   14,   21,   78,   42,   28,    1,
+               2,    2,    9,    5,   12,    5,    3,   16,    9,   21,
+               5,   32,    5,   21,   21,    3,   12,   26,   22,   26,
+               9,    5,   12,    5,    3,   16,    9,   21,    5,   32,
+               5,   21,   21,    3,   12,   26,   22,   26,   54,   13,
+              68,    1,   17,    1,    2,   14,    2,   22,   22,   36,
+              14,   44,    8,   13,   50,   14,    1,   44,    8,   13,
+              50,   14,    1,   44,    8,   13,   50,   14,    1,   19,
+              14,   22,   22,   42,   54,   42,   54,   28,   46,    8,
+              13,   46,    8,   13,    1,   36,   36,   28,    1,   30,
+              28,   24,   24,   82,   78,   50,   14,   54,   50,   34,
+               3,   68,   44,   28,    1,   28,   34,   24,   42,   28,
+              60,   38,   42,   28,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 32 */
+
+              14,   25,   42,    2,   25,    1,   18,   25,   45,    1,
+               1,   78,    1,    2,    1,    1,    1,   30,    5,   38,
+               1,   18,   38,   78,   78,   14,   14,   13,    2,   10,
+              18,    6,   18,   14,   14,   21,   78,   42,   26,    1,
+               2,    2,    9,    5,   14,    5,    1,   18,    9,   21,
+               5,   34,    5,   21,   21,    1,   14,   26,   22,   26,
+               9,    5,   14,    5,    1,   18,    9,   21,    5,   34,
+               5,   21,   21,    1,   14,   26,   22,   26,   54,   13,
+              70,    1,   17,    1,    2,   14,    2,   22,   22,   38,
+              14,   42,    6,   13,   50,   14,    1,   42,    6,   13,
+              50,   14,    1,   42,    6,   13,   50,   14,    1,   21,
+              14,   22,   22,   42,   54,   42,   54,   26,   46,    6,
+              13,   46,    6,   13,    1,   34,   34,   26,    1,   30,
+              26,   22,   22,   82,   78,   50,   14,   54,   50,   34,
+               5,   66,   42,   26,    1,   26,   34,   22,   42,   26,
+              62,   38,   42,   26,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 33 */
+
+              14,   25,   44,    4,   23,    1,   18,   25,   45,    1,
+               1,   78,    1,    4,    1,    1,    1,   30,    3,   40,
+               2,   22,   42,   84,   84,   14,   14,   11,    2,   10,
+              22,    8,   20,   18,   14,   19,   78,   44,   26,    1,
+               4,    4,    7,    3,   18,    3,    2,   22,    7,   19,
+               3,   38,    3,   19,   19,    2,   18,   28,   24,   28,
+               7,    3,   18,    3,    2,   22,    7,   19,    3,   38,
+               3,   19,   19,    2,   18,   28,   24,   28,   56,   11,
+              74,    1,   17,    1,    4,   14,    4,   24,   24,   42,
+              14,   42,    6,   11,   52,   14,    1,   42,    6,   11,
+              52,   14,    1,   42,    6,   11,   52,   14,    1,   21,
+              14,   24,   24,   44,   56,   44,   56,   26,   46,    6,
+              11,   46,    6,   11,    1,   34,   34,   26,    1,   30,
+              26,   22,   22,   84,   78,   52,   14,   56,   52,   36,
+               5,   66,   42,   26,    1,   26,   36,   22,   44,   26,
+              66,   40,   44,   26,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 34 */
+
+              14,   27,   46,    4,   21,    1,   16,   27,   47,    1,
+               1,   78,    1,    4,    1,    1,    1,   30,    1,   42,
+               4,   26,   46,   88,   88,   14,   14,   11,    0,   10,
+              26,   10,   20,   20,   14,   17,   78,   46,   26,    1,
+               4,    4,    5,    1,   20,    1,    4,   26,    5,   17,
+               1,   42,    1,   17,   17,    4,   20,   30,   26,   30,
+               5,    1,   20,    1,    4,   26,    5,   17,    1,   42,
+               1,   17,   17,    4,   20,   30,   26,   30,   58,   11,
+              78,    1,   17,    1,    4,   14,    4,   26,   26,   46,
+              14,   42,    4,   11,   52,   14,    1,   42,    4,   11,
+              52,   14,    1,   42,    4,   11,   52,   14,    1,   21,
+              14,   26,   26,   46,   58,   46,   58,   26,   46,    4,
+              11,   46,    4,   11,    1,   32,   32,   26,    1,   30,
+              26,   20,   20,   84,   78,   52,   14,   58,   52,   36,
+               5,   64,   42,   26,    1,   26,   36,   20,   46,   26,
+              68,   42,   46,   26,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 35 */
+
+              14,   27,   48,    4,   21,    1,   14,   27,   49,    1,
+               1,   78,    1,    4,    1,    1,    1,   30,    0,   42,
+               6,   28,   50,   94,   94,   14,   14,   11,    1,   10,
+              28,   10,   20,   22,   14,   15,   78,   48,   26,    1,
+               4,    4,    5,    0,   22,    0,    6,   28,    5,   15,
+               0,   44,    0,   15,   15,    6,   22,   32,   26,   32,
+               5,    0,   22,    0,    6,   28,    5,   15,    0,   44,
+               0,   15,   15,    6,   22,   32,   26,   32,   58,   11,
+              82,    1,   17,    1,    4,   14,    4,   26,   26,   50,
+              14,   42,    4,   11,   52,   14,    1,   42,    4,   11,
+              52,   14,    1,   42,    4,   11,   52,   14,    1,   21,
+              14,   26,   26,   48,   58,   48,   58,   26,   46,    4,
+              11,   46,    4,   11,    1,   30,   30,   26,    1,   30,
+              26,   20,   20,   84,   78,   52,   14,   58,   52,   36,
+               5,   62,   42,   26,    1,   26,   36,   20,   48,   26,
+              70,   42,   48,   26,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 36 */
+
+              14,   29,   50,    6,   19,    1,   12,   29,   51,    1,
+               1,   78,    1,    6,    1,    1,    1,   30,    2,   44,
+               8,   32,   54,   98,   98,   14,   14,    9,    3,    8,
+              32,   12,   22,   24,   14,   13,   78,   50,   24,    1,
+               6,    6,    3,    2,   24,    2,    8,   32,    3,   13,
+               2,   48,    2,   13,   13,    8,   24,   34,   28,   34,
+               3,    2,   24,    2,    8,   32,    3,   13,    2,   48,
+               2,   13,   13,    8,   24,   34,   28,   34,   60,    9,
+              86,    1,   17,    1,    6,   14,    6,   28,   28,   54,
+              14,   40,    2,    9,   54,   14,    1,   40,    2,    9,
+              54,   14,    1,   40,    2,    9,   54,   14,    1,   23,
+              14,   28,   28,   50,   60,   50,   60,   24,   46,    2,
+               9,   46,    2,    9,    1,   28,   28,   24,    1,   30,
+              24,   18,   18,   86,   78,   54,   14,   60,   54,   38,
+               7,   60,   40,   24,    1,   24,   38,   18,   50,   24,
+              72,   44,   50,   24,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 37 */
+
+              14,   31,   52,    6,   17,    1,   10,   31,   53,    1,
+               1,   78,    1,    6,    1,    1,    1,   30,    4,   46,
+              12,   34,   58,  104,  104,   14,   14,    9,    5,    8,
+              34,   14,   22,   28,   14,   11,   78,   52,   24,    1,
+               6,    6,    1,    4,   28,    4,   12,   34,    1,   11,
+               4,   50,    4,   11,   11,   12,   28,   36,   30,   36,
+               1,    4,   28,    4,   12,   34,    1,   11,    4,   50,
+               4,   11,   11,   12,   28,   36,   30,   36,   62,    9,
+              90,    1,   17,    1,    6,   14,    6,   30,   30,   58,
+              14,   40,    0,    9,   54,   14,    1,   40,    0,    9,
+              54,   14,    1,   40,    0,    9,   54,   14,    1,   23,
+              14,   30,   30,   52,   62,   52,   62,   24,   46,    0,
+               9,   46,    0,    9,    1,   26,   26,   24,    1,   30,
+              24,   16,   16,   86,   78,   54,   14,   62,   54,   38,
+               7,   58,   40,   24,    1,   24,   38,   16,   52,   24,
+              76,   46,   52,   24,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 38 */
+
+              14,   31,   54,    6,   17,    1,    8,   31,   55,    1,
+               1,   78,    1,    6,    1,    1,    1,   30,    6,   46,
+              14,   38,   62,  108,  108,   14,   14,    9,    7,    8,
+              38,   14,   22,   30,   14,    9,   78,   54,   24,    1,
+               6,    6,    1,    6,   30,    6,   14,   38,    1,    9,
+               6,   54,    6,    9,    9,   14,   30,   38,   30,   38,
+               1,    6,   30,    6,   14,   38,    1,    9,    6,   54,
+               6,    9,    9,   14,   30,   38,   30,   38,   62,    9,
+              94,    1,   17,    1,    6,   14,    6,   30,   30,   62,
+              14,   40,    0,    9,   54,   14,    1,   40,    0,    9,
+              54,   14,    1,   40,    0,    9,   54,   14,    1,   23,
+              14,   30,   30,   54,   62,   54,   62,   24,   46,    0,
+               9,   46,    0,    9,    1,   24,   24,   24,    1,   30,
+              24,   16,   16,   86,   78,   54,   14,   62,   54,   38,
+               7,   56,   40,   24,    1,   24,   38,   16,   54,   24,
+              78,   46,   54,   24,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 39 */
+
+              14,   33,   56,    8,   15,    1,    6,   33,   57,    1,
+               1,   78,    1,    8,    1,    1,    1,   30,    8,   48,
+              16,   40,   66,  114,  114,   14,   14,    7,    9,    6,
+              40,   16,   24,   32,   14,    7,   78,   56,   22,    1,
+               8,    8,    0,    8,   32,    8,   16,   40,    0,    7,
+               8,   56,    8,    7,    7,   16,   32,   40,   32,   40,
+               0,    8,   32,    8,   16,   40,    0,    7,    8,   56,
+               8,    7,    7,   16,   32,   40,   32,   40,   64,    7,
+              98,    1,   17,    1,    8,   14,    8,   32,   32,   66,
+              14,   38,    1,    7,   56,   14,    1,   38,    1,    7,
+              56,   14,    1,   38,    1,    7,   56,   14,    1,   25,
+              14,   32,   32,   56,   64,   56,   64,   22,   46,    1,
+               7,   46,    1,    7,    1,   22,   22,   22,    1,   30,
+              22,   14,   14,   88,   78,   56,   14,   64,   56,   40,
+               9,   54,   38,   22,    1,   22,   40,   14,   56,   22,
+              80,   48,   56,   22,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 40 */
+
+              14,   35,   58,    8,   15,    1,    4,   35,   59,    1,
+               1,   78,    1,    8,    1,    1,    1,   30,   10,   48,
+              18,   44,   68,  118,  118,   14,   14,    7,   11,    6,
+              44,   16,   24,   34,   14,    5,   78,   58,   22,    1,
+               8,    8,    0,   10,   34,   10,   18,   44,    0,    5,
+              10,   60,   10,    5,    5,   18,   34,   42,   32,   42,
+               0,   10,   34,   10,   18,   44,    0,    5,   10,   60,
+              10,    5,    5,   18,   34,   42,   32,   42,   64,    7,
+             100,    1,   17,    1,    8,   14,    8,   32,   32,   68,
+              14,   38,    3,    7,   56,   14,    1,   38,    3,    7,
+              56,   14,    1,   38,    3,    7,   56,   14,    1,   25,
+              14,   32,   32,   58,   64,   58,   64,   22,   46,    3,
+               7,   46,    3,    7,    1,   20,   20,   22,    1,   30,
+              22,   12,   12,   88,   78,   56,   14,   64,   56,   40,
+               9,   52,   38,   22,    1,   22,   40,   12,   58,   22,
+              82,   48,   58,   22,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 41 */
+
+              14,   35,   60,    8,   13,    1,    2,   35,   61,    1,
+               1,   78,    1,    8,    1,    1,    1,   30,   12,   50,
+              22,   48,   72,  124,  124,   14,   14,    7,   13,    6,
+              48,   18,   24,   38,   14,    3,   78,   60,   22,    1,
+               8,    8,    2,   12,   38,   12,   22,   48,    2,    3,
+              12,   64,   12,    3,    3,   22,   38,   44,   34,   44,
+               2,   12,   38,   12,   22,   48,    2,    3,   12,   64,
+              12,    3,    3,   22,   38,   44,   34,   44,   66,    7,
+             104,    1,   17,    1,    8,   14,    8,   34,   34,   72,
+              14,   38,    3,    7,   56,   14,    1,   38,    3,    7,
+              56,   14,    1,   38,    3,    7,   56,   14,    1,   25,
+              14,   34,   34,   60,   66,   60,   66,   22,   46,    3,
+               7,   46,    3,    7,    1,   18,   18,   22,    1,   30,
+              22,   12,   12,   88,   78,   56,   14,   66,   56,   40,
+               9,   50,   38,   22,    1,   22,   40,   12,   60,   22,
+              86,   50,   60,   22,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 42 */
+
+              14,   37,   62,   10,   11,    1,    0,   37,   63,    1,
+               1,   78,    1,   10,    1,    1,    1,   30,   14,   52,
+              24,   50,   76,  124,  124,   14,   14,    5,   15,    4,
+              50,   20,   26,   40,   14,    1,   78,   62,   20,    1,
+              10,   10,    4,   14,   40,   14,   24,   50,    4,    1,
+              14,   66,   14,    1,    1,   24,   40,   46,   36,   46,
+               4,   14,   40,   14,   24,   50,    4,    1,   14,   66,
+              14,    1,    1,   24,   40,   46,   36,   46,   68,    5,
+             108,    1,   17,    1,   10,   14,   10,   36,   36,   76,
+              14,   36,    5,    5,   58,   14,    1,   36,    5,    5,
+              58,   14,    1,   36,    5,    5,   58,   14,    1,   27,
+              14,   36,   36,   62,   68,   62,   68,   20,   46,    5,
+               5,   46,    5,    5,    1,   16,   16,   20,    1,   30,
+              20,   10,   10,   90,   78,   58,   14,   68,   58,   42,
+              11,   48,   36,   20,    1,   20,   42,   10,   62,   20,
+              88,   52,   62,   20,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 43 */
+
+              14,   37,   64,   10,   11,    1,    1,   37,   65,    1,
+               1,   78,    1,   10,    1,    1,    1,   30,   16,   52,
+              26,   54,   80,  124,  124,   14,   14,    5,   17,    4,
+              54,   20,   26,   42,   14,    0,   78,   64,   20,    1,
+              10,   10,    4,   16,   42,   16,   26,   54,    4,    0,
+              16,   70,   16,    0,    0,   26,   42,   48,   36,   48,
+               4,   16,   42,   16,   26,   54,    4,    0,   16,   70,
+              16,    0,    0,   26,   42,   48,   36,   48,   68,    5,
+             112,    1,   17,    1,   10,   14,   10,   36,   36,   80,
+              14,   36,    5,    5,   58,   14,    1,   36,    5,    5,
+              58,   14,    1,   36,    5,    5,   58,   14,    1,   27,
+              14,   36,   36,   64,   68,   64,   68,   20,   46,    5,
+               5,   46,    5,    5,    1,   14,   14,   20,    1,   30,
+              20,   10,   10,   90,   78,   58,   14,   68,   58,   42,
+              11,   46,   36,   20,    1,   20,   42,   10,   64,   20,
+              90,   52,   64,   20,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 44 */
+
+              14,   39,   66,   10,    9,    1,    3,   39,   67,    1,
+               1,   78,    1,   10,    1,    1,    1,   30,   18,   54,
+              28,   56,   84,  124,  124,   14,   14,    5,   19,    4,
+              56,   22,   26,   44,   14,    2,   78,   66,   20,    1,
+              10,   10,    6,   18,   44,   18,   28,   56,    6,    2,
+              18,   72,   18,    2,    2,   28,   44,   50,   38,   50,
+               6,   18,   44,   18,   28,   56,    6,    2,   18,   72,
+              18,    2,    2,   28,   44,   50,   38,   50,   70,    5,
+             116,    1,   17,    1,   10,   14,   10,   38,   38,   84,
+              14,   36,    7,    5,   58,   14,    1,   36,    7,    5,
+              58,   14,    1,   36,    7,    5,   58,   14,    1,   27,
+              14,   38,   38,   66,   70,   66,   70,   20,   46,    7,
+               5,   46,    7,    5,    1,   12,   12,   20,    1,   30,
+              20,    8,    8,   90,   78,   58,   14,   70,   58,   42,
+              11,   44,   36,   20,    1,   20,   42,    8,   66,   20,
+              92,   54,   66,   20,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 45 */
+
+              14,   41,   68,   12,    7,    1,    5,   41,   69,    1,
+               1,   78,    1,   12,    1,    1,    1,   30,   20,   56,
+              32,   60,   88,  124,  124,   14,   14,    3,   21,    2,
+              60,   24,   28,   48,   14,    4,   78,   68,   18,    1,
+              12,   12,    8,   20,   48,   20,   32,   60,    8,    4,
+              20,   76,   20,    4,    4,   32,   48,   52,   40,   52,
+               8,   20,   48,   20,   32,   60,    8,    4,   20,   76,
+              20,    4,    4,   32,   48,   52,   40,   52,   72,    3,
+             120,    1,   17,    1,   12,   14,   12,   40,   40,   88,
+              14,   34,    9,    3,   60,   14,    1,   34,    9,    3,
+              60,   14,    1,   34,    9,    3,   60,   14,    1,   29,
+              14,   40,   40,   68,   72,   68,   72,   18,   46,    9,
+               3,   46,    9,    3,    1,   10,   10,   18,    1,   30,
+              18,    6,    6,   92,   78,   60,   14,   72,   60,   44,
+              13,   42,   34,   18,    1,   18,   44,    6,   68,   18,
+              96,   56,   68,   18,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 46 */
+
+              14,   41,   70,   12,    7,    1,    7,   41,   71,    1,
+               1,   78,    1,   12,    1,    1,    1,   30,   22,   56,
+              34,   62,   92,  124,  124,   14,   14,    3,   23,    2,
+              62,   24,   28,   50,   14,    6,   78,   70,   18,    1,
+              12,   12,    8,   22,   50,   22,   34,   62,    8,    6,
+              22,   78,   22,    6,    6,   34,   50,   54,   40,   54,
+               8,   22,   50,   22,   34,   62,    8,    6,   22,   78,
+              22,    6,    6,   34,   50,   54,   40,   54,   72,    3,
+             124,    1,   17,    1,   12,   14,   12,   40,   40,   92,
+              14,   34,    9,    3,   60,   14,    1,   34,    9,    3,
+              60,   14,    1,   34,    9,    3,   60,   14,    1,   29,
+              14,   40,   40,   70,   72,   70,   72,   18,   46,    9,
+               3,   46,    9,    3,    1,    8,    8,   18,    1,   30,
+              18,    6,    6,   92,   78,   60,   14,   72,   60,   44,
+              13,   40,   34,   18,    1,   18,   44,    6,   70,   18,
+              98,   56,   70,   18,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 47 */
+
+              14,   43,   72,   12,    5,    1,    9,   43,   73,    1,
+               1,   78,    1,   12,    1,    1,    1,   30,   24,   58,
+              36,   66,   96,  124,  124,   14,   14,    3,   25,    2,
+              66,   26,   28,   52,   14,    8,   78,   72,   18,    1,
+              12,   12,   10,   24,   52,   24,   36,   66,   10,    8,
+              24,   82,   24,    8,    8,   36,   52,   56,   42,   56,
+              10,   24,   52,   24,   36,   66,   10,    8,   24,   82,
+              24,    8,    8,   36,   52,   56,   42,   56,   74,    3,
+             124,    1,   17,    1,   12,   14,   12,   42,   42,   96,
+              14,   34,   11,    3,   60,   14,    1,   34,   11,    3,
+              60,   14,    1,   34,   11,    3,   60,   14,    1,   29,
+              14,   42,   42,   72,   74,   72,   74,   18,   46,   11,
+               3,   46,   11,    3,    1,    6,    6,   18,    1,   30,
+              18,    4,    4,   92,   78,   60,   14,   74,   60,   44,
+              13,   38,   34,   18,    1,   18,   44,    4,   72,   18,
+             100,   58,   72,   18,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 48 */
+
+              14,   45,   72,   12,    5,    1,   11,   45,   75,    1,
+               1,   78,    1,   12,    1,    1,    1,   30,   24,   58,
+              38,   68,   98,  124,  124,   14,   14,    3,   27,    0,
+              68,   26,   28,   54,   14,    8,   78,   72,   16,    1,
+              12,   12,   10,   24,   54,   24,   38,   68,   10,    8,
+              24,   84,   24,    8,    8,   38,   54,   56,   42,   56,
+              10,   24,   54,   24,   38,   68,   10,    8,   24,   84,
+              24,    8,    8,   38,   54,   56,   42,   56,   74,    3,
+             124,    1,   17,    1,   12,   14,   12,   42,   42,   98,
+              14,   32,   13,    3,   60,   14,    1,   32,   13,    3,
+              60,   14,    1,   32,   13,    3,   60,   14,    1,   31,
+              14,   42,   42,   72,   74,   72,   74,   16,   46,   13,
+               3,   46,   13,    3,    1,    4,    4,   16,    1,   30,
+              16,    2,    2,   92,   78,   60,   14,   74,   60,   44,
+              15,   36,   32,   16,    1,   16,   44,    2,   72,   16,
+             102,   58,   72,   16,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 49 */
+
+              14,   45,   74,   14,    3,    1,   11,   45,   75,    1,
+               1,   78,    1,   14,    1,    1,    1,   30,   26,   60,
+              42,   72,  102,  124,  124,   14,   14,    1,   27,    0,
+              72,   28,   30,   58,   14,   10,   78,   74,   16,    1,
+              14,   14,   12,   26,   58,   26,   42,   72,   12,   10,
+              26,   88,   26,   10,   10,   42,   58,   58,   44,   58,
+              12,   26,   58,   26,   42,   72,   12,   10,   26,   88,
+              26,   10,   10,   42,   58,   58,   44,   58,   76,    1,
+             124,    1,   17,    1,   14,   14,   14,   44,   44,  102,
+              14,   32,   13,    1,   62,   14,    1,   32,   13,    1,
+              62,   14,    1,   32,   13,    1,   62,   14,    1,   31,
+              14,   44,   44,   74,   76,   74,   76,   16,   46,   13,
+               1,   46,   13,    1,    1,    4,    4,   16,    1,   30,
+              16,    2,    2,   94,   78,   62,   14,   76,   62,   46,
+              15,   36,   32,   16,    1,   16,   46,    2,   74,   16,
+             106,   60,   74,   16,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 50 */
+
+              14,   47,   76,   14,    1,    1,   13,   47,   77,    1,
+               1,   78,    1,   14,    1,    1,    1,   30,   28,   62,
+              44,   76,  106,  124,  124,   14,   14,    1,   29,    0,
+              76,   30,   30,   60,   14,   12,   78,   76,   16,    1,
+              14,   14,   14,   28,   60,   28,   44,   76,   14,   12,
+              28,   92,   28,   12,   12,   44,   60,   60,   46,   60,
+              14,   28,   60,   28,   44,   76,   14,   12,   28,   92,
+              28,   12,   12,   44,   60,   60,   46,   60,   78,    1,
+             124,    1,   17,    1,   14,   14,   14,   46,   46,  106,
+              14,   32,   15,    1,   62,   14,    1,   32,   15,    1,
+              62,   14,    1,   32,   15,    1,   62,   14,    1,   31,
+              14,   46,   46,   76,   78,   76,   78,   16,   46,   15,
+               1,   46,   15,    1,    1,    2,    2,   16,    1,   30,
+              16,    0,    0,   94,   78,   62,   14,   78,   62,   46,
+              15,   34,   32,   16,    1,   16,   46,    0,   76,   16,
+             108,   62,   76,   16,
+        },
+
+        {
+            /* Context Tables for init_idc = 1, qp = 51 */
+
+              14,   47,   78,   14,    1,    1,   15,   47,   79,    1,
+               1,   78,    1,   14,    1,    1,    1,   30,   30,   62,
+              46,   78,  110,  124,  124,   14,   14,    1,   31,    0,
+              78,   30,   30,   62,   14,   14,   78,   78,   16,    1,
+              14,   14,   14,   30,   62,   30,   46,   78,   14,   14,
+              30,   94,   30,   14,   14,   46,   62,   62,   46,   62,
+              14,   30,   62,   30,   46,   78,   14,   14,   30,   94,
+              30,   14,   14,   46,   62,   62,   46,   62,   78,    1,
+             124,    1,   17,    1,   14,   14,   14,   46,   46,  110,
+              14,   32,   15,    1,   62,   14,    1,   32,   15,    1,
+              62,   14,    1,   32,   15,    1,   62,   14,    1,   31,
+              14,   46,   46,   78,   78,   78,   78,   16,   46,   15,
+               1,   46,   15,    1,    1,    0,    0,   16,    1,   30,
+              16,    0,    0,   94,   78,   62,   14,   78,   62,   46,
+              15,   32,   32,   16,    1,   16,   46,    0,   78,   16,
+             110,   62,   78,   16,
+        },
+
+    },
+
+    {
+        {
+            /* Context Tables for init_idc = 2, qp =  0 */
+
+              14,  124,   17,   17,   65,    1,   78,   14,   14,    1,
+               1,   62,    1,   17,    1,    1,   46,   30,    1,   14,
+              81,   81,   81,   81,   81,   14,   14,   14,   62,   30,
+              81,  124,   46,    1,   14,   81,   78,   33,   46,    1,
+              17,   17,   49,   65,   33,   65,   81,   65,   49,   81,
+              81,   81,   49,   65,   81,   81,   81,   33,   17,   49,
+              49,   65,   33,   65,   81,   65,   49,   81,   81,   81,
+              49,   65,   81,   81,   81,   33,   17,   49,   14,   33,
+              49,    1,    1,    1,   17,   14,   17,   17,   17,   81,
+              33,   62,   46,   33,   30,   14,    1,   62,   46,   33,
+              30,   14,    1,   62,   46,   33,   30,   14,    1,    1,
+              14,    1,    1,    1,   14,    1,   14,   46,   46,   46,
+              33,   46,   46,   33,    1,   94,   46,   46,    1,   30,
+              46,   62,   62,   62,   78,   30,   14,   14,   30,    1,
+              14,  124,   62,   46,    1,   30,   46,   62,   17,   46,
+              17,   17,   17,   46,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp =  1 */
+
+              14,  124,   15,   15,   63,    1,   78,   14,   14,    1,
+               1,   64,    1,   15,    1,    1,   46,   30,    1,   16,
+              77,   77,   77,   75,   75,   14,   14,   14,   62,   30,
+              77,  124,   46,    0,   14,   79,   78,   29,   46,    1,
+              15,   15,   47,   63,   31,   63,   77,   61,   47,   79,
+              79,   77,   47,   63,   79,   79,   77,   31,   15,   45,
+              47,   63,   31,   63,   77,   61,   47,   79,   79,   77,
+              47,   63,   79,   79,   77,   31,   15,   45,   16,   31,
+              45,    1,    1,    1,   15,   14,   15,   15,   15,   77,
+              31,   62,   46,   31,   32,   14,    1,   62,   46,   31,
+              32,   14,    1,   62,   46,   31,   32,   14,    1,    1,
+              14,    0,    0,    0,   16,    0,   16,   46,   46,   46,
+              31,   46,   46,   31,    1,   94,   46,   46,    1,   30,
+              46,   62,   62,   64,   78,   32,   14,   16,   32,    0,
+              14,  124,   62,   46,    1,   30,   46,   62,   15,   46,
+              13,   15,   15,   46,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp =  2 */
+
+              14,  124,   13,   15,   61,    1,   76,   12,   12,    1,
+               1,   64,    1,   15,    1,    1,   44,   30,    1,   16,
+              75,   73,   73,   71,   71,   14,   14,   14,   60,   30,
+              73,  124,   46,    2,   14,   77,   78,   27,   46,    1,
+              15,   15,   45,   61,   29,   61,   75,   59,   45,   77,
+              77,   73,   45,   61,   77,   77,   73,   29,   13,   43,
+              45,   61,   29,   61,   75,   59,   45,   77,   77,   73,
+              45,   61,   77,   77,   73,   29,   13,   43,   18,   31,
+              41,    1,    1,    1,   15,   14,   15,   13,   13,   73,
+              29,   62,   44,   31,   32,   14,    1,   62,   44,   31,
+              32,   14,    1,   62,   44,   31,   32,   14,    1,    1,
+              14,    0,    0,    2,   18,    2,   18,   46,   46,   44,
+              31,   46,   44,   31,    1,   92,   46,   46,    1,   30,
+              46,   60,   60,   64,   78,   32,   14,   18,   32,    2,
+              14,  124,   62,   46,    1,   30,   46,   60,   13,   46,
+              11,   13,   13,   46,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp =  3 */
+
+              14,  124,   11,   15,   61,    1,   74,   12,   10,    1,
+               1,   64,    1,   15,    1,    1,   44,   30,    1,   16,
+              73,   71,   69,   65,   65,   14,   14,   14,   58,   30,
+              71,  124,   46,    2,   14,   75,   78,   25,   46,    1,
+              15,   15,   45,   59,   29,   59,   73,   57,   45,   75,
+              75,   71,   45,   61,   75,   75,   71,   27,   13,   41,
+              45,   59,   29,   59,   73,   57,   45,   75,   75,   71,
+              45,   61,   75,   75,   71,   27,   13,   41,   18,   31,
+              37,    1,    1,    1,   15,   14,   15,   13,   13,   69,
+              29,   62,   44,   31,   32,   14,    1,   62,   44,   31,
+              32,   14,    1,   62,   44,   31,   32,   14,    1,    1,
+              14,    0,    0,    2,   18,    2,   18,   46,   46,   44,
+              31,   46,   44,   31,    1,   90,   46,   46,    1,   30,
+              46,   60,   60,   64,   78,   32,   14,   18,   32,    2,
+              14,  124,   62,   46,    1,   30,   46,   60,   11,   46,
+               9,   11,   11,   46,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp =  4 */
+
+              14,  124,    9,   13,   59,    1,   72,   10,    8,    1,
+               1,   66,    1,   13,    1,    1,   42,   30,    1,   18,
+              71,   67,   65,   61,   61,   14,   14,   12,   56,   28,
+              67,  124,   44,    4,   14,   73,   78,   23,   44,    1,
+              13,   13,   43,   57,   27,   57,   71,   55,   43,   73,
+              73,   67,   43,   59,   73,   73,   67,   25,   11,   39,
+              43,   57,   27,   57,   71,   55,   43,   73,   73,   67,
+              43,   59,   73,   73,   67,   25,   11,   39,   20,   29,
+              33,    1,    3,    1,   13,   14,   13,   11,   11,   65,
+              27,   60,   42,   29,   34,   14,    1,   60,   42,   29,
+              34,   14,    1,   60,   42,   29,   34,   14,    1,    3,
+              14,    2,    2,    4,   20,    4,   20,   44,   46,   42,
+              29,   46,   42,   29,    1,   88,   44,   44,    1,   30,
+              44,   58,   58,   66,   78,   34,   14,   20,   34,    4,
+              12,  124,   60,   44,    1,   30,   44,   58,    9,   44,
+               7,    9,    9,   44,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp =  5 */
+
+              14,  124,    7,   13,   57,    1,   70,    8,    6,    1,
+               1,   66,    1,   13,    1,    1,   40,   30,    1,   18,
+              67,   65,   61,   55,   55,   14,   14,   12,   54,   28,
+              65,  124,   44,    6,   14,   71,   78,   19,   44,    1,
+              13,   13,   41,   55,   25,   55,   67,   51,   41,   71,
+              71,   65,   41,   57,   71,   71,   65,   23,    9,   35,
+              41,   55,   25,   55,   67,   51,   41,   71,   71,   65,
+              41,   57,   71,   71,   65,   23,    9,   35,   22,   29,
+              29,    1,    3,    1,   13,   14,   13,    9,    9,   61,
+              25,   60,   40,   29,   34,   14,    1,   60,   40,   29,
+              34,   14,    1,   60,   40,   29,   34,   14,    1,    3,
+              14,    2,    2,    6,   22,    6,   22,   44,   46,   40,
+              29,   46,   40,   29,    1,   86,   44,   44,    1,   30,
+              44,   56,   56,   66,   78,   34,   14,   22,   34,    6,
+              12,  124,   60,   44,    1,   30,   44,   56,    7,   44,
+               3,    7,    7,   44,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp =  6 */
+
+              14,  124,    5,   13,   57,    1,   68,    8,    4,    1,
+               1,   66,    1,   13,    1,    1,   40,   30,    1,   18,
+              65,   61,   57,   51,   51,   14,   14,   12,   52,   28,
+              61,  124,   44,    6,   14,   69,   78,   17,   44,    1,
+              13,   13,   41,   53,   25,   53,   65,   49,   41,   69,
+              69,   61,   41,   57,   69,   69,   61,   21,    9,   33,
+              41,   53,   25,   53,   65,   49,   41,   69,   69,   61,
+              41,   57,   69,   69,   61,   21,    9,   33,   22,   29,
+              25,    1,    3,    1,   13,   14,   13,    9,    9,   57,
+              25,   60,   40,   29,   34,   14,    1,   60,   40,   29,
+              34,   14,    1,   60,   40,   29,   34,   14,    1,    3,
+              14,    2,    2,    6,   22,    6,   22,   44,   46,   40,
+              29,   46,   40,   29,    1,   84,   44,   44,    1,   30,
+              44,   56,   56,   66,   78,   34,   14,   22,   34,    6,
+              12,  124,   60,   44,    1,   30,   44,   56,    5,   44,
+               1,    5,    5,   44,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp =  7 */
+
+              14,  124,    3,   11,   55,    1,   66,    6,    2,    1,
+               1,   68,    1,   11,    1,    1,   38,   30,    1,   20,
+              63,   59,   53,   45,   45,   14,   14,   10,   50,   26,
+              59,  124,   42,    8,   14,   67,   78,   15,   42,    1,
+              11,   11,   39,   51,   23,   51,   63,   47,   39,   67,
+              67,   59,   39,   55,   67,   67,   59,   19,    7,   31,
+              39,   51,   23,   51,   63,   47,   39,   67,   67,   59,
+              39,   55,   67,   67,   59,   19,    7,   31,   24,   27,
+              21,    1,    5,    1,   11,   14,   11,    7,    7,   53,
+              23,   58,   38,   27,   36,   14,    1,   58,   38,   27,
+              36,   14,    1,   58,   38,   27,   36,   14,    1,    5,
+              14,    4,    4,    8,   24,    8,   24,   42,   46,   38,
+              27,   46,   38,   27,    1,   82,   42,   42,    1,   30,
+              42,   54,   54,   68,   78,   36,   14,   24,   36,    8,
+              10,  124,   58,   42,    1,   30,   42,   54,    3,   42,
+               0,    3,    3,   42,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp =  8 */
+
+              14,  124,    1,   11,   55,    1,   64,    4,    0,    1,
+               1,   68,    1,   11,    1,    1,   36,   30,    1,   20,
+              61,   55,   51,   41,   41,   14,   14,   10,   48,   26,
+              55,  124,   42,    8,   14,   65,   78,   13,   42,    1,
+              11,   11,   39,   49,   23,   49,   61,   45,   39,   65,
+              65,   55,   39,   55,   65,   65,   55,   17,    7,   29,
+              39,   49,   23,   49,   61,   45,   39,   65,   65,   55,
+              39,   55,   65,   65,   55,   17,    7,   29,   24,   27,
+              19,    1,    5,    1,   11,   14,   11,    7,    7,   51,
+              23,   58,   36,   27,   36,   14,    1,   58,   36,   27,
+              36,   14,    1,   58,   36,   27,   36,   14,    1,    5,
+              14,    4,    4,    8,   24,    8,   24,   42,   46,   36,
+              27,   46,   36,   27,    1,   80,   42,   42,    1,   30,
+              42,   52,   52,   68,   78,   36,   14,   24,   36,    8,
+              10,  124,   58,   42,    1,   30,   42,   52,    1,   42,
+               2,    1,    1,   42,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp =  9 */
+
+              14,  124,    0,   11,   53,    1,   62,    4,    1,    1,
+               1,   68,    1,   11,    1,    1,   36,   30,    1,   20,
+              57,   51,   47,   35,   35,   14,   14,   10,   46,   26,
+              51,  124,   42,   10,   14,   63,   78,    9,   42,    1,
+              11,   11,   37,   47,   21,   47,   57,   41,   37,   63,
+              63,   51,   37,   53,   63,   63,   51,   15,    5,   25,
+              37,   47,   21,   47,   57,   41,   37,   63,   63,   51,
+              37,   53,   63,   63,   51,   15,    5,   25,   26,   27,
+              15,    1,    5,    1,   11,   14,   11,    5,    5,   47,
+              21,   58,   36,   27,   36,   14,    1,   58,   36,   27,
+              36,   14,    1,   58,   36,   27,   36,   14,    1,    5,
+              14,    4,    4,   10,   26,   10,   26,   42,   46,   36,
+              27,   46,   36,   27,    1,   78,   42,   42,    1,   30,
+              42,   52,   52,   68,   78,   36,   14,   26,   36,   10,
+              10,  124,   58,   42,    1,   30,   42,   52,    0,   42,
+               6,    0,    0,   42,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 10 */
+
+              14,  124,    2,    9,   51,    1,   60,    2,    3,    1,
+               1,   70,    1,    9,    1,    1,   34,   30,    1,   22,
+              55,   49,   43,   31,   31,   14,   14,    8,   44,   24,
+              49,  124,   40,   12,   14,   61,   78,    7,   40,    1,
+               9,    9,   35,   45,   19,   45,   55,   39,   35,   61,
+              61,   49,   35,   51,   61,   61,   49,   13,    3,   23,
+              35,   45,   19,   45,   55,   39,   35,   61,   61,   49,
+              35,   51,   61,   61,   49,   13,    3,   23,   28,   25,
+              11,    1,    7,    1,    9,   14,    9,    3,    3,   43,
+              19,   56,   34,   25,   38,   14,    1,   56,   34,   25,
+              38,   14,    1,   56,   34,   25,   38,   14,    1,    7,
+              14,    6,    6,   12,   28,   12,   28,   40,   46,   34,
+              25,   46,   34,   25,    1,   76,   40,   40,    1,   30,
+              40,   50,   50,   70,   78,   38,   14,   28,   38,   12,
+               8,  124,   56,   40,    1,   30,   40,   50,    2,   40,
+               8,    2,    2,   40,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 11 */
+
+              14,  124,    4,    9,   51,    1,   58,    2,    5,    1,
+               1,   70,    1,    9,    1,    1,   34,   30,    1,   22,
+              53,   45,   39,   25,   25,   14,   14,    8,   42,   24,
+              45,  124,   40,   12,   14,   59,   78,    5,   40,    1,
+               9,    9,   35,   43,   19,   43,   53,   37,   35,   59,
+              59,   45,   35,   51,   59,   59,   45,   11,    3,   21,
+              35,   43,   19,   43,   53,   37,   35,   59,   59,   45,
+              35,   51,   59,   59,   45,   11,    3,   21,   28,   25,
+               7,    1,    7,    1,    9,   14,    9,    3,    3,   39,
+              19,   56,   34,   25,   38,   14,    1,   56,   34,   25,
+              38,   14,    1,   56,   34,   25,   38,   14,    1,    7,
+              14,    6,    6,   12,   28,   12,   28,   40,   46,   34,
+              25,   46,   34,   25,    1,   74,   40,   40,    1,   30,
+              40,   50,   50,   70,   78,   38,   14,   28,   38,   12,
+               8,  124,   56,   40,    1,   30,   40,   50,    4,   40,
+              10,    4,    4,   40,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 12 */
+
+              14,  124,    6,    9,   49,    1,   56,    0,    7,    1,
+               1,   70,    1,    9,    1,    1,   32,   30,    1,   22,
+              51,   43,   35,   21,   21,   14,   14,    8,   40,   24,
+              43,  122,   40,   14,   14,   57,   78,    3,   40,    1,
+               9,    9,   33,   41,   17,   41,   51,   35,   33,   57,
+              57,   43,   33,   49,   57,   57,   43,    9,    1,   19,
+              33,   41,   17,   41,   51,   35,   33,   57,   57,   43,
+              33,   49,   57,   57,   43,    9,    1,   19,   30,   25,
+               3,    1,    7,    1,    9,   14,    9,    1,    1,   35,
+              17,   56,   32,   25,   38,   14,    1,   56,   32,   25,
+              38,   14,    1,   56,   32,   25,   38,   14,    1,    7,
+              14,    6,    6,   14,   30,   14,   30,   40,   46,   32,
+              25,   46,   32,   25,    1,   72,   40,   40,    1,   30,
+              40,   48,   48,   70,   78,   38,   14,   30,   38,   14,
+               8,  124,   56,   40,    1,   30,   40,   48,    6,   40,
+              12,    6,    6,   40,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 13 */
+
+              14,  124,    8,    7,   47,    1,   54,    1,    9,    1,
+               1,   72,    1,    7,    1,    1,   30,   30,    1,   24,
+              47,   39,   31,   15,   15,   14,   14,    6,   38,   22,
+              39,  118,   38,   16,   14,   55,   78,    0,   38,    1,
+               7,    7,   31,   39,   15,   39,   47,   31,   31,   55,
+              55,   39,   31,   47,   55,   55,   39,    7,    0,   15,
+              31,   39,   15,   39,   47,   31,   31,   55,   55,   39,
+              31,   47,   55,   55,   39,    7,    0,   15,   32,   23,
+               0,    1,    9,    1,    7,   14,    7,    0,    0,   31,
+              15,   54,   30,   23,   40,   14,    1,   54,   30,   23,
+              40,   14,    1,   54,   30,   23,   40,   14,    1,    9,
+              14,    8,    8,   16,   32,   16,   32,   38,   46,   30,
+              23,   46,   30,   23,    1,   70,   38,   38,    1,   30,
+              38,   46,   46,   72,   78,   40,   14,   32,   40,   16,
+               6,  124,   54,   38,    1,   30,   38,   46,    8,   38,
+              16,    8,    8,   38,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 14 */
+
+              14,  124,   10,    7,   47,    1,   52,    1,   11,    1,
+               1,   72,    1,    7,    1,    1,   30,   30,    1,   24,
+              45,   37,   27,   11,   11,   14,   14,    6,   36,   22,
+              37,  116,   38,   16,   14,   53,   78,    2,   38,    1,
+               7,    7,   31,   37,   15,   37,   45,   29,   31,   53,
+              53,   37,   31,   47,   53,   53,   37,    5,    0,   13,
+              31,   37,   15,   37,   45,   29,   31,   53,   53,   37,
+              31,   47,   53,   53,   37,    5,    0,   13,   32,   23,
+               4,    1,    9,    1,    7,   14,    7,    0,    0,   27,
+              15,   54,   30,   23,   40,   14,    1,   54,   30,   23,
+              40,   14,    1,   54,   30,   23,   40,   14,    1,    9,
+              14,    8,    8,   16,   32,   16,   32,   38,   46,   30,
+              23,   46,   30,   23,    1,   68,   38,   38,    1,   30,
+              38,   46,   46,   72,   78,   40,   14,   32,   40,   16,
+               6,  124,   54,   38,    1,   30,   38,   46,   10,   38,
+              18,   10,   10,   38,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 15 */
+
+              14,  124,   12,    7,   45,    1,   50,    3,   13,    1,
+               1,   72,    1,    7,    1,    1,   28,   30,    1,   24,
+              43,   33,   23,    5,    5,   14,   14,    6,   34,   22,
+              33,  112,   38,   18,   14,   51,   78,    4,   38,    1,
+               7,    7,   29,   35,   13,   35,   43,   27,   29,   51,
+              51,   33,   29,   45,   51,   51,   33,    3,    2,   11,
+              29,   35,   13,   35,   43,   27,   29,   51,   51,   33,
+              29,   45,   51,   51,   33,    3,    2,   11,   34,   23,
+               8,    1,    9,    1,    7,   14,    7,    2,    2,   23,
+              13,   54,   28,   23,   40,   14,    1,   54,   28,   23,
+              40,   14,    1,   54,   28,   23,   40,   14,    1,    9,
+              14,    8,    8,   18,   34,   18,   34,   38,   46,   28,
+              23,   46,   28,   23,    1,   66,   38,   38,    1,   30,
+              38,   44,   44,   72,   78,   40,   14,   34,   40,   18,
+               6,  122,   54,   38,    1,   30,   38,   44,   12,   38,
+              20,   12,   12,   38,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 16 */
+
+              14,  124,   12,    7,   45,    1,   48,    5,   15,    1,
+               1,   72,    1,    7,    1,    1,   26,   30,    1,   24,
+              41,   31,   21,    1,    1,   14,   14,    4,   32,   20,
+              31,  108,   36,   18,   14,   51,   78,    6,   36,    1,
+               7,    7,   29,   35,   13,   35,   41,   25,   29,   51,
+              51,   31,   29,   45,   51,   51,   31,    3,    2,    9,
+              29,   35,   13,   35,   41,   25,   29,   51,   51,   31,
+              29,   45,   51,   51,   31,    3,    2,    9,   34,   23,
+              10,    1,   11,    1,    7,   14,    7,    2,    2,   21,
+              13,   52,   26,   23,   40,   14,    1,   52,   26,   23,
+              40,   14,    1,   52,   26,   23,   40,   14,    1,   11,
+              14,    8,    8,   18,   34,   18,   34,   36,   46,   26,
+              23,   46,   26,   23,    1,   64,   36,   36,    1,   30,
+              36,   42,   42,   72,   78,   40,   14,   34,   40,   18,
+               4,  118,   52,   36,    1,   30,   36,   42,   12,   36,
+              22,   12,   12,   36,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 17 */
+
+              14,  124,   14,    5,   43,    1,   48,    5,   15,    1,
+               1,   74,    1,    5,    1,    1,   26,   30,    1,   26,
+              37,   27,   17,    4,    4,   14,   14,    4,   32,   20,
+              27,  106,   36,   20,   14,   49,   78,   10,   36,    1,
+               5,    5,   27,   33,   11,   33,   37,   21,   27,   49,
+              49,   27,   27,   43,   49,   49,   27,    1,    4,    5,
+              27,   33,   11,   33,   37,   21,   27,   49,   49,   27,
+              27,   43,   49,   49,   27,    1,    4,    5,   36,   21,
+              14,    1,   11,    1,    5,   14,    5,    4,    4,   17,
+              11,   52,   26,   21,   42,   14,    1,   52,   26,   21,
+              42,   14,    1,   52,   26,   21,   42,   14,    1,   11,
+              14,   10,   10,   20,   36,   20,   36,   36,   46,   26,
+              21,   46,   26,   21,    1,   64,   36,   36,    1,   30,
+              36,   42,   42,   74,   78,   42,   14,   36,   42,   20,
+               4,  116,   52,   36,    1,   30,   36,   42,   14,   36,
+              26,   14,   14,   36,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 18 */
+
+              14,  124,   16,    5,   41,    1,   46,    7,   17,    1,
+               1,   74,    1,    5,    1,    1,   24,   30,    1,   26,
+              35,   23,   13,    8,    8,   14,   14,    4,   30,   20,
+              23,  102,   36,   22,   14,   47,   78,   12,   36,    1,
+               5,    5,   25,   31,    9,   31,   35,   19,   25,   47,
+              47,   23,   25,   41,   47,   47,   23,    0,    6,    3,
+              25,   31,    9,   31,   35,   19,   25,   47,   47,   23,
+              25,   41,   47,   47,   23,    0,    6,    3,   38,   21,
+              18,    1,   11,    1,    5,   14,    5,    6,    6,   13,
+               9,   52,   24,   21,   42,   14,    1,   52,   24,   21,
+              42,   14,    1,   52,   24,   21,   42,   14,    1,   11,
+              14,   10,   10,   22,   38,   22,   38,   36,   46,   24,
+              21,   46,   24,   21,    1,   62,   36,   36,    1,   30,
+              36,   40,   40,   74,   78,   42,   14,   38,   42,   22,
+               4,  114,   52,   36,    1,   30,   36,   40,   16,   36,
+              28,   16,   16,   36,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 19 */
+
+              14,  124,   18,    5,   41,    1,   44,    7,   19,    1,
+               1,   74,    1,    5,    1,    1,   24,   30,    1,   26,
+              33,   21,    9,   14,   14,   14,   14,    4,   28,   20,
+              21,  100,   36,   22,   14,   45,   78,   14,   36,    1,
+               5,    5,   25,   29,    9,   29,   33,   17,   25,   45,
+              45,   21,   25,   41,   45,   45,   21,    2,    6,    1,
+              25,   29,    9,   29,   33,   17,   25,   45,   45,   21,
+              25,   41,   45,   45,   21,    2,    6,    1,   38,   21,
+              22,    1,   11,    1,    5,   14,    5,    6,    6,    9,
+               9,   52,   24,   21,   42,   14,    1,   52,   24,   21,
+              42,   14,    1,   52,   24,   21,   42,   14,    1,   11,
+              14,   10,   10,   22,   38,   22,   38,   36,   46,   24,
+              21,   46,   24,   21,    1,   60,   36,   36,    1,   30,
+              36,   40,   40,   74,   78,   42,   14,   38,   42,   22,
+               4,  112,   52,   36,    1,   30,   36,   40,   18,   36,
+              30,   18,   18,   36,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 20 */
+
+              14,  124,   20,    3,   39,    1,   42,    9,   21,    1,
+               1,   76,    1,    3,    1,    1,   22,   30,    1,   28,
+              31,   17,    5,   18,   18,   14,   14,    2,   26,   18,
+              17,   96,   34,   24,   14,   43,   78,   16,   34,    1,
+               3,    3,   23,   27,    7,   27,   31,   15,   23,   43,
+              43,   17,   23,   39,   43,   43,   17,    4,    8,    0,
+              23,   27,    7,   27,   31,   15,   23,   43,   43,   17,
+              23,   39,   43,   43,   17,    4,    8,    0,   40,   19,
+              26,    1,   13,    1,    3,   14,    3,    8,    8,    5,
+               7,   50,   22,   19,   44,   14,    1,   50,   22,   19,
+              44,   14,    1,   50,   22,   19,   44,   14,    1,   13,
+              14,   12,   12,   24,   40,   24,   40,   34,   46,   22,
+              19,   46,   22,   19,    1,   58,   34,   34,    1,   30,
+              34,   38,   38,   76,   78,   44,   14,   40,   44,   24,
+               2,  108,   50,   34,    1,   30,   34,   38,   20,   34,
+              32,   20,   20,   34,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 21 */
+
+              14,  124,   22,    3,   37,    1,   40,   11,   23,    1,
+               1,   76,    1,    3,    1,    1,   20,   30,    1,   28,
+              27,   15,    1,   24,   24,   14,   14,    2,   24,   18,
+              15,   94,   34,   26,   14,   41,   78,   20,   34,    1,
+               3,    3,   21,   25,    5,   25,   27,   11,   21,   41,
+              41,   15,   21,   37,   41,   41,   15,    6,   10,    4,
+              21,   25,    5,   25,   27,   11,   21,   41,   41,   15,
+              21,   37,   41,   41,   15,    6,   10,    4,   42,   19,
+              30,    1,   13,    1,    3,   14,    3,   10,   10,    1,
+               5,   50,   20,   19,   44,   14,    1,   50,   20,   19,
+              44,   14,    1,   50,   20,   19,   44,   14,    1,   13,
+              14,   12,   12,   26,   42,   26,   42,   34,   46,   20,
+              19,   46,   20,   19,    1,   56,   34,   34,    1,   30,
+              34,   36,   36,   76,   78,   44,   14,   42,   44,   26,
+               2,  106,   50,   34,    1,   30,   34,   36,   22,   34,
+              36,   22,   22,   34,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 22 */
+
+              14,  124,   24,    3,   37,    1,   38,   11,   25,    1,
+               1,   76,    1,    3,    1,    1,   20,   30,    1,   28,
+              25,   11,    2,   28,   28,   14,   14,    2,   22,   18,
+              11,   90,   34,   26,   14,   39,   78,   22,   34,    1,
+               3,    3,   21,   23,    5,   23,   25,    9,   21,   39,
+              39,   11,   21,   37,   39,   39,   11,    8,   10,    6,
+              21,   23,    5,   23,   25,    9,   21,   39,   39,   11,
+              21,   37,   39,   39,   11,    8,   10,    6,   42,   19,
+              34,    1,   13,    1,    3,   14,    3,   10,   10,    2,
+               5,   50,   20,   19,   44,   14,    1,   50,   20,   19,
+              44,   14,    1,   50,   20,   19,   44,   14,    1,   13,
+              14,   12,   12,   26,   42,   26,   42,   34,   46,   20,
+              19,   46,   20,   19,    1,   54,   34,   34,    1,   30,
+              34,   36,   36,   76,   78,   44,   14,   42,   44,   26,
+               2,  104,   50,   34,    1,   30,   34,   36,   24,   34,
+              38,   24,   24,   34,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 23 */
+
+              14,  124,   26,    1,   35,    1,   36,   13,   27,    1,
+               1,   78,    1,    1,    1,    1,   18,   30,    1,   30,
+              23,    9,    6,   34,   34,   14,   14,    0,   20,   16,
+               9,   88,   32,   28,   14,   37,   78,   24,   32,    1,
+               1,    1,   19,   21,    3,   21,   23,    7,   19,   37,
+              37,    9,   19,   35,   37,   37,    9,   10,   12,    8,
+              19,   21,    3,   21,   23,    7,   19,   37,   37,    9,
+              19,   35,   37,   37,    9,   10,   12,    8,   44,   17,
+              38,    1,   15,    1,    1,   14,    1,   12,   12,    6,
+               3,   48,   18,   17,   46,   14,    1,   48,   18,   17,
+              46,   14,    1,   48,   18,   17,   46,   14,    1,   15,
+              14,   14,   14,   28,   44,   28,   44,   32,   46,   18,
+              17,   46,   18,   17,    1,   52,   32,   32,    1,   30,
+              32,   34,   34,   78,   78,   46,   14,   44,   46,   28,
+               0,  102,   48,   32,    1,   30,   32,   34,   26,   32,
+              40,   26,   26,   32,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 24 */
+
+              14,  124,   28,    1,   35,    1,   34,   15,   29,    1,
+               1,   78,    1,    1,    1,    1,   16,   30,    1,   30,
+              21,    5,    8,   38,   38,   14,   14,    0,   18,   16,
+               5,   84,   32,   28,   14,   35,   78,   26,   32,    1,
+               1,    1,   19,   19,    3,   19,   21,    5,   19,   35,
+              35,    5,   19,   35,   35,   35,    5,   12,   12,   10,
+              19,   19,    3,   19,   21,    5,   19,   35,   35,    5,
+              19,   35,   35,   35,    5,   12,   12,   10,   44,   17,
+              40,    1,   15,    1,    1,   14,    1,   12,   12,    8,
+               3,   48,   16,   17,   46,   14,    1,   48,   16,   17,
+              46,   14,    1,   48,   16,   17,   46,   14,    1,   15,
+              14,   14,   14,   28,   44,   28,   44,   32,   46,   16,
+              17,   46,   16,   17,    1,   50,   32,   32,    1,   30,
+              32,   32,   32,   78,   78,   46,   14,   44,   46,   28,
+               0,   98,   48,   32,    1,   30,   32,   32,   28,   32,
+              42,   28,   28,   32,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 25 */
+
+              14,  124,   30,    1,   33,    1,   32,   15,   31,    1,
+               1,   78,    1,    1,    1,    1,   16,   30,    1,   30,
+              17,    1,   12,   44,   44,   14,   14,    0,   16,   16,
+               1,   80,   32,   30,   14,   33,   78,   30,   32,    1,
+               1,    1,   17,   17,    1,   17,   17,    1,   17,   33,
+              33,    1,   17,   33,   33,   33,    1,   14,   14,   14,
+              17,   17,    1,   17,   17,    1,   17,   33,   33,    1,
+              17,   33,   33,   33,    1,   14,   14,   14,   46,   17,
+              44,    1,   15,    1,    1,   14,    1,   14,   14,   12,
+               1,   48,   16,   17,   46,   14,    1,   48,   16,   17,
+              46,   14,    1,   48,   16,   17,   46,   14,    1,   15,
+              14,   14,   14,   30,   46,   30,   46,   32,   46,   16,
+              17,   46,   16,   17,    1,   48,   32,   32,    1,   30,
+              32,   32,   32,   78,   78,   46,   14,   46,   46,   30,
+               0,   96,   48,   32,    1,   30,   32,   32,   30,   32,
+              46,   30,   30,   32,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 26 */
+
+              14,  124,   32,    0,   31,    1,   30,   17,   33,    1,
+               1,   80,    1,    0,    1,    1,   14,   30,    1,   32,
+              15,    0,   16,   48,   48,   14,   14,    1,   14,   14,
+               0,   78,   30,   32,   14,   31,   78,   32,   30,    1,
+               0,    0,   15,   15,    0,   15,   15,    0,   15,   31,
+              31,    0,   15,   31,   31,   31,    0,   16,   16,   16,
+              15,   15,    0,   15,   15,    0,   15,   31,   31,    0,
+              15,   31,   31,   31,    0,   16,   16,   16,   48,   15,
+              48,    1,   17,    1,    0,   14,    0,   16,   16,   16,
+               0,   46,   14,   15,   48,   14,    1,   46,   14,   15,
+              48,   14,    1,   46,   14,   15,   48,   14,    1,   17,
+              14,   16,   16,   32,   48,   32,   48,   30,   46,   14,
+              15,   46,   14,   15,    1,   46,   30,   30,    1,   30,
+              30,   30,   30,   80,   78,   48,   14,   48,   48,   32,
+               1,   94,   46,   30,    1,   30,   30,   30,   32,   30,
+              48,   32,   32,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 27 */
+
+              14,  124,   34,    0,   31,    1,   28,   17,   35,    1,
+               1,   80,    1,    0,    1,    1,   14,   30,    1,   32,
+              13,    4,   20,   54,   54,   14,   14,    1,   12,   14,
+               4,   74,   30,   32,   14,   29,   78,   34,   30,    1,
+               0,    0,   15,   13,    0,   13,   13,    2,   15,   29,
+              29,    4,   15,   31,   29,   29,    4,   18,   16,   18,
+              15,   13,    0,   13,   13,    2,   15,   29,   29,    4,
+              15,   31,   29,   29,    4,   18,   16,   18,   48,   15,
+              52,    1,   17,    1,    0,   14,    0,   16,   16,   20,
+               0,   46,   14,   15,   48,   14,    1,   46,   14,   15,
+              48,   14,    1,   46,   14,   15,   48,   14,    1,   17,
+              14,   16,   16,   32,   48,   32,   48,   30,   46,   14,
+              15,   46,   14,   15,    1,   44,   30,   30,    1,   30,
+              30,   30,   30,   80,   78,   48,   14,   48,   48,   32,
+               1,   92,   46,   30,    1,   30,   30,   30,   34,   30,
+              50,   34,   34,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 28 */
+
+              14,  124,   36,    0,   29,    1,   26,   19,   37,    1,
+               1,   80,    1,    0,    1,    1,   12,   30,    1,   32,
+              11,    6,   24,   58,   58,   14,   14,    1,   10,   14,
+               6,   72,   30,   34,   14,   27,   78,   36,   30,    1,
+               0,    0,   13,   11,    2,   11,   11,    4,   13,   27,
+              27,    6,   13,   29,   27,   27,    6,   20,   18,   20,
+              13,   11,    2,   11,   11,    4,   13,   27,   27,    6,
+              13,   29,   27,   27,    6,   20,   18,   20,   50,   15,
+              56,    1,   17,    1,    0,   14,    0,   18,   18,   24,
+               2,   46,   12,   15,   48,   14,    1,   46,   12,   15,
+              48,   14,    1,   46,   12,   15,   48,   14,    1,   17,
+              14,   16,   16,   34,   50,   34,   50,   30,   46,   12,
+              15,   46,   12,   15,    1,   42,   30,   30,    1,   30,
+              30,   28,   28,   80,   78,   48,   14,   50,   48,   34,
+               1,   88,   46,   30,    1,   30,   30,   28,   36,   30,
+              52,   36,   36,   30,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 29 */
+
+              14,  124,   38,    2,   27,    1,   24,   21,   39,    1,
+               1,   82,    1,    2,    1,    1,   10,   30,    1,   34,
+               7,   10,   28,   64,   64,   14,   14,    3,    8,   12,
+              10,   68,   28,   36,   14,   25,   78,   40,   28,    1,
+               2,    2,   11,    9,    4,    9,    7,    8,   11,   25,
+              25,   10,   11,   27,   25,   25,   10,   22,   20,   24,
+              11,    9,    4,    9,    7,    8,   11,   25,   25,   10,
+              11,   27,   25,   25,   10,   22,   20,   24,   52,   13,
+              60,    1,   19,    1,    2,   14,    2,   20,   20,   28,
+               4,   44,   10,   13,   50,   14,    1,   44,   10,   13,
+              50,   14,    1,   44,   10,   13,   50,   14,    1,   19,
+              14,   18,   18,   36,   52,   36,   52,   28,   46,   10,
+              13,   46,   10,   13,    1,   40,   28,   28,    1,   30,
+              28,   26,   26,   82,   78,   50,   14,   52,   50,   36,
+               3,   86,   44,   28,    1,   30,   28,   26,   38,   28,
+              56,   38,   38,   28,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 30 */
+
+              14,  124,   40,    2,   27,    1,   22,   21,   41,    1,
+               1,   82,    1,    2,    1,    1,   10,   30,    1,   34,
+               5,   12,   32,   68,   68,   14,   14,    3,    6,   12,
+              12,   66,   28,   36,   14,   23,   78,   42,   28,    1,
+               2,    2,   11,    7,    4,    7,    5,   10,   11,   23,
+              23,   12,   11,   27,   23,   23,   12,   24,   20,   26,
+              11,    7,    4,    7,    5,   10,   11,   23,   23,   12,
+              11,   27,   23,   23,   12,   24,   20,   26,   52,   13,
+              64,    1,   19,    1,    2,   14,    2,   20,   20,   32,
+               4,   44,   10,   13,   50,   14,    1,   44,   10,   13,
+              50,   14,    1,   44,   10,   13,   50,   14,    1,   19,
+              14,   18,   18,   36,   52,   36,   52,   28,   46,   10,
+              13,   46,   10,   13,    1,   38,   28,   28,    1,   30,
+              28,   26,   26,   82,   78,   50,   14,   52,   50,   36,
+               3,   84,   44,   28,    1,   30,   28,   26,   40,   28,
+              58,   40,   40,   28,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 31 */
+
+              14,  124,   42,    2,   25,    1,   20,   23,   43,    1,
+               1,   82,    1,    2,    1,    1,    8,   30,    1,   34,
+               3,   16,   36,   74,   74,   14,   14,    3,    4,   12,
+              16,   62,   28,   38,   14,   21,   78,   44,   28,    1,
+               2,    2,    9,    5,    6,    5,    3,   12,    9,   21,
+              21,   16,    9,   25,   21,   21,   16,   26,   22,   28,
+               9,    5,    6,    5,    3,   12,    9,   21,   21,   16,
+               9,   25,   21,   21,   16,   26,   22,   28,   54,   13,
+              68,    1,   19,    1,    2,   14,    2,   22,   22,   36,
+               6,   44,    8,   13,   50,   14,    1,   44,    8,   13,
+              50,   14,    1,   44,    8,   13,   50,   14,    1,   19,
+              14,   18,   18,   38,   54,   38,   54,   28,   46,    8,
+              13,   46,    8,   13,    1,   36,   28,   28,    1,   30,
+              28,   24,   24,   82,   78,   50,   14,   54,   50,   38,
+               3,   82,   44,   28,    1,   30,   28,   24,   42,   28,
+              60,   42,   42,   28,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 32 */
+
+              14,  124,   42,    2,   25,    1,   18,   25,   45,    1,
+               1,   82,    1,    2,    1,    1,    6,   30,    1,   34,
+               1,   18,   38,   78,   78,   14,   14,    5,    2,   10,
+              18,   58,   26,   38,   14,   21,   78,   46,   26,    1,
+               2,    2,    9,    5,    6,    5,    1,   14,    9,   21,
+              21,   18,    9,   25,   21,   21,   18,   26,   22,   30,
+               9,    5,    6,    5,    1,   14,    9,   21,   21,   18,
+               9,   25,   21,   21,   18,   26,   22,   30,   54,   13,
+              70,    1,   21,    1,    2,   14,    2,   22,   22,   38,
+               6,   42,    6,   13,   50,   14,    1,   42,    6,   13,
+              50,   14,    1,   42,    6,   13,   50,   14,    1,   21,
+              14,   18,   18,   38,   54,   38,   54,   26,   46,    6,
+              13,   46,    6,   13,    1,   34,   26,   26,    1,   30,
+              26,   22,   22,   82,   78,   50,   14,   54,   50,   38,
+               5,   78,   42,   26,    1,   30,   26,   22,   42,   26,
+              62,   42,   42,   26,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 33 */
+
+              14,  124,   44,    4,   23,    1,   18,   25,   45,    1,
+               1,   84,    1,    4,    1,    1,    6,   30,    1,   36,
+               2,   22,   42,   84,   84,   14,   14,    5,    2,   10,
+              22,   56,   26,   40,   14,   19,   78,   50,   26,    1,
+               4,    4,    7,    3,    8,    3,    2,   18,    7,   19,
+              19,   22,    7,   23,   19,   19,   22,   28,   24,   34,
+               7,    3,    8,    3,    2,   18,    7,   19,   19,   22,
+               7,   23,   19,   19,   22,   28,   24,   34,   56,   11,
+              74,    1,   21,    1,    4,   14,    4,   24,   24,   42,
+               8,   42,    6,   11,   52,   14,    1,   42,    6,   11,
+              52,   14,    1,   42,    6,   11,   52,   14,    1,   21,
+              14,   20,   20,   40,   56,   40,   56,   26,   46,    6,
+              11,   46,    6,   11,    1,   34,   26,   26,    1,   30,
+              26,   22,   22,   84,   78,   52,   14,   56,   52,   40,
+               5,   76,   42,   26,    1,   30,   26,   22,   44,   26,
+              66,   44,   44,   26,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 34 */
+
+              14,  124,   46,    4,   21,    1,   16,   27,   47,    1,
+               1,   84,    1,    4,    1,    1,    4,   30,    1,   36,
+               4,   26,   46,   88,   88,   14,   14,    5,    0,   10,
+              26,   52,   26,   42,   14,   17,   78,   52,   26,    1,
+               4,    4,    5,    1,   10,    1,    4,   20,    5,   17,
+              17,   26,    5,   21,   17,   17,   26,   30,   26,   36,
+               5,    1,   10,    1,    4,   20,    5,   17,   17,   26,
+               5,   21,   17,   17,   26,   30,   26,   36,   58,   11,
+              78,    1,   21,    1,    4,   14,    4,   26,   26,   46,
+              10,   42,    4,   11,   52,   14,    1,   42,    4,   11,
+              52,   14,    1,   42,    4,   11,   52,   14,    1,   21,
+              14,   20,   20,   42,   58,   42,   58,   26,   46,    4,
+              11,   46,    4,   11,    1,   32,   26,   26,    1,   30,
+              26,   20,   20,   84,   78,   52,   14,   58,   52,   42,
+               5,   74,   42,   26,    1,   30,   26,   20,   46,   26,
+              68,   46,   46,   26,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 35 */
+
+              14,  124,   48,    4,   21,    1,   14,   27,   49,    1,
+               1,   84,    1,    4,    1,    1,    4,   30,    1,   36,
+               6,   28,   50,   94,   94,   14,   14,    5,    1,   10,
+              28,   50,   26,   42,   14,   15,   78,   54,   26,    1,
+               4,    4,    5,    0,   10,    0,    6,   22,    5,   15,
+              15,   28,    5,   21,   15,   15,   28,   32,   26,   38,
+               5,    0,   10,    0,    6,   22,    5,   15,   15,   28,
+               5,   21,   15,   15,   28,   32,   26,   38,   58,   11,
+              82,    1,   21,    1,    4,   14,    4,   26,   26,   50,
+              10,   42,    4,   11,   52,   14,    1,   42,    4,   11,
+              52,   14,    1,   42,    4,   11,   52,   14,    1,   21,
+              14,   20,   20,   42,   58,   42,   58,   26,   46,    4,
+              11,   46,    4,   11,    1,   30,   26,   26,    1,   30,
+              26,   20,   20,   84,   78,   52,   14,   58,   52,   42,
+               5,   72,   42,   26,    1,   30,   26,   20,   48,   26,
+              70,   48,   48,   26,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 36 */
+
+              14,  124,   50,    6,   19,    1,   12,   29,   51,    1,
+               1,   86,    1,    6,    1,    1,    2,   30,    1,   38,
+               8,   32,   54,   98,   98,   14,   14,    7,    3,    8,
+              32,   46,   24,   44,   14,   13,   78,   56,   24,    1,
+               6,    6,    3,    2,   12,    2,    8,   24,    3,   13,
+              13,   32,    3,   19,   13,   13,   32,   34,   28,   40,
+               3,    2,   12,    2,    8,   24,    3,   13,   13,   32,
+               3,   19,   13,   13,   32,   34,   28,   40,   60,    9,
+              86,    1,   23,    1,    6,   14,    6,   28,   28,   54,
+              12,   40,    2,    9,   54,   14,    1,   40,    2,    9,
+              54,   14,    1,   40,    2,    9,   54,   14,    1,   23,
+              14,   22,   22,   44,   60,   44,   60,   24,   46,    2,
+               9,   46,    2,    9,    1,   28,   24,   24,    1,   30,
+              24,   18,   18,   86,   78,   54,   14,   60,   54,   44,
+               7,   68,   40,   24,    1,   30,   24,   18,   50,   24,
+              72,   50,   50,   24,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 37 */
+
+              14,  124,   52,    6,   17,    1,   10,   31,   53,    1,
+               1,   86,    1,    6,    1,    1,    0,   30,    1,   38,
+              12,   34,   58,  104,  104,   14,   14,    7,    5,    8,
+              34,   44,   24,   46,   14,   11,   78,   60,   24,    1,
+               6,    6,    1,    4,   14,    4,   12,   28,    1,   11,
+              11,   34,    1,   17,   11,   11,   34,   36,   30,   44,
+               1,    4,   14,    4,   12,   28,    1,   11,   11,   34,
+               1,   17,   11,   11,   34,   36,   30,   44,   62,    9,
+              90,    1,   23,    1,    6,   14,    6,   30,   30,   58,
+              14,   40,    0,    9,   54,   14,    1,   40,    0,    9,
+              54,   14,    1,   40,    0,    9,   54,   14,    1,   23,
+              14,   22,   22,   46,   62,   46,   62,   24,   46,    0,
+               9,   46,    0,    9,    1,   26,   24,   24,    1,   30,
+              24,   16,   16,   86,   78,   54,   14,   62,   54,   46,
+               7,   66,   40,   24,    1,   30,   24,   16,   52,   24,
+              76,   52,   52,   24,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 38 */
+
+              14,  124,   54,    6,   17,    1,    8,   31,   55,    1,
+               1,   86,    1,    6,    1,    1,    0,   30,    1,   38,
+              14,   38,   62,  108,  108,   14,   14,    7,    7,    8,
+              38,   40,   24,   46,   14,    9,   78,   62,   24,    1,
+               6,    6,    1,    6,   14,    6,   14,   30,    1,    9,
+               9,   38,    1,   17,    9,    9,   38,   38,   30,   46,
+               1,    6,   14,    6,   14,   30,    1,    9,    9,   38,
+               1,   17,    9,    9,   38,   38,   30,   46,   62,    9,
+              94,    1,   23,    1,    6,   14,    6,   30,   30,   62,
+              14,   40,    0,    9,   54,   14,    1,   40,    0,    9,
+              54,   14,    1,   40,    0,    9,   54,   14,    1,   23,
+              14,   22,   22,   46,   62,   46,   62,   24,   46,    0,
+               9,   46,    0,    9,    1,   24,   24,   24,    1,   30,
+              24,   16,   16,   86,   78,   54,   14,   62,   54,   46,
+               7,   64,   40,   24,    1,   30,   24,   16,   54,   24,
+              78,   54,   54,   24,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 39 */
+
+              14,  124,   56,    8,   15,    1,    6,   33,   57,    1,
+               1,   88,    1,    8,    1,    1,    1,   30,    1,   40,
+              16,   40,   66,  114,  114,   14,   14,    9,    9,    6,
+              40,   38,   22,   48,   14,    7,   78,   64,   22,    1,
+               8,    8,    0,    8,   16,    8,   16,   32,    0,    7,
+               7,   40,    0,   15,    7,    7,   40,   40,   32,   48,
+               0,    8,   16,    8,   16,   32,    0,    7,    7,   40,
+               0,   15,    7,    7,   40,   40,   32,   48,   64,    7,
+              98,    1,   25,    1,    8,   14,    8,   32,   32,   66,
+              16,   38,    1,    7,   56,   14,    1,   38,    1,    7,
+              56,   14,    1,   38,    1,    7,   56,   14,    1,   25,
+              14,   24,   24,   48,   64,   48,   64,   22,   46,    1,
+               7,   46,    1,    7,    1,   22,   22,   22,    1,   30,
+              22,   14,   14,   88,   78,   56,   14,   64,   56,   48,
+               9,   62,   38,   22,    1,   30,   22,   14,   56,   22,
+              80,   56,   56,   22,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 40 */
+
+              14,  124,   58,    8,   15,    1,    4,   35,   59,    1,
+               1,   88,    1,    8,    1,    1,    3,   30,    1,   40,
+              18,   44,   68,  118,  118,   14,   14,    9,   11,    6,
+              44,   34,   22,   48,   14,    5,   78,   66,   22,    1,
+               8,    8,    0,   10,   16,   10,   18,   34,    0,    5,
+               5,   44,    0,   15,    5,    5,   44,   42,   32,   50,
+               0,   10,   16,   10,   18,   34,    0,    5,    5,   44,
+               0,   15,    5,    5,   44,   42,   32,   50,   64,    7,
+             100,    1,   25,    1,    8,   14,    8,   32,   32,   68,
+              16,   38,    3,    7,   56,   14,    1,   38,    3,    7,
+              56,   14,    1,   38,    3,    7,   56,   14,    1,   25,
+              14,   24,   24,   48,   64,   48,   64,   22,   46,    3,
+               7,   46,    3,    7,    1,   20,   22,   22,    1,   30,
+              22,   12,   12,   88,   78,   56,   14,   64,   56,   48,
+               9,   58,   38,   22,    1,   30,   22,   12,   58,   22,
+              82,   58,   58,   22,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 41 */
+
+              14,  124,   60,    8,   13,    1,    2,   35,   61,    1,
+               1,   88,    1,    8,    1,    1,    3,   30,    1,   40,
+              22,   48,   72,  124,  124,   14,   14,    9,   13,    6,
+              48,   30,   22,   50,   14,    3,   78,   70,   22,    1,
+               8,    8,    2,   12,   18,   12,   22,   38,    2,    3,
+               3,   48,    2,   13,    3,    3,   48,   44,   34,   54,
+               2,   12,   18,   12,   22,   38,    2,    3,    3,   48,
+               2,   13,    3,    3,   48,   44,   34,   54,   66,    7,
+             104,    1,   25,    1,    8,   14,    8,   34,   34,   72,
+              18,   38,    3,    7,   56,   14,    1,   38,    3,    7,
+              56,   14,    1,   38,    3,    7,   56,   14,    1,   25,
+              14,   24,   24,   50,   66,   50,   66,   22,   46,    3,
+               7,   46,    3,    7,    1,   18,   22,   22,    1,   30,
+              22,   12,   12,   88,   78,   56,   14,   66,   56,   50,
+               9,   56,   38,   22,    1,   30,   22,   12,   60,   22,
+              86,   60,   60,   22,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 42 */
+
+              14,  124,   62,   10,   11,    1,    0,   37,   63,    1,
+               1,   90,    1,   10,    1,    1,    5,   30,    1,   42,
+              24,   50,   76,  124,  124,   14,   14,   11,   15,    4,
+              50,   28,   20,   52,   14,    1,   78,   72,   20,    1,
+              10,   10,    4,   14,   20,   14,   24,   40,    4,    1,
+               1,   50,    4,   11,    1,    1,   50,   46,   36,   56,
+               4,   14,   20,   14,   24,   40,    4,    1,    1,   50,
+               4,   11,    1,    1,   50,   46,   36,   56,   68,    5,
+             108,    1,   27,    1,   10,   14,   10,   36,   36,   76,
+              20,   36,    5,    5,   58,   14,    1,   36,    5,    5,
+              58,   14,    1,   36,    5,    5,   58,   14,    1,   27,
+              14,   26,   26,   52,   68,   52,   68,   20,   46,    5,
+               5,   46,    5,    5,    1,   16,   20,   20,    1,   30,
+              20,   10,   10,   90,   78,   58,   14,   68,   58,   52,
+              11,   54,   36,   20,    1,   30,   20,   10,   62,   20,
+              88,   62,   62,   20,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 43 */
+
+              14,  124,   64,   10,   11,    1,    1,   37,   65,    1,
+               1,   90,    1,   10,    1,    1,    5,   30,    1,   42,
+              26,   54,   80,  124,  124,   14,   14,   11,   17,    4,
+              54,   24,   20,   52,   14,    0,   78,   74,   20,    1,
+              10,   10,    4,   16,   20,   16,   26,   42,    4,    0,
+               0,   54,    4,   11,    0,    0,   54,   48,   36,   58,
+               4,   16,   20,   16,   26,   42,    4,    0,    0,   54,
+               4,   11,    0,    0,   54,   48,   36,   58,   68,    5,
+             112,    1,   27,    1,   10,   14,   10,   36,   36,   80,
+              20,   36,    5,    5,   58,   14,    1,   36,    5,    5,
+              58,   14,    1,   36,    5,    5,   58,   14,    1,   27,
+              14,   26,   26,   52,   68,   52,   68,   20,   46,    5,
+               5,   46,    5,    5,    1,   14,   20,   20,    1,   30,
+              20,   10,   10,   90,   78,   58,   14,   68,   58,   52,
+              11,   52,   36,   20,    1,   30,   20,   10,   64,   20,
+              90,   64,   64,   20,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 44 */
+
+              14,  124,   66,   10,    9,    1,    3,   39,   67,    1,
+               1,   90,    1,   10,    1,    1,    7,   30,    1,   42,
+              28,   56,   84,  124,  124,   14,   14,   11,   19,    4,
+              56,   22,   20,   54,   14,    2,   78,   76,   20,    1,
+              10,   10,    6,   18,   22,   18,   28,   44,    6,    2,
+               2,   56,    6,    9,    2,    2,   56,   50,   38,   60,
+               6,   18,   22,   18,   28,   44,    6,    2,    2,   56,
+               6,    9,    2,    2,   56,   50,   38,   60,   70,    5,
+             116,    1,   27,    1,   10,   14,   10,   38,   38,   84,
+              22,   36,    7,    5,   58,   14,    1,   36,    7,    5,
+              58,   14,    1,   36,    7,    5,   58,   14,    1,   27,
+              14,   26,   26,   54,   70,   54,   70,   20,   46,    7,
+               5,   46,    7,    5,    1,   12,   20,   20,    1,   30,
+              20,    8,    8,   90,   78,   58,   14,   70,   58,   54,
+              11,   48,   36,   20,    1,   30,   20,    8,   66,   20,
+              92,   66,   66,   20,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 45 */
+
+              14,  124,   68,   12,    7,    1,    5,   41,   69,    1,
+               1,   92,    1,   12,    1,    1,    9,   30,    1,   44,
+              32,   60,   88,  124,  124,   14,   14,   13,   21,    2,
+              60,   18,   18,   56,   14,    4,   78,   80,   18,    1,
+              12,   12,    8,   20,   24,   20,   32,   48,    8,    4,
+               4,   60,    8,    7,    4,    4,   60,   52,   40,   64,
+               8,   20,   24,   20,   32,   48,    8,    4,    4,   60,
+               8,    7,    4,    4,   60,   52,   40,   64,   72,    3,
+             120,    1,   29,    1,   12,   14,   12,   40,   40,   88,
+              24,   34,    9,    3,   60,   14,    1,   34,    9,    3,
+              60,   14,    1,   34,    9,    3,   60,   14,    1,   29,
+              14,   28,   28,   56,   72,   56,   72,   18,   46,    9,
+               3,   46,    9,    3,    1,   10,   18,   18,    1,   30,
+              18,    6,    6,   92,   78,   60,   14,   72,   60,   56,
+              13,   46,   34,   18,    1,   30,   18,    6,   68,   18,
+              96,   68,   68,   18,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 46 */
+
+              14,  124,   70,   12,    7,    1,    7,   41,   71,    1,
+               1,   92,    1,   12,    1,    1,    9,   30,    1,   44,
+              34,   62,   92,  124,  124,   14,   14,   13,   23,    2,
+              62,   16,   18,   56,   14,    6,   78,   82,   18,    1,
+              12,   12,    8,   22,   24,   22,   34,   50,    8,    6,
+               6,   62,    8,    7,    6,    6,   62,   54,   40,   66,
+               8,   22,   24,   22,   34,   50,    8,    6,    6,   62,
+               8,    7,    6,    6,   62,   54,   40,   66,   72,    3,
+             124,    1,   29,    1,   12,   14,   12,   40,   40,   92,
+              24,   34,    9,    3,   60,   14,    1,   34,    9,    3,
+              60,   14,    1,   34,    9,    3,   60,   14,    1,   29,
+              14,   28,   28,   56,   72,   56,   72,   18,   46,    9,
+               3,   46,    9,    3,    1,    8,   18,   18,    1,   30,
+              18,    6,    6,   92,   78,   60,   14,   72,   60,   56,
+              13,   44,   34,   18,    1,   30,   18,    6,   70,   18,
+              98,   70,   70,   18,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 47 */
+
+              14,  124,   72,   12,    5,    1,    9,   43,   73,    1,
+               1,   92,    1,   12,    1,    1,   11,   30,    1,   44,
+              36,   66,   96,  124,  124,   14,   14,   13,   25,    2,
+              66,   12,   18,   58,   14,    8,   78,   84,   18,    1,
+              12,   12,   10,   24,   26,   24,   36,   52,   10,    8,
+               8,   66,   10,    5,    8,    8,   66,   56,   42,   68,
+              10,   24,   26,   24,   36,   52,   10,    8,    8,   66,
+              10,    5,    8,    8,   66,   56,   42,   68,   74,    3,
+             124,    1,   29,    1,   12,   14,   12,   42,   42,   96,
+              26,   34,   11,    3,   60,   14,    1,   34,   11,    3,
+              60,   14,    1,   34,   11,    3,   60,   14,    1,   29,
+              14,   28,   28,   58,   74,   58,   74,   18,   46,   11,
+               3,   46,   11,    3,    1,    6,   18,   18,    1,   30,
+              18,    4,    4,   92,   78,   60,   14,   74,   60,   58,
+              13,   42,   34,   18,    1,   30,   18,    4,   72,   18,
+             100,   72,   72,   18,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 48 */
+
+              14,  124,   72,   12,    5,    1,   11,   45,   75,    1,
+               1,   92,    1,   12,    1,    1,   13,   30,    1,   44,
+              38,   68,   98,  124,  124,   14,   14,   15,   27,    0,
+              68,    8,   16,   58,   14,    8,   78,   86,   16,    1,
+              12,   12,   10,   24,   26,   24,   38,   54,   10,    8,
+               8,   68,   10,    5,    8,    8,   68,   56,   42,   70,
+              10,   24,   26,   24,   38,   54,   10,    8,    8,   68,
+              10,    5,    8,    8,   68,   56,   42,   70,   74,    3,
+             124,    1,   31,    1,   12,   14,   12,   42,   42,   98,
+              26,   32,   13,    3,   60,   14,    1,   32,   13,    3,
+              60,   14,    1,   32,   13,    3,   60,   14,    1,   31,
+              14,   28,   28,   58,   74,   58,   74,   16,   46,   13,
+               3,   46,   13,    3,    1,    4,   16,   16,    1,   30,
+              16,    2,    2,   92,   78,   60,   14,   74,   60,   58,
+              15,   38,   32,   16,    1,   30,   16,    2,   72,   16,
+             102,   72,   72,   16,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 49 */
+
+              14,  124,   74,   14,    3,    1,   11,   45,   75,    1,
+               1,   94,    1,   14,    1,    1,   13,   30,    1,   46,
+              42,   72,  102,  124,  124,   14,   14,   15,   27,    0,
+              72,    6,   16,   60,   14,   10,   78,   90,   16,    1,
+              14,   14,   12,   26,   28,   26,   42,   58,   12,   10,
+              10,   72,   12,    3,   10,   10,   72,   58,   44,   74,
+              12,   26,   28,   26,   42,   58,   12,   10,   10,   72,
+              12,    3,   10,   10,   72,   58,   44,   74,   76,    1,
+             124,    1,   31,    1,   14,   14,   14,   44,   44,  102,
+              28,   32,   13,    1,   62,   14,    1,   32,   13,    1,
+              62,   14,    1,   32,   13,    1,   62,   14,    1,   31,
+              14,   30,   30,   60,   76,   60,   76,   16,   46,   13,
+               1,   46,   13,    1,    1,    4,   16,   16,    1,   30,
+              16,    2,    2,   94,   78,   62,   14,   76,   62,   60,
+              15,   36,   32,   16,    1,   30,   16,    2,   74,   16,
+             106,   74,   74,   16,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 50 */
+
+              14,  124,   76,   14,    1,    1,   13,   47,   77,    1,
+               1,   94,    1,   14,    1,    1,   15,   30,    1,   46,
+              44,   76,  106,  124,  124,   14,   14,   15,   29,    0,
+              76,    2,   16,   62,   14,   12,   78,   92,   16,    1,
+              14,   14,   14,   28,   30,   28,   44,   60,   14,   12,
+              12,   76,   14,    1,   12,   12,   76,   60,   46,   76,
+              14,   28,   30,   28,   44,   60,   14,   12,   12,   76,
+              14,    1,   12,   12,   76,   60,   46,   76,   78,    1,
+             124,    1,   31,    1,   14,   14,   14,   46,   46,  106,
+              30,   32,   15,    1,   62,   14,    1,   32,   15,    1,
+              62,   14,    1,   32,   15,    1,   62,   14,    1,   31,
+              14,   30,   30,   62,   78,   62,   78,   16,   46,   15,
+               1,   46,   15,    1,    1,    2,   16,   16,    1,   30,
+              16,    0,    0,   94,   78,   62,   14,   78,   62,   62,
+              15,   34,   32,   16,    1,   30,   16,    0,   76,   16,
+             108,   76,   76,   16,
+        },
+
+        {
+            /* Context Tables for init_idc = 2, qp = 51 */
+
+              14,  124,   78,   14,    1,    1,   15,   47,   79,    1,
+               1,   94,    1,   14,    1,    1,   15,   30,    1,   46,
+              46,   78,  110,  124,  124,   14,   14,   15,   31,    0,
+              78,    0,   16,   62,   14,   14,   78,   94,   16,    1,
+              14,   14,   14,   30,   30,   30,   46,   62,   14,   14,
+              14,   78,   14,    1,   14,   14,   78,   62,   46,   78,
+              14,   30,   30,   30,   46,   62,   14,   14,   14,   78,
+              14,    1,   14,   14,   78,   62,   46,   78,   78,    1,
+             124,    1,   31,    1,   14,   14,   14,   46,   46,  110,
+              30,   32,   15,    1,   62,   14,    1,   32,   15,    1,
+              62,   14,    1,   32,   15,    1,   62,   14,    1,   31,
+              14,   30,   30,   62,   78,   62,   78,   16,   46,   15,
+               1,   46,   15,    1,    1,    0,   16,   16,    1,   30,
+              16,    0,    0,   94,   78,   62,   14,   78,   62,   62,
+              15,   32,   32,   16,    1,   30,   16,    0,   78,   16,
+             110,   78,   78,   16,
+        },
+    },
+};
diff --git a/common/ihevc_cabac_tables.h b/common/ihevc_cabac_tables.h
new file mode 100644
index 0000000..9ed1a2c
--- /dev/null
+++ b/common/ihevc_cabac_tables.h
@@ -0,0 +1,137 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+******************************************************************************
+* @file ihevc_cabac_tables.h
+*
+* @brief
+*  This file contains enumerations, macros and extern declarations of HEVC
+*  cabac tables
+*
+* @author
+*  Ittiam
+******************************************************************************
+*/
+
+#ifndef _IHEVC_CABAC_TABLES_H_
+#define _IHEVC_CABAC_TABLES_H_
+
+/*****************************************************************************/
+/* Constant Macros                                                           */
+/*****************************************************************************/
+
+/**
+******************************************************************************
+ *  @brief  number of cabac_init_idc values (0-2)
+******************************************************************************
+ */
+#define IHEVC_NUM_CAB_IDC   3
+
+/**
+******************************************************************************
+ *  @brief  number of QP values in HEVC (0-51)
+******************************************************************************
+ */
+#define IHEVC_MAX_QP      52
+
+
+/*****************************************************************************/
+/* Enums                                                                     */
+/*****************************************************************************/
+
+/**
+ *****************************************************************************
+ *  @brief   start offsets of cabac contexts for various syntax elements
+ *
+ *  @remarks Init ContextModel generation and these offsets are tightly
+ *           coupled; see the cabac table generation utility for these offsets
+ *****************************************************************************
+ */
+typedef enum
+{
+    IHEVC_CAB_SAO_MERGE             = 0,
+    IHEVC_CAB_SAO_TYPE              = IHEVC_CAB_SAO_MERGE               + 1,
+    IHEVC_CAB_SPLIT_CU_FLAG         = IHEVC_CAB_SAO_TYPE                + 1,
+    IHEVC_CAB_CU_TQ_BYPASS_FLAG     = IHEVC_CAB_SPLIT_CU_FLAG           + 3,
+    IHEVC_CAB_SKIP_FLAG             = IHEVC_CAB_CU_TQ_BYPASS_FLAG       + 1,
+    IHEVC_CAB_QP_DELTA_ABS          = IHEVC_CAB_SKIP_FLAG               + 3,
+    IHEVC_CAB_PRED_MODE             = IHEVC_CAB_QP_DELTA_ABS            + 2,
+    IHEVC_CAB_PART_MODE             = IHEVC_CAB_PRED_MODE               + 1,
+    IHEVC_CAB_INTRA_LUMA_PRED_FLAG  = IHEVC_CAB_PART_MODE               + 4,
+    IHEVC_CAB_CHROMA_PRED_MODE      = IHEVC_CAB_INTRA_LUMA_PRED_FLAG    + 1,
+    IHEVC_CAB_MERGE_FLAG_EXT        = IHEVC_CAB_CHROMA_PRED_MODE        + 1,
+    IHEVC_CAB_MERGE_IDX_EXT         = IHEVC_CAB_MERGE_FLAG_EXT          + 1,
+    IHEVC_CAB_INTER_PRED_IDC        = IHEVC_CAB_MERGE_IDX_EXT           + 1,
+    IHEVC_CAB_INTER_REF_IDX         = IHEVC_CAB_INTER_PRED_IDC          + 5,
+    IHEVC_CAB_MVD_GRT0              = IHEVC_CAB_INTER_REF_IDX           + 2,
+    IHEVC_CAB_MVD_GRT1              = IHEVC_CAB_MVD_GRT0                + 1,
+    IHEVC_CAB_MVP_L0L1              = IHEVC_CAB_MVD_GRT1                + 1,
+    IHEVC_CAB_NORES_IDX             = IHEVC_CAB_MVP_L0L1                + 1,
+    IHEVC_CAB_SPLIT_TFM             = IHEVC_CAB_NORES_IDX               + 1,
+    IHEVC_CAB_CBF_LUMA_IDX          = IHEVC_CAB_SPLIT_TFM               + 3,
+    IHEVC_CAB_CBCR_IDX              = IHEVC_CAB_CBF_LUMA_IDX            + 2,
+    IHEVC_CAB_TFM_SKIP0             = IHEVC_CAB_CBCR_IDX                + 4,
+    IHEVC_CAB_TFM_SKIP12            = IHEVC_CAB_TFM_SKIP0               + 1,
+    IHEVC_CAB_COEFFX_PREFIX         = IHEVC_CAB_TFM_SKIP12              + 1,
+    IHEVC_CAB_COEFFY_PREFIX         = IHEVC_CAB_COEFFX_PREFIX           + 18,
+    IHEVC_CAB_CODED_SUBLK_IDX       = IHEVC_CAB_COEFFY_PREFIX           + 18,
+    IHEVC_CAB_COEFF_FLAG            = IHEVC_CAB_CODED_SUBLK_IDX         + 4,
+    IHEVC_CAB_COEFABS_GRTR1_FLAG    = IHEVC_CAB_COEFF_FLAG              + 42,
+    IHEVC_CAB_COEFABS_GRTR2_FLAG    = IHEVC_CAB_COEFABS_GRTR1_FLAG      + 24,
+    IHEVC_CAB_CTXT_END              = IHEVC_CAB_COEFABS_GRTR2_FLAG      + 6
+}IHEVC_CABAC_CTXT_OFFSETS;
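+
+/*
+ * Illustrative sketch (names here are hypothetical, not part of this API):
+ * a decoder keeps one packed context model per offset and indexes this enum
+ * into that array, with ctxt_inc selecting among a syntax element's contexts:
+ *
+ *     UWORD8 au1_ctxt_models[IHEVC_CAB_CTXT_END];
+ *     UWORD8 ctxt_model = au1_ctxt_models[IHEVC_CAB_SKIP_FLAG + ctxt_inc];
+ */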
+
+
+/*****************************************************************************/
+/* Extern global declarations                                                */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ * @brief  Table for rangeTabLPS depending on pStateIdx and qCodIRangeIdx
+ * input   : pStateIdx(0-63) and qCodIRangeIdx(0-3) [(Range >> 6) & 0x3]
+ * output  : RLps
+ *
+ * @remarks See Table 9-40 of HEVC spec for rangeTabLPS
+ *******************************************************************************
+ */
+extern const UWORD8 gau1_ihevc_cabac_rlps[64][4];
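+
+/*
+ * Illustrative sketch (hypothetical variable names): with a context model
+ * packed as pState in bits[1-6] and MPS in bit 0, one decode step reads
+ *
+ *     UWORD32 state = ctxt_model >> 1;
+ *     UWORD32 rlps  = gau1_ihevc_cabac_rlps[state][(range >> 6) & 0x3];
+ *     range -= rlps;   // MPS sub-range; the LPS path switches range to rlps
+ */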
+
+
+/**
+ ******************************************************************************
+ * @brief  probability + MPS state transition tables based on current state and bin
+ * input  : curpState[bits7-2]  | curMPS[bit1] | decodedBin[bit0]
+ * output : nextpState[bits6-1] | nextMPS[bit0]
+ * @remarks Modified form of Table-9-41 State Transition table in HEVC spec
+ ******************************************************************************
+ */
+extern const UWORD8 gau1_ihevc_next_state[128*2];
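+
+/*
+ * Illustrative sketch: because the packed context model is pState in
+ * bits[1-6] with MPS in bit 0, the table index and the update collapse to a
+ * single lookup (decoded_bin is a hypothetical name for the decoded bin):
+ *
+ *     ctxt_model = gau1_ihevc_next_state[(ctxt_model << 1) | decoded_bin];
+ */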
+
+/**
+ ******************************************************************************
+ * @brief  Init context tables for all combinations of qp and cabac_init_idc
+ * @remarks Packing format MPS in lsb and pState in bits[1-6]
+ ******************************************************************************
+ */
+extern const UWORD8 gau1_ihevc_cab_ctxts[IHEVC_NUM_CAB_IDC][IHEVC_MAX_QP][IHEVC_CAB_CTXT_END];
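+
+/*
+ * Illustrative sketch: at the start of a slice a decoder would copy one row
+ * of this table into its working context array (names hypothetical):
+ *
+ *     memcpy(au1_ctxt_models,
+ *            gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp],
+ *            IHEVC_CAB_CTXT_END);
+ */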
+
+
+
+#endif /* _IHEVC_CABAC_TABLES_H_ */
diff --git a/common/ihevc_chroma_intra_pred.h b/common/ihevc_chroma_intra_pred.h
new file mode 100644
index 0000000..c4ca13b
--- /dev/null
+++ b/common/ihevc_chroma_intra_pred.h
@@ -0,0 +1,358 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_chroma_intra_pred.h
+*
+* @brief
+*  Declarations for the functions defined in ihevc_chroma_intra_pred_filters.c
+*
+* @author
+*  Mamatha
+*
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVC_CHROMA_INTRA_PRED_H_
+#define IHEVC_CHROMA_INTRA_PRED_H_
+
+/*****************************************************************************/
+/* Function Declarations                                                     */
+/*****************************************************************************/
+typedef void ihevc_intra_pred_chroma_planar_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_dc_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_horz_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_ver_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode2_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode_18_34_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode_3_to_9_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode_11_to_17_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode_19_to_25_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode_27_to_33_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_ref_substitution_ft(UWORD8 *pu1_top_left,
+                                                         UWORD8 *pu1_top,
+                                                         UWORD8 *pu1_left,
+                                                         WORD32 src_strd,
+                                                         WORD32 nt,
+                                                         WORD32 nbr_flags,
+                                                         UWORD8 *pu1_dst,
+                                                         WORD32 dst_strd);
+
+typedef void ihevc_hbd_intra_pred_chroma_planar_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_dc_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_horz_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_hbd_intra_pred_chroma_ver_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_mode2_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+
+typedef void ihevc_hbd_intra_pred_chroma_mode_18_34_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_hbd_intra_pred_chroma_mode_3_to_9_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_mode_11_to_17_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+
+typedef void ihevc_hbd_intra_pred_chroma_mode_19_to_25_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_mode_27_to_33_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_ref_substitution_ft(UWORD16 *pu2_top_left,
+                                                             UWORD16 *pu2_top,
+                                                             UWORD16 *pu2_left,
+                                                             WORD32 src_strd,
+                                                             WORD32 nt,
+                                                             WORD32 nbr_flags,
+                                                             UWORD16 *pu2_dst,
+                                                             WORD32 dst_strd,
+                                                             UWORD8 bit_depth);
+
+/* C function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution;
+
+ihevc_hbd_intra_pred_chroma_planar_ft ihevc_hbd_intra_pred_chroma_planar;
+ihevc_hbd_intra_pred_chroma_dc_ft ihevc_hbd_intra_pred_chroma_dc;
+ihevc_hbd_intra_pred_chroma_horz_ft ihevc_hbd_intra_pred_chroma_horz;
+ihevc_hbd_intra_pred_chroma_ver_ft ihevc_hbd_intra_pred_chroma_ver;
+ihevc_hbd_intra_pred_chroma_mode2_ft ihevc_hbd_intra_pred_chroma_mode2;
+ihevc_hbd_intra_pred_chroma_mode_18_34_ft ihevc_hbd_intra_pred_chroma_mode_18_34;
+ihevc_hbd_intra_pred_chroma_mode_3_to_9_ft ihevc_hbd_intra_pred_chroma_mode_3_to_9;
+ihevc_hbd_intra_pred_chroma_mode_11_to_17_ft ihevc_hbd_intra_pred_chroma_mode_11_to_17;
+ihevc_hbd_intra_pred_chroma_mode_19_to_25_ft ihevc_hbd_intra_pred_chroma_mode_19_to_25;
+ihevc_hbd_intra_pred_chroma_mode_27_to_33_ft ihevc_hbd_intra_pred_chroma_mode_27_to_33;
+ihevc_hbd_intra_pred_chroma_ref_substitution_ft ihevc_hbd_intra_pred_chroma_ref_substitution;
+
+/* A9Q function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_a9q;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_a9q;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz_a9q;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_a9q;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2_a9q;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_a9q;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9_a9q;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17_a9q;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25_a9q;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33_a9q;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_a9q;
+
+/* NEON intrinsics function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_neonintr;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_neonintr;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz_neonintr;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_neonintr;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2_neonintr;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_neonintr;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9_neonintr;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17_neonintr;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25_neonintr;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33_neonintr;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_ssse3;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_ssse3;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz_ssse3;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_ssse3;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2_ssse3;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_ssse3;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9_ssse3;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17_ssse3;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25_ssse3;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33_ssse3;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_ssse3;
+
+/* SSE4.2 function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_sse42;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_sse42;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_sse42;
+
+ihevc_hbd_intra_pred_chroma_planar_ft ihevc_hbd_intra_pred_chroma_planar_sse42;
+ihevc_hbd_intra_pred_chroma_dc_ft ihevc_hbd_intra_pred_chroma_dc_sse42;
+ihevc_hbd_intra_pred_chroma_horz_ft ihevc_hbd_intra_pred_chroma_horz_sse42;
+ihevc_hbd_intra_pred_chroma_ver_ft ihevc_hbd_intra_pred_chroma_ver_sse42;
+ihevc_hbd_intra_pred_chroma_mode2_ft ihevc_hbd_intra_pred_chroma_mode2_sse42;
+ihevc_hbd_intra_pred_chroma_mode_18_34_ft ihevc_hbd_intra_pred_chroma_mode_18_34_sse42;
+ihevc_hbd_intra_pred_chroma_mode_3_to_9_ft ihevc_hbd_intra_pred_chroma_mode_3_to_9_sse42;
+ihevc_hbd_intra_pred_chroma_mode_11_to_17_ft ihevc_hbd_intra_pred_chroma_mode_11_to_17_sse42;
+ihevc_hbd_intra_pred_chroma_mode_19_to_25_ft ihevc_hbd_intra_pred_chroma_mode_19_to_25_sse42;
+ihevc_hbd_intra_pred_chroma_mode_27_to_33_ft ihevc_hbd_intra_pred_chroma_mode_27_to_33_sse42;
+ihevc_hbd_intra_pred_chroma_ref_substitution_ft ihevc_hbd_intra_pred_chroma_ref_substitution_sse42;
+
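+/* A9A function declarations */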
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_a9a;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_a9a;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz_a9a;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_a9a;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2_a9a;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_a9a;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9_a9a;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17_a9a;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25_a9a;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33_a9a;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_a9a;
+
+/* AVX function declarations */
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_avx;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_avx;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_avx;
+
+ihevc_hbd_intra_pred_chroma_dc_ft ihevc_hbd_intra_pred_chroma_dc_avx;
+ihevc_hbd_intra_pred_chroma_mode_18_34_ft ihevc_hbd_intra_pred_chroma_mode_18_34_avx;
+ihevc_hbd_intra_pred_chroma_ver_ft ihevc_hbd_intra_pred_chroma_ver_avx;
+
+/* armv8 function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_av8;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_av8;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz_av8;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_av8;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2_av8;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_av8;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9_av8;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17_av8;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25_av8;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33_av8;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_av8;
+#endif /* IHEVC_CHROMA_INTRA_PRED_H_ */
diff --git a/common/ihevc_chroma_intra_pred_filters.c b/common/ihevc_chroma_intra_pred_filters.c
new file mode 100644
index 0000000..8b3c992
--- /dev/null
+++ b/common/ihevc_chroma_intra_pred_filters.c
@@ -0,0 +1,1277 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_chroma_intra_pred_filters.c
+*
+* @brief
+*  Contains function definitions for chroma intra prediction filters
+*
+*
+* @author
+*  Ittiam
+*
+* @par List of Functions:
+*  ihevc_intra_pred_chroma_planar()
+*
+*  ihevc_intra_pred_chroma_dc()
+*
+*  ihevc_intra_pred_chroma_horz()
+*
+*  ihevc_intra_pred_chroma_ver()
+*
+*  ihevc_intra_pred_chroma_mode2()
+*
+*  ihevc_intra_pred_chroma_mode_18_34()
+*
+*  ihevc_intra_pred_chroma_mode_3_to_9()
+*
+*  ihevc_intra_pred_chroma_mode_11_to_17()
+*
+*  ihevc_intra_pred_chroma_mode_19_to_25()
+*
+*  ihevc_intra_pred_chroma_mode_27_to_33()
+*
+*  ihevc_intra_pred_chroma_ref_substitution()
+*
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_chroma_intra_pred.h"
+#include "ihevc_common_tables.h"
+
+
+/****************************************************************************/
+/* Constant Macros                                                          */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+#define T16C_4NT 64
+#define T8C_4NT 32
+/****************************************************************************/
+/* Function Macros                                                          */
+/****************************************************************************/
+
+#define GET_BIT(y,x) (((y) >> (x)) & 1)
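+/* The result is normalized to 0 or 1, e.g. GET_BIT(0x12, 4) == 1 and
+   GET_BIT(0x12, 0) == 0; the substitution code below only tests it for
+   zero/non-zero */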
+
+
+/*****************************************************************************/
+/* Function Definition                                                      */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Reference substitution process for samples unavailable  for prediction
+* Refer to section 8.4.4.2.2
+*
+* @par Description:
+*
+*
+* @param[in] pu1_top_left
+*  UWORD8 pointer to the top-left
+*
+* @param[in] pu1_top
+*  UWORD8 pointer to the top
+*
+* @param[in] pu1_left
+*  UWORD8 pointer to the left
+*
+* @param[in] src_strd
+*  WORD32 Source stride
+*
+* @param[in] nbr_flags
+*  WORD32 neighbor availability flags
+*
+* @param[in] nt
+*  WORD32 transform Block size
+*
+* @param[in] dst_strd
+*  WORD32 Destination stride
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_ref_substitution(UWORD8 *pu1_top_left,
+                                              UWORD8 *pu1_top,
+                                              UWORD8 *pu1_left,
+                                              WORD32 src_strd,
+                                              WORD32 nt,
+                                              WORD32 nbr_flags,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 dst_strd)
+{
+    UWORD8 pu1_ref_u, pu1_ref_v;
+    WORD32 dc_val, i, j;
+    WORD32 total_samples = (4 * nt) + 1;
+    WORD32 get_bits;
+    WORD32 next;
+    WORD32 bot_left, left, top, tp_right, tp_left;
+    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+    WORD32 a_nbr_flag[5];
+    UNUSED(dst_strd);
+    /* Neighbor Flag Structure*/
+    /* WORD32 nbr_flags MSB-->LSB   TOP LEFT | TOP-RIGHT |  TOP   | LEFT    | BOTTOM LEFT*/
+    /*                              (1 bit)     (4 bits)  (4 bits) (4 bits)  (4 bits)  */
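+    /* e.g. nbr_flags == 0x1FFFF marks every neighbor available, while
+       nbr_flags == 0x000F0 marks only the left neighbors available */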
+
+    if(nbr_flags == 0)
+    {
+        /* If no neighbor flags are present, fill the neighbor samples with DC value */
+        /*dc_val = 1 << (BIT_DEPTH - 1);*/
+        dc_val = 1 << (8 - 1);
+        for(i = 0; i < (2 * total_samples); i++)
+        {
+            pu1_dst[i] = dc_val;
+        }
+    }
+    else
+    {
+        /* Else fill the corresponding samples */
+
+        /* Check for the neighbors' availability */
+        tp_left     = (nbr_flags & 0x10000);
+        tp_right    = (nbr_flags & 0x0f000);
+        top         = (nbr_flags & 0x00f00);
+        left        = (nbr_flags & 0x000f0);
+        bot_left    = (nbr_flags & 0x0000f);
+
+        /* Fill nbrs depending on availability */
+        /* Top-left nbrs */
+        if(0 != tp_left)
+        {
+            pu1_dst[(4 * nt)] = *pu1_top_left; // U top-left sample
+            pu1_dst[(4 * nt) + 1] = *(pu1_top_left + 1); // V top-left sample
+        }
+        /* Left nbrs  */
+        if(0 != left)
+        {
+            for(i = 0, j = 0; i < (2 * nt); i += 2)
+            {
+                pu1_dst[(4 * nt) - 2 - i] = pu1_left[j * src_strd]; // U left samples
+                pu1_dst[(4 * nt) - 1 - i] = pu1_left[(j * src_strd) + 1]; // V left samples
+                j++;
+            }
+        }
+        /* Bottom - Left nbrs  */
+        if(0 != bot_left)
+        {
+            for(i = (2 * nt), j = nt; i < (4 * nt); i += 2)
+            {
+                pu1_dst[(4 * nt) - 2 - i] = pu1_left[j * src_strd]; // U bottom-left samples
+                pu1_dst[(4 * nt) - 1 - i] = pu1_left[(j * src_strd) + 1]; // V bottom-left samples
+                j++;
+            }
+        }
+        /* Top nbrs  */
+        if(0 != top)
+        {
+            ihevc_memcpy_mul_8(&pu1_dst[(4 * nt) + 2], pu1_top, 2 * nt);
+            // U-V interleaved top samples
+        }
+
+        /* Top - Right nbrs  */
+        if(0 != tp_right)
+        {
+            ihevc_memcpy_mul_8(&pu1_dst[(4 * nt) + 2 + 2 * nt], pu1_top + 2 * nt, 2 * nt);
+            // U-V interleaved top-right samples
+        }
+
+        if(nt == 4)
+        {
+            /* 1 bit extraction for all the neighboring blocks */
+            tp_left = (nbr_flags & 0x10000) >> 16;
+            bot_left = (nbr_flags & 0x8) >> 3;
+            left = (nbr_flags & 0x80) >> 7;
+            top = (nbr_flags & 0x100) >> 8;
+            tp_right = (nbr_flags & 0x1000) >> 12;
+
+            next = 1;
+            a_nbr_flag[0] = bot_left;
+            a_nbr_flag[1] = left;
+            a_nbr_flag[2] = tp_left;
+            a_nbr_flag[3] = top;
+            a_nbr_flag[4] = tp_right;
+
+            /* If bottom-left is not available, reverse substitution process */
+            if(bot_left == 0)
+            {
+                /* Check for the 1st available sample from bottom-left*/
+                while(!a_nbr_flag[next])
+                    next++;
+
+                /* If Left, top-left are available*/
+                if(next <= 2)
+                {
+                    UWORD16 *pu2_dst;
+                    idx = (nt * next);
+                    pu2_dst = (UWORD16 *)&pu1_dst[2 * idx];
+                    ihevc_memset_16bit((UWORD16 *)pu1_dst, pu2_dst[0], idx);
+                }
+                else /* If top, top-right are available */
+                {
+                    UWORD16 *pu2_dst;
+                    /* idx is changed to copy 1 pixel value for top-left, if top-left is not available */
+                    idx = (nt * (next - 1)) + 1;
+                    pu2_dst = (UWORD16 *)&pu1_dst[2 * idx];
+                    ihevc_memset_16bit((UWORD16 *)pu1_dst, pu2_dst[0], idx);
+                }
+            }
+
+            if(left == 0)
+            {
+                UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(2 * nt) - 2];
+                ihevc_memset_16bit((UWORD16 *)&pu1_dst[(2 * nt)], pu2_dst[0], nt);
+
+
+            }
+            if(tp_left == 0)
+            {
+                pu1_dst[4 * nt] = pu1_dst[(4 * nt) - 2];
+                pu1_dst[(4 * nt) + 1] = pu1_dst[(4 * nt) - 1];
+            }
+            if(top == 0)
+            {
+                UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(4 * nt)];
+                ihevc_memset_16bit((UWORD16 *)&pu1_dst[(4 * nt) + 2], pu2_dst[0], nt);
+
+
+            }
+            if(tp_right == 0)
+            {
+                UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(6 * nt)];
+                ihevc_memset_16bit((UWORD16 *)&pu1_dst[(6 * nt) + 2], pu2_dst[0], nt);
+
+
+            }
+        }
+        else if(nt == 8)
+        {
+            WORD32 nbr_flags_temp = 0;
+            nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
+                            + ((nbr_flags & 0x300) >> 4)
+                            + ((nbr_flags & 0x3000) >> 6)
+                            + ((nbr_flags & 0x10000) >> 8);
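+            /* nbr_flags_temp now holds 2 bits per side: bits[1:0] bottom-left,
+               bits[3:2] left, bits[5:4] top, bits[7:6] top-right, and the
+               top-left flag in bit 8 */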
+
+            /* compute trailing zeros based on nbr_flags for the substitution process of below-left */
+            /* each remapped bit corresponds to 4 pels for bot_left, left, top and topright but 1 pel for topleft */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 4; /* for bottom left and left */
+                if(nbr_id_from_bl == 32)
+                    nbr_id_from_bl = 16;
+                if(nbr_id_from_bl == 16)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags_temp >> 8) & 0x1))
+                    {
+                        nbr_id_from_bl++;
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 4; /* top and top right; 4 pels per nbr bit */
+
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref_u = pu1_dst[2 * nbr_id_from_bl];
+                    pu1_ref_v = pu1_dst[(2 * nbr_id_from_bl) + 1];
+                    for(i = 2 * (nbr_id_from_bl - 1); i >= 0; i -= 2)
+                    {
+                        pu1_dst[i] = pu1_ref_u;
+                        pu1_dst[i + 1] = pu1_ref_v;
+                    }
+                }
+            }
+
+            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+            while(nbr_id_from_bl < ((T8C_4NT)+1))
+            {
+                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
+                /* Divide by 4 to obtain the original index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T8C_4NT / 2))
+                {
+                    get_bits = GET_BIT(nbr_flags_temp, 8);
+
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                    {
+                        pu1_dst[2 * nbr_id_from_bl] = pu1_dst[(2 * nbr_id_from_bl) - 2];
+                        pu1_dst[(2 * nbr_id_from_bl) + 1] = pu1_dst[(2 * nbr_id_from_bl) - 1];
+                    }
+                }
+                else
+                {
+                    get_bits = GET_BIT(nbr_flags_temp, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        UWORD16 *pu2_dst;
+                        /* 8 pel substitution (other than TL) */
+                        pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
+                        ihevc_memset_16bit((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+                    }
+
+                }
+                nbr_id_from_bl += (nbr_id_from_bl == (T8C_4NT / 2)) ? 1 : 4;
+            }
+
+        }
+        else if(nt == 16)
+        {
+            /* compute trailing zeros based on nbr_flags for the substitution process of below-left */
+            /* as each bit in nbr flags corresponds to 4 pels for bot_left, left, top and topright but 1 pel for topleft */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 4; /* for bottom left and left */
+
+                if(nbr_id_from_bl == 32)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags >> 16) & 0x1))
+                    {
+                        /* top left not available */
+                        nbr_id_from_bl++;
+                        /* top and top right;  4 pels per nbr bit */
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 4;
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref_u = pu1_dst[2 * nbr_id_from_bl];
+                    pu1_ref_v = pu1_dst[2 * nbr_id_from_bl + 1];
+                    for(i = (2 * (nbr_id_from_bl - 1)); i >= 0; i -= 2)
+                    {
+                        pu1_dst[i] = pu1_ref_u;
+                        pu1_dst[i + 1] = pu1_ref_v;
+                    }
+                }
+            }
+
+            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+            while(nbr_id_from_bl < ((T16C_4NT)+1))
+            {
+                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
+                /* Divide by 4 to obtain the original index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T16C_4NT / 2))
+                {
+                    get_bits = GET_BIT(nbr_flags, 16);
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                    {
+                        pu1_dst[2 * nbr_id_from_bl] = pu1_dst[(2 * nbr_id_from_bl) - 2];
+                        pu1_dst[(2 * nbr_id_from_bl) + 1] = pu1_dst[(2 * nbr_id_from_bl) - 1];
+                    }
+                }
+                else
+                {
+                    get_bits = GET_BIT(nbr_flags, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        UWORD16 *pu2_dst;
+                        /* 4 pel substitution (other than TL) */
+                        pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
+                        ihevc_memset_16bit((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+                    }
+
+                }
+                nbr_id_from_bl += (nbr_id_from_bl == (T16C_4NT / 2)) ? 1 : 4;
+            }
+        }
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Planar Intraprediction with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location  pointed by 'pu1_dst'  Refer
+* to section 8.4.4.2.4 in the standard
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_planar(UWORD8 *pu1_ref,
+                                    WORD32 src_strd,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 dst_strd,
+                                    WORD32 nt,
+                                    WORD32 mode)
+{
+
+    WORD32 row, col;
+    WORD32 log2nt = 5;
+    WORD32 two_nt, three_nt;
+    UNUSED(src_strd);
+    UNUSED(mode);
+    switch(nt)
+    {
+        case 16:
+            log2nt = 4;
+            break;
+        case 8:
+            log2nt = 3;
+            break;
+        case 4:
+            log2nt = 2;
+            break;
+        default:
+            break;
+    }
+    two_nt = 2 * nt;
+    three_nt = 3 * nt;
+    /* Planar filtering */
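+    /* Per-sample planar equation (section 8.4.4.2.4), evaluated below once
+     * per chroma plane with U and V bytes interleaved (x = col / 2, y = row):
+     *   pred[x][y] = ((nt - 1 - x) * left[y] + (x + 1) * top_right
+     *               + (nt - 1 - y) * top[x] + (y + 1) * bottom_left
+     *               + nt) >> (log2nt + 1)
+     */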
+    for(row = 0; row < nt; row++)
+    {
+        for(col = 0; col < (2 * nt); col += 2)
+        {
+            pu1_dst[row * dst_strd + col] = ((nt - 1 - col / 2)
+                            * pu1_ref[2 * (two_nt - 1 - row)]
+                            + (col / 2 + 1) * pu1_ref[2 * (three_nt + 1)]
+                            + (nt - 1 - row) * pu1_ref[2 * (two_nt + 1) + col]
+                            + (row + 1) * pu1_ref[2 * (nt - 1)] + nt) >> (log2nt + 1);
+
+            pu1_dst[row * dst_strd + col + 1] = ((nt - 1 - col / 2)
+                            * pu1_ref[2 * (two_nt - 1 - row) + 1]
+                            + (col / 2 + 1) * pu1_ref[2 * (three_nt + 1) + 1]
+                            + (nt - 1 - row) * pu1_ref[2 * (two_nt + 1) + col + 1]
+                            + (row + 1) * pu1_ref[2 * (nt - 1) + 1] + nt) >> (log2nt + 1);
+        }
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intraprediction for DC mode with reference neighboring  samples location
+* pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
+* to section 8.4.4.2.5 in the standard
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size (Chroma)
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_dc(UWORD8 *pu1_ref,
+                                WORD32 src_strd,
+                                UWORD8 *pu1_dst,
+                                WORD32 dst_strd,
+                                WORD32 nt,
+                                WORD32 mode)
+{
+
+    WORD32 acc_dc_u, acc_dc_v;
+    WORD32 dc_val_u, dc_val_v;
+    WORD32 i;
+    WORD32 row, col;
+    WORD32 log2nt = 5;
+    UNUSED(mode);
+    UNUSED(src_strd);
+    switch(nt)
+    {
+        case 32:
+            log2nt = 5;
+            break;
+        case 16:
+            log2nt = 4;
+            break;
+        case 8:
+            log2nt = 3;
+            break;
+        case 4:
+            log2nt = 2;
+            break;
+        default:
+            break;
+    }
+
+
+    acc_dc_u = 0;
+    acc_dc_v = 0;
+    /* Calculate DC value for the transform block */
+    for(i = (2 * nt); i < (4 * nt); i += 2)
+    {
+        acc_dc_u += pu1_ref[i];
+        acc_dc_v += pu1_ref[i + 1];
+    }
+    for(i = ((4 * nt) + 2); i < ((6 * nt) + 2); i += 2)
+    {
+        acc_dc_u += pu1_ref[i];
+        acc_dc_v += pu1_ref[i + 1];
+    }
+
+
+    dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
+    dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);
+
+
+    /* Fill the block with the DC value (no boundary filtering for chroma) */
+    for(row = 0; row < nt; row++)
+    {
+        for(col = 0; col < (2 * nt); col += 2)
+        {
+            pu1_dst[(row * dst_strd) + col] = dc_val_u;
+            pu1_dst[(row * dst_strd) + col + 1] = dc_val_v;
+        }
+    }
+
+}
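+
+/* Worked example (illustrative): for nt = 4 the two loops above visit nt
+ * left and nt top U samples each (stepping by 2 over the interleaved
+ * data). With every U neighbour equal to 100, acc_dc_u = 8 * 100 = 800
+ * and dc_val_u = (800 + 4) >> (2 + 1) = 100, i.e. the plain average. */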
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Horizontal intraprediction (mode 10) with reference samples location
+* pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
+* to section 8.4.4.2.6 in the standard (Special case)
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_horz(UWORD8 *pu1_ref,
+                                  WORD32 src_strd,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 dst_strd,
+                                  WORD32 nt,
+                                  WORD32 mode)
+{
+
+    WORD32 row, col;
+    UNUSED(mode);
+    UNUSED(src_strd);
+    /* Replicate the left neighbouring sample across each row */
+    for(row = 0; row < nt; row++)
+    {
+        for(col = 0; col < (2 * nt); col += 2)
+        {
+            pu1_dst[(row * dst_strd) + col] = pu1_ref[(4 * nt) - 2 - 2 * row];
+            pu1_dst[(row * dst_strd) + col + 1] = pu1_ref[(4 * nt) - 1 - 2 * row];
+        }
+    }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Vertical intraprediction with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
+* to section 8.4.4.2.6 in the standard (Special case)
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_ver(UWORD8 *pu1_ref,
+                                 WORD32 src_strd,
+                                 UWORD8 *pu1_dst,
+                                 WORD32 dst_strd,
+                                 WORD32 nt,
+                                 WORD32 mode)
+{
+    WORD32 row, col;
+    UNUSED(mode);
+    UNUSED(src_strd);
+    /* Replicate the top neighbouring samples down each column */
+    for(row = 0; row < nt; row++)
+    {
+        for(col = 0; col < (2 * nt); col += 2)
+        {
+            pu1_dst[(row * dst_strd) + col] = pu1_ref[(4 * nt) + 2 + col];
+            pu1_dst[(row * dst_strd) + col + 1] = pu1_ref[(4 * nt) + 3 + col];
+        }
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intraprediction for mode 2 (sw angle) with reference  neighboring samples
+* location pointed by 'pu1_ref' to the  TU block location pointed by
+* 'pu1_dst'  Refer to section 8.4.4.2.6 in the standard
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode2(UWORD8 *pu1_ref,
+                                   WORD32 src_strd,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 dst_strd,
+                                   WORD32 nt,
+                                   WORD32 mode)
+{
+    WORD32 row, col;
+
+    WORD32 intra_pred_ang = 32;
+    WORD32 idx_u, idx_v;
+    UNUSED(src_strd);
+    UNUSED(mode);
+    /* For the 45-degree angle of mode 2, samples are replicated along the */
+    /* corresponding diagonal; intra_pred_ang = tan(angle) in Q5 format */
+    for(col = 0; col < (2 * nt); col += 2)
+    {
+        idx_u = ((col + 1) * intra_pred_ang) >> 5; /* equals col + 1 for the Q5 angle 32 */
+        idx_v = (((col + 1) + 1) * intra_pred_ang) >> 5; /* equals col + 2 */
+        for(row = 0; row < nt; row++)
+        {
+            pu1_dst[col + (row * dst_strd)] = pu1_ref[(4 * nt) - 2 * row - idx_u - 3];
+            pu1_dst[(col + 1) + (row * dst_strd)] = pu1_ref[(4 * nt) - 2 * row - idx_v - 1];
+        }
+    }
+
+}
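+
+/* Note on the Q5 angle format (illustrative): intra_pred_ang = 32 encodes
+ * tan(45 degrees) = 1.0 in Q5, so idx_u = ((col + 1) * 32) >> 5 reduces to
+ * col + 1 and the prediction advances one reference sample per column,
+ * i.e. pure replication along the 45-degree diagonal. */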
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intraprediction for mode 34 (ne angle) and  mode 18 (nw angle) with
+* reference  neighboring samples location pointed by 'pu1_ref' to the  TU
+* block location pointed by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_18_34(UWORD8 *pu1_ref,
+                                        WORD32 src_strd,
+                                        UWORD8 *pu1_dst,
+                                        WORD32 dst_strd,
+                                        WORD32 nt,
+                                        WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 intra_pred_ang;
+    WORD32 idx = 0;
+    UNUSED(src_strd);
+    intra_pred_ang = 32; /* Default value */
+    /* For mode 18, the angle is -45 degrees */
+    if(mode == 18)
+        intra_pred_ang = -32;
+    /* For mode 34, the angle is 45 degrees */
+    else if(mode == 34)
+        intra_pred_ang = 32;
+    /* For the 45 and -45 degree angles, samples are replicated along the */
+    /* corresponding diagonal; no interpolation is needed */
+    for(row = 0; row < nt; row++)
+    {
+        idx = ((row + 1) * intra_pred_ang) >> 5;
+
+        for(col = 0; col < (2 * nt); col += 2)
+        {
+            pu1_dst[col + (row * dst_strd)] = pu1_ref[(4 * nt) + col + 2 * idx + 2];
+            pu1_dst[(col + 1) + (row * dst_strd)] = pu1_ref[(4 * nt) + (col + 1) + 2 * idx + 2];
+        }
+
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intraprediction for modes 3 to 9 (positive angle, horizontal modes) with
+* reference  neighboring samples location pointed by 'pu1_ref' to the  TU
+* block location pointed by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_3_to_9(UWORD8 *pu1_ref,
+                                         WORD32 src_strd,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 dst_strd,
+                                         WORD32 nt,
+                                         WORD32 mode)
+{
+    WORD32 row, col;
+
+    WORD32 intra_pred_ang;
+    WORD32 idx_u, ref_main_idx_u;
+    WORD32 idx_v, ref_main_idx_v;
+    WORD32 pos_u, fract_u;
+    WORD32 pos_v, fract_v;
+    UNUSED(src_strd);
+    /* Intra Pred Angle according to the mode */
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by the fractional position, to obtain each destination sample */
+
+    for(col = 0; col < (2 * nt); col += 2)
+    {
+        pos_u = ((col / 2 + 1) * intra_pred_ang);
+        pos_v = ((col / 2 + 1) * intra_pred_ang);
+
+        idx_u = pos_u >> 5;
+        fract_u = pos_u & (31);
+
+        idx_v = pos_v >> 5;
+        fract_v = pos_v & (31);
+        // Do linear filtering
+        for(row = 0; row < nt; row++)
+        {
+            ref_main_idx_u = (4 * nt) - 2 * row - 2 * idx_u - 2;
+            ref_main_idx_v = (4 * nt) - 2 * row - 2 * idx_v - 1;
+
+            pu1_dst[col + (row * dst_strd)] = (((32 - fract_u)
+                            * pu1_ref[ref_main_idx_u]
+                            + fract_u * pu1_ref[ref_main_idx_u - 2] + 16) >> 5);
+
+            pu1_dst[(col + 1) + (row * dst_strd)] = (((32 - fract_v)
+                            * pu1_ref[ref_main_idx_v]
+                            + fract_v * pu1_ref[ref_main_idx_v - 2] + 16) >> 5);
+        }
+
+    }
+
+}
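+
+/* Worked example (illustrative): for intra_pred_ang = 26 (the angle of
+ * mode 3) and col = 0, pos_u = 1 * 26 = 26, giving idx_u = 26 >> 5 = 0 and
+ * fract_u = 26. The output is then the two-tap blend
+ * ((32 - 26) * pu1_ref[i] + 26 * pu1_ref[i - 2] + 16) >> 5, i.e. weights
+ * of 6/32 and 26/32 on the two neighbouring U samples. */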
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intraprediction for modes 11 to 17 (negative angle, horizontal modes)
+* with reference  neighboring samples location pointed by 'pu1_ref' to the
+* TU block location pointed by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_11_to_17(UWORD8 *pu1_ref,
+                                           WORD32 src_strd,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 dst_strd,
+                                           WORD32 nt,
+                                           WORD32 mode)
+{
+    /* This function and ihevc_intra_pred_chroma_mode_19_to_25 are the same */
+    /* except for the ref main & side sample assignment; they can be */
+    /* combined for optimization */
+
+    WORD32 row, col, k;
+    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+    WORD32 idx_u, idx_v, ref_main_idx_u, ref_main_idx_v, ref_idx;
+    WORD32 pos_u, pos_v, fract_u, fract_v;
+
+    UWORD8 ref_temp[2 * MAX_CU_SIZE + 2];
+    UWORD8 *ref_main;
+    UNUSED(src_strd);
+    inv_ang_sum = 128;
+
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+    /* Intermediate reference samples for negative angle modes */
+    /* These should be removed during optimization */
+
+    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+
+
+    ref_main = ref_temp + 2 * nt;
+    for(k = 0; k < (2 * (nt + 1)); k += 2)
+    {
+        ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k];
+        ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k + 1];
+    }
+
+    ref_main = ref_temp + (2 * (nt - 1));
+    ref_idx = (nt * intra_pred_ang) >> 5;
+
+    /* SIMD optimization can be done using a look-up table for this loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+    for(k = -2; k > (2 * ref_idx); k -= 2)
+    {
+        inv_ang_sum += inv_ang;
+        ref_main[k] = pu1_ref[(4 * nt) + ((inv_ang_sum >> 8) << 1)];
+        ref_main[k + 1] = pu1_ref[((4 * nt) + 1) + ((inv_ang_sum >> 8) << 1)];
+    }
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by the fractional position, to obtain each destination sample */
+    for(col = 0; col < (2 * nt); col += 2)
+    {
+        pos_u = ((col / 2 + 1) * intra_pred_ang);
+        pos_v = ((col / 2 + 1) * intra_pred_ang);
+        idx_u = pos_u >> 5;
+        idx_v = pos_v >> 5;
+        fract_u = pos_u & (31);
+        fract_v = pos_v & (31);
+
+        // Do linear filtering
+        for(row = 0; row < nt; row++)
+        {
+            ref_main_idx_u = 2 * (row + idx_u + 1);
+            ref_main_idx_v = 2 * (row + idx_v + 1) + 1;
+
+            pu1_dst[col + (dst_strd * row)] = (UWORD8)(((32 - fract_u)
+                            * ref_main[ref_main_idx_u]
+                            + fract_u * ref_main[ref_main_idx_u + 2] + 16) >> 5);
+            pu1_dst[(col + 1) + (dst_strd * row)] = (UWORD8)(((32 - fract_v)
+                            * ref_main[ref_main_idx_v]
+                            + fract_v * ref_main[ref_main_idx_v + 2] + 16) >> 5);
+
+        }
+
+    }
+
+}
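+
+/* Note (illustrative): inv_ang is the inverse-angle step in Q8, per the
+ * standard's invAngle definition (8192 / intra_pred_ang), so
+ * (inv_ang_sum >> 8) is the projected whole-sample offset into the side
+ * reference and the << 1 scales that offset to the 2-byte interleaved UV
+ * layout. */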
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intraprediction for modes 19 to 25 (negative angle, vertical modes) with
+* reference  neighboring samples location pointed by 'pu1_ref' to the  TU
+* block location pointed by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_19_to_25(UWORD8 *pu1_ref,
+                                           WORD32 src_strd,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 dst_strd,
+                                           WORD32 nt,
+                                           WORD32 mode)
+{
+
+    WORD32 row, col, k;
+    WORD32 intra_pred_ang, idx;
+    WORD32 inv_ang, inv_ang_sum, pos, fract;
+    WORD32 ref_main_idx_u, ref_main_idx_v, ref_idx;
+    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 2];
+    UWORD8 *ref_main;
+    UNUSED(src_strd);
+
+
+    intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
+    inv_ang = gai4_ihevc_inv_ang_table_chroma[mode - 12];
+
+    /* Intermediate reference samples for negative angle modes */
+    /* These should be removed during optimization */
+    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+    ref_main = ref_temp + 2 * nt;
+    for(k = 0; k < (2 * (nt + 1)); k += 2)
+    {
+        ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k];
+        ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k + 1];
+    }
+
+
+    ref_idx = (nt * intra_pred_ang) >> 5;
+    inv_ang_sum = 128;
+    ref_main = ref_temp + (2 * (nt - 1));
+    /* SIMD optimization can be done using a look-up table for this loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+    for(k = -2; k > (2 * ref_idx); k -= 2)
+    {
+        inv_ang_sum += inv_ang;
+        ref_main[k] = pu1_ref[(4 * nt) - (inv_ang_sum >> 8) * 2];
+        ref_main[k + 1] = pu1_ref[((4 * nt) + 1) - (inv_ang_sum >> 8) * 2];
+    }
+
+    for(row = 0; row < nt; row++)
+    {
+        pos = ((row + 1) * intra_pred_ang);
+        idx = pos >> 5;
+        fract = pos & (31);
+
+        // Do linear filtering
+        for(col = 0; col < (2 * nt); col += 2)
+        {
+            ref_main_idx_u = col + 2 * idx + 2;
+            ref_main_idx_v = (col + 1) + 2 * idx + 2;
+            pu1_dst[(row * dst_strd) + col] = (UWORD8)(((32 - fract)
+                            * ref_main[ref_main_idx_u]
+                            + fract * ref_main[ref_main_idx_u + 2] + 16) >> 5);
+            pu1_dst[(row * dst_strd) + (col + 1)] = (UWORD8)(((32 - fract)
+                            * ref_main[ref_main_idx_v]
+                            + fract * ref_main[ref_main_idx_v + 2] + 16) >> 5);
+
+        }
+
+    }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intraprediction for modes 27 to 33 (positive angle, vertical modes) with
+* reference  neighboring samples location pointed by 'pu1_ref' to the  TU
+* block location pointed by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_27_to_33(UWORD8 *pu1_ref,
+                                           WORD32 src_strd,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 dst_strd,
+                                           WORD32 nt,
+                                           WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 pos, fract;
+    WORD32 intra_pred_ang;
+    WORD32 idx, ref_main_idx_u, ref_main_idx_v;
+    UNUSED(src_strd);
+
+
+    intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
+
+    for(row = 0; row < nt; row++)
+    {
+        pos = ((row + 1) * intra_pred_ang);
+        idx = pos >> 5;
+        fract = pos & (31);
+
+
+        // Do linear filtering
+        for(col = 0; col < (2 * nt); col += 2)
+        {
+            ref_main_idx_u = (4 * nt) + col + 2 * idx + 2;
+            ref_main_idx_v = (4 * nt) + (col + 1) + 2 * idx + 2;
+            pu1_dst[col + (row * dst_strd)] = (((32 - fract)
+                            * pu1_ref[ref_main_idx_u]
+                            + fract * pu1_ref[ref_main_idx_u + 2] + 16) >> 5);
+            pu1_dst[(col + 1) + (row * dst_strd)] = (((32 - fract)
+                            * pu1_ref[ref_main_idx_v]
+                            + fract * pu1_ref[ref_main_idx_v + 2] + 16) >> 5);
+
+        }
+    }
+
+}
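+
+/* Illustrative usage sketch (compiled out): predicting a 4x4 chroma TU
+ * with mode 30. Buffer sizes and contents are assumptions made for
+ * illustration only. */
+#if 0
+static void example_chroma_mode_27_to_33(void)
+{
+    UWORD8 au1_ref[2 * (4 * 4 + 1)]; /* interleaved UV neighbour samples */
+    UWORD8 au1_dst[4 * 8];
+
+    memset(au1_ref, 128, sizeof(au1_ref));
+    ihevc_intra_pred_chroma_mode_27_to_33(au1_ref, 0, au1_dst, 8, 4, 30);
+}
+#endif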
+
diff --git a/common/ihevc_chroma_iquant_itrans_recon.c b/common/ihevc_chroma_iquant_itrans_recon.c
new file mode 100644
index 0000000..479aebd
--- /dev/null
+++ b/common/ihevc_chroma_iquant_itrans_recon.c
@@ -0,0 +1,256 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_chroma_iquant_itrans_recon.c
+ *
+ * @brief
+ *  Contains function definitions for inverse  quantization, inverse
+ * transform and reconstruction  of chroma interleaved data.
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *   - ihevc_chroma_iquant_itrans_recon_4x4()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_iquant_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of the interleaved data, */
+/* depending on the pointer passed to them. */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the */
+/* functions operate on the U component; if it points to the second byte (V), */
+/* they operate on the V component. */
+
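+/* Illustrative sketch (compiled out): the component is selected purely
+ * through pointer offsets, as described above. The strides and QP values
+ * below are assumptions made for illustration only. */
+#if 0
+static void example_operate_on_v_component(WORD16 *pi2_coeffs,
+                                           WORD16 *pi2_tmp,
+                                           UWORD8 *pu1_pred,
+                                           WORD16 *pi2_dequant_coeff,
+                                           UWORD8 *pu1_dst)
+{
+    /* '+ 1' skips the U byte so every load/store inside lands on V */
+    ihevc_chroma_iquant_itrans_recon_4x4(pi2_coeffs, pi2_tmp, pu1_pred + 1,
+                                         pi2_dequant_coeff, pu1_dst + 1,
+                                         4, 2, 4, 8, 8, 0, 0);
+}
+#endif
+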
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization, inverse  transform and
+ * reconstruction for 4x4 input block
+ *
+ * @par Description:
+ *  Performs inverse quantization , inverse transform  and adds the
+ * prediction data and clips output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse transform
+ *  1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ *  Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] qp_div
+ *  Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ *  Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero Rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_iquant_itrans_recon_4x4(WORD16 *pi2_src,
+                                          WORD16 *pi2_tmp,
+                                          UWORD8 *pu1_pred,
+                                          WORD16 *pi2_dequant_coeff,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 qp_div, /* qpscaled / 6 */
+                                          WORD32 qp_rem, /* qpscaled % 6 */
+                                          WORD32 src_strd,
+                                          WORD32 pred_strd,
+                                          WORD32 dst_strd,
+                                          WORD32 zero_cols,
+                                          WORD32 zero_rows)
+{
+    UNUSED(zero_rows);
+
+    /* Inverse Transform */
+    {
+        WORD32 j;
+        WORD32 e[2], o[2];
+        WORD32 add;
+        WORD32 shift;
+        WORD16 *pi2_tmp_orig;
+        WORD32 shift_iq;
+        WORD32 trans_size;
+        /* Inverse Quantization constants */
+        {
+            WORD32 log2_trans_size, bit_depth;
+
+            log2_trans_size = 2;
+            bit_depth = 8 + 0; /* 8-bit base depth plus zero bit-depth increment */
+            shift_iq = bit_depth + log2_trans_size - 5;
+        }
+
+        trans_size = TRANS_SIZE_4;
+        pi2_tmp_orig = pi2_tmp;
+
+        /* Inverse Transform 1st stage */
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < trans_size; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                WORD32 iq_tmp_1, iq_tmp_2;
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                IQUANT_4x4(iq_tmp_1,
+                           pi2_src[1 * src_strd],
+                           pi2_dequant_coeff[1 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                IQUANT_4x4(iq_tmp_2,
+                           pi2_src[3 * src_strd],
+                           pi2_dequant_coeff[3 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+
+                o[0] = g_ai2_ihevc_trans_4[1][0] * iq_tmp_1
+                                + g_ai2_ihevc_trans_4[3][0] * iq_tmp_2;
+                o[1] = g_ai2_ihevc_trans_4[1][1] * iq_tmp_1
+                                + g_ai2_ihevc_trans_4[3][1] * iq_tmp_2;
+
+                IQUANT_4x4(iq_tmp_1,
+                           pi2_src[0 * src_strd],
+                           pi2_dequant_coeff[0 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                IQUANT_4x4(iq_tmp_2,
+                           pi2_src[2 * src_strd],
+                           pi2_dequant_coeff[2 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+
+                e[0] = g_ai2_ihevc_trans_4[0][0] * iq_tmp_1
+                                + g_ai2_ihevc_trans_4[2][0] * iq_tmp_2;
+                e[1] = g_ai2_ihevc_trans_4[0][1] * iq_tmp_1
+                                + g_ai2_ihevc_trans_4[2][1] * iq_tmp_2;
+
+                pi2_tmp[0] =
+                                CLIP_S16(((e[0] + o[0] + add) >> shift));
+                pi2_tmp[1] =
+                                CLIP_S16(((e[1] + o[1] + add) >> shift));
+                pi2_tmp[2] =
+                                CLIP_S16(((e[1] - o[1] + add) >> shift));
+                pi2_tmp[3] =
+                                CLIP_S16(((e[0] - o[0] + add) >> shift));
+            }
+            pi2_src++;
+            pi2_dequant_coeff++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < trans_size; j++)
+        {
+            WORD32 itrans_out;
+
+            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+            o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_tmp[trans_size]
+                            + g_ai2_ihevc_trans_4[3][0]
+                                            * pi2_tmp[3 * trans_size];
+            o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_tmp[trans_size]
+                            + g_ai2_ihevc_trans_4[3][1]
+                                            * pi2_tmp[3 * trans_size];
+            e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_tmp[0]
+                            + g_ai2_ihevc_trans_4[2][0]
+                                            * pi2_tmp[2 * trans_size];
+            e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_tmp[0]
+                            + g_ai2_ihevc_trans_4[2][1]
+                                            * pi2_tmp[2 * trans_size];
+
+            itrans_out =
+                            CLIP_S16(((e[0] + o[0] + add) >> shift));
+            pu1_dst[0 * 2] = CLIP_U8((itrans_out + pu1_pred[0 * 2]));
+
+            itrans_out =
+                            CLIP_S16(((e[1] + o[1] + add) >> shift));
+            pu1_dst[1 * 2] = CLIP_U8((itrans_out + pu1_pred[1 * 2]));
+
+            itrans_out =
+                            CLIP_S16(((e[1] - o[1] + add) >> shift));
+            pu1_dst[2 * 2] = CLIP_U8((itrans_out + pu1_pred[2 * 2]));
+
+            itrans_out =
+                            CLIP_S16(((e[0] - o[0] + add) >> shift));
+            pu1_dst[3 * 2] = CLIP_U8((itrans_out + pu1_pred[3 * 2]));
+
+            pi2_tmp++;
+            pu1_pred += pred_strd;
+            pu1_dst += dst_strd;
+
+        }
+    }
+}
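+
+/* Worked example (illustrative): for the 8-bit 4x4 case above,
+ * shift_iq = bit_depth + log2_trans_size - 5 = 8 + 2 - 5 = 5, matching the
+ * dequantization scaling the standard specifies for a 4x4 transform. */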
diff --git a/common/ihevc_chroma_iquant_itrans_recon.h b/common/ihevc_chroma_iquant_itrans_recon.h
new file mode 100644
index 0000000..1cacfc5
--- /dev/null
+++ b/common/ihevc_chroma_iquant_itrans_recon.h
@@ -0,0 +1,135 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_chroma_iquant_itrans_recon.h
+*
+* @brief
+*  Functions declarations for inverse quantization,  inverse transform and
+* reconstruction  of chroma interleaved data.
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IHEVC_CHROMA_IQUANT_ITRANS_RECON_H_
+#define _IHEVC_CHROMA_IQUANT_ITRANS_RECON_H_
+
+typedef void ihevc_chroma_iquant_itrans_recon_4x4_ft(WORD16 *pi2_src,
+                                                     WORD16 *pi2_tmp,
+                                                     UWORD8 *pu1_pred,
+                                                     WORD16 *pi2_dequant_coeff,
+                                                     UWORD8 *pu1_dst,
+                                                     WORD32 qp_div, /* qpscaled / 6 */
+                                                     WORD32 qp_rem, /* qpscaled % 6 */
+                                                     WORD32 src_strd,
+                                                     WORD32 pred_strd,
+                                                     WORD32 dst_strd,
+                                                     WORD32 zero_cols,
+                                                     WORD32 zero_rows);
+
+typedef void ihevc_hbd_chroma_iquant_itrans_recon_4x4_ft(WORD16 *pi2_src,
+                                                         WORD16 *pi2_tmp,
+                                                         UWORD16 *pu2_pred,
+                                                         WORD16 *pi2_dequant_coeff,
+                                                         UWORD16 *pu2_dst,
+                                                         WORD32 qp_div, /* qpscaled / 6 */
+                                                         WORD32 qp_rem, /* qpscaled % 6 */
+                                                         WORD32 src_strd,
+                                                         WORD32 pred_strd,
+                                                         WORD32 dst_strd,
+                                                         WORD32 zero_cols,
+                                                         WORD32 zero_rows,
+                                                         UWORD8 bit_depth);
+
+typedef void ihevc_chroma_iquant_itrans_recon_8x8_ft(WORD16 *pi2_src,
+                                                     WORD16 *pi2_tmp,
+                                                     UWORD8 *pu1_pred,
+                                                     WORD16 *pi2_dequant_coeff,
+                                                     UWORD8 *pu1_dst,
+                                                     WORD32 qp_div, /* qpscaled / 6 */
+                                                     WORD32 qp_rem, /* qpscaled % 6 */
+                                                     WORD32 src_strd,
+                                                     WORD32 pred_strd,
+                                                     WORD32 dst_strd,
+                                                     WORD32 zero_cols,
+                                                     WORD32 zero_rows);
+
+typedef void ihevc_hbd_chroma_iquant_itrans_recon_8x8_ft(WORD16 *pi2_src,
+                                                         WORD16 *pi2_tmp,
+                                                         UWORD16 *pu2_pred,
+                                                         WORD16 *pi2_dequant_coeff,
+                                                         UWORD16 *pu2_dst,
+                                                         WORD32 qp_div, /* qpscaled / 6 */
+                                                         WORD32 qp_rem, /* qpscaled % 6 */
+                                                         WORD32 src_strd,
+                                                         WORD32 pred_strd,
+                                                         WORD32 dst_strd,
+                                                         WORD32 zero_cols,
+                                                         WORD32 zero_rows,
+                                                         UWORD8 bit_depth);
+
+typedef void ihevc_chroma_iquant_itrans_recon_16x16_ft(WORD16 *pi2_src,
+                                                       WORD16 *pi2_tmp,
+                                                       UWORD8 *pu1_pred,
+                                                       WORD16 *pi2_dequant_coeff,
+                                                       UWORD8 *pu1_dst,
+                                                       WORD32 qp_div, /* qpscaled / 6 */
+                                                       WORD32 qp_rem, /* qpscaled % 6 */
+                                                       WORD32 src_strd,
+                                                       WORD32 pred_strd,
+                                                       WORD32 dst_strd,
+                                                       WORD32 zero_cols,
+                                                       WORD32 zero_rows);
+
+typedef void ihevc_hbd_chroma_iquant_itrans_recon_16x16_ft(WORD16 *pi2_src,
+                                                           WORD16 *pi2_tmp,
+                                                           UWORD16 *pu2_pred,
+                                                           WORD16 *pi2_dequant_coeff,
+                                                           UWORD16 *pu2_dst,
+                                                           WORD32 qp_div, /* qpscaled / 6 */
+                                                           WORD32 qp_rem, /* qpscaled % 6 */
+                                                           WORD32 src_strd,
+                                                           WORD32 pred_strd,
+                                                           WORD32 dst_strd,
+                                                           WORD32 zero_cols,
+                                                           WORD32 zero_rows,
+                                                           UWORD8 bit_depth);
+
+ihevc_chroma_iquant_itrans_recon_4x4_ft ihevc_chroma_iquant_itrans_recon_4x4;
+ihevc_hbd_chroma_iquant_itrans_recon_4x4_ft ihevc_hbd_chroma_iquant_itrans_recon_4x4;
+ihevc_chroma_iquant_itrans_recon_8x8_ft ihevc_chroma_iquant_itrans_recon_8x8;
+ihevc_hbd_chroma_iquant_itrans_recon_8x8_ft ihevc_hbd_chroma_iquant_itrans_recon_8x8;
+ihevc_chroma_iquant_itrans_recon_16x16_ft ihevc_chroma_iquant_itrans_recon_16x16;
+ihevc_hbd_chroma_iquant_itrans_recon_16x16_ft ihevc_hbd_chroma_iquant_itrans_recon_16x16;
+
+ihevc_chroma_iquant_itrans_recon_4x4_ft ihevc_chroma_iquant_itrans_recon_4x4_sse42;
+ihevc_hbd_chroma_iquant_itrans_recon_4x4_ft ihevc_hbd_chroma_iquant_itrans_recon_4x4_sse42;
+ihevc_chroma_iquant_itrans_recon_8x8_ft ihevc_chroma_iquant_itrans_recon_8x8_sse42;
+ihevc_hbd_chroma_iquant_itrans_recon_8x8_ft ihevc_hbd_chroma_iquant_itrans_recon_8x8_sse42;
+ihevc_chroma_iquant_itrans_recon_16x16_ft ihevc_chroma_iquant_itrans_recon_16x16_sse42;
+ihevc_hbd_chroma_iquant_itrans_recon_16x16_ft ihevc_hbd_chroma_iquant_itrans_recon_16x16_sse42;
+
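+/* Illustrative note: these typedefs let the decoder bind either the C or
+ * the SSE4.2 variant through one function pointer. A hypothetical selector
+ * (has_sse42 is an assumption for illustration):
+ *
+ *   ihevc_chroma_iquant_itrans_recon_4x4_ft *pf_recon =
+ *       has_sse42 ? &ihevc_chroma_iquant_itrans_recon_4x4_sse42
+ *                 : &ihevc_chroma_iquant_itrans_recon_4x4;
+ */
+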
+#endif /*_IHEVC_CHROMA_IQUANT_ITRANS_RECON_H_*/
diff --git a/common/ihevc_chroma_iquant_recon.c b/common/ihevc_chroma_iquant_recon.c
new file mode 100644
index 0000000..cba9eb1
--- /dev/null
+++ b/common/ihevc_chroma_iquant_recon.c
@@ -0,0 +1,398 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_chroma_iquant_recon.c
+ *
+ * @brief
+ *  Contains function definitions for inverse  quantization and
+ * reconstruction  of chroma interleaved data.
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *   - ihevc_chroma_iquant_recon_4x4()
+ *   - ihevc_chroma_iquant_recon_8x8()
+ *   - ihevc_chroma_iquant_recon_16x16()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_iquant_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of the interleaved data, */
+/* depending on the pointer passed to them. */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the */
+/* functions operate on the U component; if it points to the second byte (V), */
+/* they operate on the V component. */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization and  reconstruction for 4x4
+ * input block
+ *
+ * @par Description:
+ *  This function performs inverse quantization and  reconstruction for 4x4
+ * input block
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ *  Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] qp_div
+ *  Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ *  Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_iquant_recon_4x4(WORD16 *pi2_src,
+                                   UWORD8 *pu1_pred,
+                                   WORD16 *pi2_dequant_coeff,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 qp_div, /* qpscaled / 6 */
+                                   WORD32 qp_rem, /* qpscaled % 6 */
+                                   WORD32 src_strd,
+                                   WORD32 pred_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 zero_cols)
+{
+
+    {
+        /* Inverse Quant and recon */
+        {
+            WORD32 i, j;
+            WORD32 shift_iq;
+            WORD32 trans_size;
+            /* Inverse Quantization constants */
+            {
+                WORD32 log2_trans_size, bit_depth;
+
+                log2_trans_size = 2;
+                bit_depth = 8 + 0;
+                shift_iq = bit_depth + log2_trans_size - 5;
+            }
+
+            trans_size = TRANS_SIZE_4;
+
+            for(i = 0; i < trans_size; i++)
+            {
+                /* Checking for Zero Cols */
+                if((zero_cols & 1) == 1)
+                {
+                    for(j = 0; j < trans_size; j++)
+                        pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+                }
+                else
+                {
+                    for(j = 0; j < trans_size; j++)
+                    {
+                        WORD32 iquant_out;
+                        IQUANT_4x4(iquant_out,
+                                   pi2_src[j * src_strd],
+                                   pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                                   shift_iq, qp_div);
+                        iquant_out = (iquant_out + 16) >> 5;
+                        pu1_dst[j * dst_strd] =
+                                        CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+                    }
+                }
+                pi2_src++;
+                pi2_dequant_coeff++;
+                pu1_pred += 2;
+                pu1_dst += 2;
+
+                zero_cols = zero_cols >> 1;
+            }
+        }
+    }
+}
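+
+/* Worked example (illustrative): the (iquant_out + 16) >> 5 step is a
+ * final rounding right-shift that normalizes the dequantized coefficient
+ * before it is added to the prediction; e.g. iquant_out = 100 becomes
+ * (100 + 16) >> 5 = 3, which is then added to pu1_pred and clipped to
+ * 8 bits. */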
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization and  reconstruction for 8x8
+ * input block
+ *
+ * @par Description:
+ *  This function performs inverse quantization and  reconstruction for 8x8
+ * input block
+ *
+ * @param[in] pi2_src
+ *  Input 8x8 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 8x8 block
+ *
+ * @param[in] pi2_dequant_coeff
+ *  Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ *  Output 8x8 block
+ *
+ * @param[in] qp_div
+ *  Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ *  Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_iquant_recon_8x8(WORD16 *pi2_src,
+                                   UWORD8 *pu1_pred,
+                                   WORD16 *pi2_dequant_coeff,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 qp_div, /* qpscaled / 6 */
+                                   WORD32 qp_rem, /* qpscaled % 6 */
+                                   WORD32 src_strd,
+                                   WORD32 pred_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 zero_cols)
+{
+
+    {
+        /* Inverse Quant and recon */
+        {
+            WORD32 i, j;
+            WORD32 shift_iq;
+            WORD32 trans_size;
+            /* Inverse Quantization constants */
+            {
+                WORD32 log2_trans_size, bit_depth;
+
+                log2_trans_size = 3;
+                bit_depth = 8 + 0;
+                shift_iq = bit_depth + log2_trans_size - 5;
+            }
+
+            trans_size = TRANS_SIZE_8;
+
+            for(i = 0; i < trans_size; i++)
+            {
+                /* Checking for Zero Cols */
+                if((zero_cols & 1) == 1)
+                {
+                    for(j = 0; j < trans_size; j++)
+                        pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+                }
+                else
+                {
+                    for(j = 0; j < trans_size; j++)
+                    {
+                        WORD32 iquant_out;
+                        IQUANT(iquant_out,
+                               pi2_src[j * src_strd],
+                               pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                               shift_iq, qp_div);
+                        iquant_out = (iquant_out + 16) >> 5;
+                        pu1_dst[j * dst_strd] =
+                                        CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+                    }
+                }
+                pi2_src++;
+                pi2_dequant_coeff++;
+                pu1_pred += 2;
+                pu1_dst += 2;
+
+                zero_cols = zero_cols >> 1;
+            }
+        }
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization and  reconstruction for 16x16
+ * input block
+ *
+ * @par Description:
+ *  This function performs inverse quantization and  reconstruction for 16x16
+ * input block
+ *
+ * @param[in] pi2_src
+ *  Input 16x16 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 16x16 block
+ *
+ * @param[in] pi2_dequant_coeff
+ *  Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ *  Output 16x16 block
+ *
+ * @param[in] qp_div
+ *  Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ *  Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_iquant_recon_16x16(WORD16 *pi2_src,
+                                     UWORD8 *pu1_pred,
+                                     WORD16 *pi2_dequant_coeff,
+                                     UWORD8 *pu1_dst,
+                                     WORD32 qp_div, /* qpscaled / 6 */
+                                     WORD32 qp_rem, /* qpscaled % 6 */
+                                     WORD32 src_strd,
+                                     WORD32 pred_strd,
+                                     WORD32 dst_strd,
+                                     WORD32 zero_cols)
+
+{
+
+    {
+        /* Inverse Quant and recon */
+        {
+            WORD32 i, j;
+            WORD32 shift_iq;
+            WORD32 trans_size;
+            /* Inverse Quantization constants */
+            {
+                WORD32 log2_trans_size, bit_depth;
+
+                log2_trans_size = 4;
+                bit_depth = 8 + 0;
+                shift_iq = bit_depth + log2_trans_size - 5;
+            }
+
+            trans_size = TRANS_SIZE_16;
+
+            for(i = 0; i < trans_size; i++)
+            {
+                /* Checking for Zero Cols */
+                if((zero_cols & 1) == 1)
+                {
+                    for(j = 0; j < trans_size; j++)
+                        pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+                }
+                else
+                {
+                    for(j = 0; j < trans_size; j++)
+                    {
+                        WORD32 iquant_out;
+                        IQUANT(iquant_out,
+                               pi2_src[j * src_strd],
+                               pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                               shift_iq, qp_div);
+                        iquant_out = (iquant_out + 16) >> 5;
+                        pu1_dst[j * dst_strd] =
+                                        CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+                    }
+                }
+                pi2_src++;
+                pi2_dequant_coeff++;
+                pu1_pred += 2;
+                pu1_dst += 2;
+
+                zero_cols = zero_cols >> 1;
+            }
+        }
+    }
+}
+
+
diff --git a/common/ihevc_chroma_iquant_recon.h b/common/ihevc_chroma_iquant_recon.h
new file mode 100644
index 0000000..8f6a043
--- /dev/null
+++ b/common/ihevc_chroma_iquant_recon.h
@@ -0,0 +1,111 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_chroma_iquant_recon.h
+*
+* @brief
+*  Functions declarations for inverse quantization and  reconstruction  of
+* chroma interleaved data.
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IHEVC_CHROMA_IQUANT_RECON_H_
+#define _IHEVC_CHROMA_IQUANT_RECON_H_
+
+typedef void ihevc_chroma_iquant_recon_4x4_ft(WORD16 *pi2_src,
+                                              UWORD8 *pu1_pred,
+                                              WORD16 *pi2_dequant_coeff,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 qp_div, /* qpscaled / 6 */
+                                              WORD32 qp_rem, /* qpscaled % 6 */
+                                              WORD32 src_strd,
+                                              WORD32 pred_strd,
+                                              WORD32 dst_strd,
+                                              WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_iquant_recon_4x4_ft(WORD16 *pi2_src,
+                                                  UWORD16 *pu2_pred,
+                                                  WORD16 *pi2_dequant_coeff,
+                                                  UWORD16 *pu2_dst,
+                                                  WORD32 qp_div, /* qpscaled / 6 */
+                                                  WORD32 qp_rem, /* qpscaled % 6 */
+                                                  WORD32 src_strd,
+                                                  WORD32 pred_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 zero_cols,
+                                                  UWORD8 bit_depth);
+typedef void ihevc_chroma_iquant_recon_8x8_ft(WORD16 *pi2_src,
+                                              UWORD8 *pu1_pred,
+                                              WORD16 *pi2_dequant_coeff,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 qp_div, /* qpscaled / 6 */
+                                              WORD32 qp_rem, /* qpscaled % 6 */
+                                              WORD32 src_strd,
+                                              WORD32 pred_strd,
+                                              WORD32 dst_strd,
+                                              WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_iquant_recon_8x8_ft(WORD16 *pi2_src,
+                                                  UWORD16 *pu2_pred,
+                                                  WORD16 *pi2_dequant_coeff,
+                                                  UWORD16 *pu2_dst,
+                                                  WORD32 qp_div, /* qpscaled / 6 */
+                                                  WORD32 qp_rem, /* qpscaled % 6 */
+                                                  WORD32 src_strd,
+                                                  WORD32 pred_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 zero_cols,
+                                                  UWORD8 bit_depth);
+typedef void ihevc_chroma_iquant_recon_16x16_ft(WORD16 *pi2_src,
+                                                UWORD8 *pu1_pred,
+                                                WORD16 *pi2_dequant_coeff,
+                                                UWORD8 *pu1_dst,
+                                                WORD32 qp_div, /* qpscaled / 6 */
+                                                WORD32 qp_rem, /* qpscaled % 6 */
+                                                WORD32 src_strd,
+                                                WORD32 pred_strd,
+                                                WORD32 dst_strd,
+                                                WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_iquant_recon_16x16_ft(WORD16 *pi2_src,
+                                                    UWORD16 *pu2_pred,
+                                                    WORD16 *pi2_dequant_coeff,
+                                                    UWORD16 *pu2_dst,
+                                                    WORD32 qp_div, /* qpscaled / 6 */
+                                                    WORD32 qp_rem, /* qpscaled % 6 */
+                                                    WORD32 src_strd,
+                                                    WORD32 pred_strd,
+                                                    WORD32 dst_strd,
+                                                    WORD32 zero_cols,
+                                                    UWORD8 bit_depth);
+
+ihevc_chroma_iquant_recon_4x4_ft ihevc_chroma_iquant_recon_4x4;
+ihevc_hbd_chroma_iquant_recon_4x4_ft ihevc_hbd_chroma_iquant_recon_4x4;
+ihevc_chroma_iquant_recon_8x8_ft ihevc_chroma_iquant_recon_8x8;
+ihevc_hbd_chroma_iquant_recon_8x8_ft ihevc_hbd_chroma_iquant_recon_8x8;
+ihevc_chroma_iquant_recon_16x16_ft ihevc_chroma_iquant_recon_16x16;
+ihevc_hbd_chroma_iquant_recon_16x16_ft ihevc_hbd_chroma_iquant_recon_16x16;
+
+#endif /*_IHEVC_CHROMA_IQUANT_RECON_H_*/
diff --git a/common/ihevc_chroma_itrans_recon.c b/common/ihevc_chroma_itrans_recon.c
new file mode 100644
index 0000000..bbbc476
--- /dev/null
+++ b/common/ihevc_chroma_itrans_recon.c
@@ -0,0 +1,205 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_chroma_itrans_recon.c
+ *
+ * @brief
+ *  Contains function definitions for inverse transform  and reconstruction
+ * of chroma interleaved data.
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_chroma_itrans_recon_4x4()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of interleaved data, depending upon the pointer passed to them */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the functions operate on the U component */
+/* If the pointer points to the second byte of the above stream (V), the functions operate on the V component */
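+/* Horizontal neighbours of one component are thus 2 bytes apart (handled by the explicit "* 2" in the indexing below); the row strides are in units of the interleaved buffer */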
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs Inverse transform  and reconstruction for 4x4
+ * input block
+ *
+ * @par Description:
+ *  Performs inverse transform and adds the prediction  data and clips output
+ * to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse transform
+ *  1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src (unused in this variant)
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
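+/* Conceptually, per component: dst = clip8(pred + IT2D(coeff)), where IT2D is the
+   separable 2-D inverse DCT applied along columns in stage 1 and along rows in
+   stage 2, with a rounding right-shift (IT_SHIFT_STAGE_1/2) after each stage */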
+void ihevc_chroma_itrans_recon_4x4(WORD16 *pi2_src,
+                                   WORD16 *pi2_tmp,
+                                   UWORD8 *pu1_pred,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 src_strd,
+                                   WORD32 pred_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 zero_cols,
+                                   WORD32 zero_rows)
+{
+    WORD32 j;
+    WORD32 e[2], o[2];
+    WORD32 add;
+    WORD32 shift;
+    WORD16 *pi2_tmp_orig;
+    WORD32 trans_size;
+    UNUSED(zero_rows);
+    trans_size = TRANS_SIZE_4;
+
+    pi2_tmp_orig = pi2_tmp;
+
+    /* Inverse Transform 1st stage */
+    shift = IT_SHIFT_STAGE_1;
+    add = 1 << (shift - 1);
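+    /* adding 2^(shift - 1) before the arithmetic right shift rounds the result to nearest */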
+
+    for(j = 0; j < trans_size; j++)
+    {
+        /* Checking for Zero Cols */
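+        /* the LSB of zero_cols flags the current column: when set, the whole column
+           of pi2_src is zero and the corresponding 1st-stage output row is cleared */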
+        if((zero_cols & 1) == 1)
+        {
+            memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+        }
+        else
+        {
+
+            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
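+            /* 4-point partial butterfly: the even part e[] uses only basis rows 0 and 2,
+               the odd part o[] only rows 1 and 3; the outputs are formed as e[k] +/- o[k],
+               which halves the multiply count */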
+            o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_src[src_strd]
+                            + g_ai2_ihevc_trans_4[3][0] * pi2_src[3 * src_strd];
+            o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_src[src_strd]
+                            + g_ai2_ihevc_trans_4[3][1] * pi2_src[3 * src_strd];
+            e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_src[0]
+                            + g_ai2_ihevc_trans_4[2][0] * pi2_src[2 * src_strd];
+            e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_src[0]
+                            + g_ai2_ihevc_trans_4[2][1] * pi2_src[2 * src_strd];
+
+            pi2_tmp[0] =
+                            CLIP_S16(((e[0] + o[0] + add) >> shift));
+            pi2_tmp[1] =
+                            CLIP_S16(((e[1] + o[1] + add) >> shift));
+            pi2_tmp[2] =
+                            CLIP_S16(((e[1] - o[1] + add) >> shift));
+            pi2_tmp[3] =
+                            CLIP_S16(((e[0] - o[0] + add) >> shift));
+
+        }
+        pi2_src++;
+        pi2_tmp += trans_size;
+        zero_cols = zero_cols >> 1;
+    }
+
+    pi2_tmp = pi2_tmp_orig;
+
+    /* Inverse Transform 2nd stage */
+    shift = IT_SHIFT_STAGE_2;
+    add = 1 << (shift - 1);
+
+    for(j = 0; j < trans_size; j++)
+    {
+        WORD32 itrans_out;
+        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+        o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_tmp[trans_size]
+                        + g_ai2_ihevc_trans_4[3][0] * pi2_tmp[3 * trans_size];
+        o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_tmp[trans_size]
+                        + g_ai2_ihevc_trans_4[3][1] * pi2_tmp[3 * trans_size];
+        e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_tmp[0]
+                        + g_ai2_ihevc_trans_4[2][0] * pi2_tmp[2 * trans_size];
+        e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_tmp[0]
+                        + g_ai2_ihevc_trans_4[2][1] * pi2_tmp[2 * trans_size];
+
+        itrans_out =
+                        CLIP_S16(((e[0] + o[0] + add) >> shift));
+        pu1_dst[0 * 2] = CLIP_U8((itrans_out + pu1_pred[0 * 2]));
+        itrans_out =
+                        CLIP_S16(((e[1] + o[1] + add) >> shift));
+        pu1_dst[1 * 2] = CLIP_U8((itrans_out + pu1_pred[1 * 2]));
+        itrans_out =
+                        CLIP_S16(((e[1] - o[1] + add) >> shift));
+        pu1_dst[2 * 2] = CLIP_U8((itrans_out + pu1_pred[2 * 2]));
+        itrans_out =
+                        CLIP_S16(((e[0] - o[0] + add) >> shift));
+        pu1_dst[3 * 2] = CLIP_U8((itrans_out + pu1_pred[3 * 2]));
+
+        pi2_tmp++;
+        pu1_pred += pred_strd;
+        pu1_dst += dst_strd;
+
+    }
+}
+
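+/* Usage sketch (illustrative only; buffer names, strides and zero_cols values are
+ * hypothetical, not taken from the decoder):
+ *
+ *   WORD16 ai2_coeff_u[4 * 4], ai2_coeff_v[4 * 4]; // dequantized coefficients per component
+ *   WORD16 ai2_tmp[4 * 4];                         // scratch for the 1st-stage output
+ *   UWORD8 *pu1_pred_uv, *pu1_dst_uv;              // interleaved UV prediction / recon
+ *
+ *   // U component: pointers at the first interleaved byte, coefficient stride = 4
+ *   ihevc_chroma_itrans_recon_4x4(ai2_coeff_u, ai2_tmp, pu1_pred_uv, pu1_dst_uv,
+ *                                 4, pred_strd, dst_strd, 0, 0);
+ *
+ *   // V component: offset the interleaved pointers by one byte
+ *   ihevc_chroma_itrans_recon_4x4(ai2_coeff_v, ai2_tmp, pu1_pred_uv + 1,
+ *                                 pu1_dst_uv + 1, 4, pred_strd, dst_strd, 0, 0);
+ */
+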
diff --git a/common/ihevc_chroma_itrans_recon.h b/common/ihevc_chroma_itrans_recon.h
new file mode 100644
index 0000000..c20cebf
--- /dev/null
+++ b/common/ihevc_chroma_itrans_recon.h
@@ -0,0 +1,109 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_chroma_itrans_recon.h
+*
+* @brief
+*  Function declarations for inverse transform and reconstruction of
+* chroma interleaved data.
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IHEVC_CHROMA_ITRANS_RECON_H_
+#define _IHEVC_CHROMA_ITRANS_RECON_H_
+
+typedef void ihevc_chroma_itrans_recon_4x4_ft(WORD16 *pi2_src,
+                                              WORD16 *pi2_tmp,
+                                              UWORD8 *pu1_pred,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 src_strd,
+                                              WORD32 pred_strd,
+                                              WORD32 dst_strd,
+                                              WORD32 zero_cols,
+                                              WORD32 zero_rows);
+typedef void ihevc_hbd_chroma_itrans_recon_4x4_ft(WORD16 *pi2_src,
+                                                  WORD16 *pi2_tmp,
+                                                  UWORD16 *pu2_pred,
+                                                  UWORD16 *pu2_dst,
+                                                  WORD32 src_strd,
+                                                  WORD32 pred_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 zero_cols,
+                                                  WORD32 zero_rows,
+                                                  UWORD8 bit_depth);
+typedef void ihevc_chroma_itrans_recon_8x8_ft(WORD16 *pi2_src,
+                                              WORD16 *pi2_tmp,
+                                              UWORD8 *pu1_pred,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 src_strd,
+                                              WORD32 pred_strd,
+                                              WORD32 dst_strd,
+                                              WORD32 zero_cols,
+                                              WORD32 zero_rows);
+typedef void ihevc_hbd_chroma_itrans_recon_8x8_ft(WORD16 *pi2_src,
+                                                  WORD16 *pi2_tmp,
+                                                  UWORD16 *pu2_pred,
+                                                  UWORD16 *pu2_dst,
+                                                  WORD32 src_strd,
+                                                  WORD32 pred_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 zero_cols,
+                                                  WORD32 zero_rows,
+                                                  UWORD8 bit_depth);
+typedef void ihevc_chroma_itrans_recon_16x16_ft(WORD16 *pi2_src,
+                                                WORD16 *pi2_tmp,
+                                                UWORD8 *pu1_pred,
+                                                UWORD8 *pu1_dst,
+                                                WORD32 src_strd,
+                                                WORD32 pred_strd,
+                                                WORD32 dst_strd,
+                                                WORD32 zero_cols,
+                                                WORD32 zero_rows);
+typedef void ihevc_hbd_chroma_itrans_recon_16x16_ft(WORD16 *pi2_src,
+                                                    WORD16 *pi2_tmp,
+                                                    UWORD16 *pu2_pred,
+                                                    UWORD16 *pu2_dst,
+                                                    WORD32 src_strd,
+                                                    WORD32 pred_strd,
+                                                    WORD32 dst_strd,
+                                                    WORD32 zero_cols,
+                                                    WORD32 zero_rows,
+                                                    UWORD8 bit_depth);
+
+ihevc_chroma_itrans_recon_4x4_ft ihevc_chroma_itrans_recon_4x4;
+ihevc_hbd_chroma_itrans_recon_4x4_ft ihevc_hbd_chroma_itrans_recon_4x4;
+ihevc_chroma_itrans_recon_8x8_ft ihevc_chroma_itrans_recon_8x8;
+ihevc_hbd_chroma_itrans_recon_8x8_ft ihevc_hbd_chroma_itrans_recon_8x8;
+ihevc_chroma_itrans_recon_16x16_ft ihevc_chroma_itrans_recon_16x16;
+ihevc_hbd_chroma_itrans_recon_16x16_ft ihevc_hbd_chroma_itrans_recon_16x16;
+
+ihevc_hbd_chroma_itrans_recon_4x4_ft ihevc_hbd_chroma_itrans_recon_4x4_sse42;
+ihevc_hbd_chroma_itrans_recon_8x8_ft ihevc_hbd_chroma_itrans_recon_8x8_sse42;
+ihevc_hbd_chroma_itrans_recon_16x16_ft ihevc_hbd_chroma_itrans_recon_16x16_sse42;
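+
+/* Illustrative sketch (not part of this header): the _ft typedefs exist so that
+ * the decoder's function selector (see ihevc_func_selector.h) can bind one
+ * implementation per target at init time; 'have_sse42' below is a hypothetical
+ * CPU-feature flag, not an API of this library:
+ *
+ *   ihevc_hbd_chroma_itrans_recon_4x4_ft *pf_hbd_itrans_recon_4x4 =
+ *       have_sse42 ? ihevc_hbd_chroma_itrans_recon_4x4_sse42
+ *                  : ihevc_hbd_chroma_itrans_recon_4x4;
+ */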
+
+#endif /*_IHEVC_CHROMA_ITRANS_RECON_H_*/
diff --git a/common/ihevc_chroma_itrans_recon_16x16.c b/common/ihevc_chroma_itrans_recon_16x16.c
new file mode 100644
index 0000000..35874fe
--- /dev/null
+++ b/common/ihevc_chroma_itrans_recon_16x16.c
@@ -0,0 +1,895 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_chroma_itrans_recon_16x16.c
+ *
+ * @brief
+ *  Contains function definitions for 16x16 inverse transform  and reconstruction
+ * of chroma interleaved data.
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_chroma_itrans_recon_16x16()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of interleaved data, depending upon the pointer passed to them */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the functions operate on the U component */
+/* If the pointer points to the second byte of the above stream (V), the functions operate on the V component */
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs Inverse transform  and reconstruction for 16x16
+ * input block
+ *
+ * @par Description:
+ *  Performs inverse transform and adds the prediction  data and clips output
+ * to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 16x16 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 16x16 buffer for storing inverse transform
+ *  1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 16x16 block
+ *
+ * @param[out] pu1_dst
+ *  Output 16x16 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_itrans_recon_16x16(WORD16 *pi2_src,
+                                     WORD16 *pi2_tmp,
+                                     UWORD8 *pu1_pred,
+                                     UWORD8 *pu1_dst,
+                                     WORD32 src_strd,
+                                     WORD32 pred_strd,
+                                     WORD32 dst_strd,
+                                     WORD32 zero_cols,
+                                     WORD32 zero_rows)
+{
+    WORD32 j, k;
+    WORD32 e[8], o[8];
+    WORD32 ee[4], eo[4];
+    WORD32 eee[2], eeo[2];
+    WORD32 add;
+    WORD32 shift;
+    WORD16 *pi2_tmp_orig;
+    WORD32 trans_size;
+    WORD32 row_limit_2nd_stage, zero_rows_2nd_stage = zero_cols;
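+    /* stage 1 writes input column j to row j of pi2_tmp, so the input's zero-column
+       map (zero_cols) doubles as the zero-row map of the 2nd-stage input */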
+
+    trans_size = TRANS_SIZE_16;
+    pi2_tmp_orig = pi2_tmp;
+
+    if((zero_cols & 0xFFF0) == 0xFFF0)
+        row_limit_2nd_stage = 4;
+    else if((zero_cols & 0xFF00) == 0xFF00)
+        row_limit_2nd_stage = 8;
+    else
+        row_limit_2nd_stage = TRANS_SIZE_16;
+
+    if((zero_rows & 0xFFF0) == 0xFFF0) /* Only the first 4 rows of the input may be non-zero */
+    {
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+
+        /* Inverse Transform 1st stage */
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_src[3 * src_strd];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
+                }
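+                /* input rows 4 and 12 are zero in this branch, so the eeo terms vanish
+                   and eee reduces to the DC (row 0) contribution */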
+                eeo[0] = 0;
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
+                eeo[1] = 0;
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 8] =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                }
+            }
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* Only the first 4 rows of the 1st-stage output may be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+                }
+                eeo[0] = 0;
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = 0;
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* Only the first 8 rows of the 1st-stage output may be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* Any row of the 1st-stage output may be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size]
+                                    + g_ai2_ihevc_trans_16[9][k]
+                                                    * pi2_tmp[9 * trans_size]
+                                    + g_ai2_ihevc_trans_16[11][k]
+                                                    * pi2_tmp[11 * trans_size]
+                                    + g_ai2_ihevc_trans_16[13][k]
+                                                    * pi2_tmp[13 * trans_size]
+                                    + g_ai2_ihevc_trans_16[15][k]
+                                                    * pi2_tmp[15 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size]
+                                    + g_ai2_ihevc_trans_16[10][k]
+                                                    * pi2_tmp[10 * trans_size]
+                                    + g_ai2_ihevc_trans_16[14][k]
+                                                    * pi2_tmp[14 * trans_size];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                                + g_ai2_ihevc_trans_16[12][0] * pi2_tmp[12 * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                                + g_ai2_ihevc_trans_16[12][1] * pi2_tmp[12 * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+    }
+    else if((zero_rows & 0xFF00) == 0xFF00) /* Only the first 8 rows of the input may be non-zero */
+    {
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+
+        /* Inverse Transform 1st stage */
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_src[3 * src_strd]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_src[5 * src_strd]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_src[7 * src_strd];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_src[6 * src_strd];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 8] =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                }
+            }
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* Only the first 4 rows of the 1st-stage output may be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+                }
+                eeo[0] = 0;
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = 0;
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* Only the first 8 rows of the 1st-stage output may be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* Any row of the 1st-stage output may be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size]
+                                    + g_ai2_ihevc_trans_16[9][k]
+                                                    * pi2_tmp[9 * trans_size]
+                                    + g_ai2_ihevc_trans_16[11][k]
+                                                    * pi2_tmp[11 * trans_size]
+                                    + g_ai2_ihevc_trans_16[13][k]
+                                                    * pi2_tmp[13 * trans_size]
+                                    + g_ai2_ihevc_trans_16[15][k]
+                                                    * pi2_tmp[15 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size]
+                                    + g_ai2_ihevc_trans_16[10][k]
+                                                    * pi2_tmp[10 * trans_size]
+                                    + g_ai2_ihevc_trans_16[14][k]
+                                                    * pi2_tmp[14 * trans_size];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                                + g_ai2_ihevc_trans_16[12][0] * pi2_tmp[12 * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                                + g_ai2_ihevc_trans_16[12][1] * pi2_tmp[12 * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+    }
+    else /* Any row of the input may be non-zero */
+    {
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+
+        /* Inverse Transform 1st stage */
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_src[3 * src_strd]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_src[5 * src_strd]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_src[7 * src_strd]
+                                    + g_ai2_ihevc_trans_16[9][k]
+                                                    * pi2_src[9 * src_strd]
+                                    + g_ai2_ihevc_trans_16[11][k]
+                                                    * pi2_src[11 * src_strd]
+                                    + g_ai2_ihevc_trans_16[13][k]
+                                                    * pi2_src[13 * src_strd]
+                                    + g_ai2_ihevc_trans_16[15][k]
+                                                    * pi2_src[15 * src_strd];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_src[6 * src_strd]
+                                    + g_ai2_ihevc_trans_16[10][k]
+                                                    * pi2_src[10 * src_strd]
+                                    + g_ai2_ihevc_trans_16[14][k]
+                                                    * pi2_src[14 * src_strd];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
+                                + g_ai2_ihevc_trans_16[12][0] * pi2_src[12 * src_strd];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
+                                + g_ai2_ihevc_trans_16[8][0] * pi2_src[8 * src_strd];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
+                                + g_ai2_ihevc_trans_16[12][1] * pi2_src[12 * src_strd];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
+                                + g_ai2_ihevc_trans_16[8][1] * pi2_src[8 * src_strd];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 8] =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                }
+            }
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* Only the first 4 rows of the 1st-stage output may be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+                }
+                eeo[0] = 0;
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = 0;
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* Only the first 8 rows of the 1st-stage output may be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* Any row of the 1st-stage output may be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size]
+                                    + g_ai2_ihevc_trans_16[9][k]
+                                                    * pi2_tmp[9 * trans_size]
+                                    + g_ai2_ihevc_trans_16[11][k]
+                                                    * pi2_tmp[11 * trans_size]
+                                    + g_ai2_ihevc_trans_16[13][k]
+                                                    * pi2_tmp[13 * trans_size]
+                                    + g_ai2_ihevc_trans_16[15][k]
+                                                    * pi2_tmp[15 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size]
+                                    + g_ai2_ihevc_trans_16[10][k]
+                                                    * pi2_tmp[10 * trans_size]
+                                    + g_ai2_ihevc_trans_16[14][k]
+                                                    * pi2_tmp[14 * trans_size];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                                + g_ai2_ihevc_trans_16[12][0] * pi2_tmp[12 * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                                + g_ai2_ihevc_trans_16[12][1] * pi2_tmp[12 * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+    }
+}
+
diff --git a/common/ihevc_chroma_itrans_recon_8x8.c b/common/ihevc_chroma_itrans_recon_8x8.c
new file mode 100644
index 0000000..f086387
--- /dev/null
+++ b/common/ihevc_chroma_itrans_recon_8x8.c
@@ -0,0 +1,285 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_chroma_itrans_recon_8x8.c
+ *
+ * @brief
+ *  Contains function definitions for 8x8 inverse transform  and reconstruction
+ * of chroma interleaved data.
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_chroma_itrans_recon_8x8()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of interleaved data, depending upon the pointer passed to them */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the functions operate on the U component */
+/* If the pointer points to the second byte of the above stream (V), the functions operate on the V component */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs Inverse transform  and reconstruction for 8x8
+ * input block
+ *
+ * @par Description:
+ *  Performs the inverse transform, adds the prediction data and clips the
+ * output to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 8x8 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 8x8 buffer for storing inverse transform
+ *  1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ *  Output 8x8 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src (unused in this function)
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_itrans_recon_8x8(WORD16 *pi2_src,
+                                   WORD16 *pi2_tmp,
+                                   UWORD8 *pu1_pred,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 src_strd,
+                                   WORD32 pred_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 zero_cols,
+                                   WORD32 zero_rows)
+{
+    WORD32 j, k;
+    WORD32 e[4], o[4];
+    WORD32 ee[2], eo[2];
+    WORD32 add;
+    WORD32 shift;
+    WORD16 *pi2_tmp_orig;
+    WORD32 trans_size;
+    WORD32 zero_rows_2nd_stage = zero_cols;
+    WORD32 row_limit_2nd_stage;
+    UNUSED(zero_rows);
+    trans_size = TRANS_SIZE_8;
+
+    pi2_tmp_orig = pi2_tmp;
+
+    if((zero_cols & 0xF0) == 0xF0)
+        row_limit_2nd_stage = 4;
+    else
+        row_limit_2nd_stage = TRANS_SIZE_8;
+
+    /* Inverse Transform 1st stage */
+    shift = IT_SHIFT_STAGE_1;
+    add = 1 << (shift - 1);
+    {
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_8x8******************************************/
+        /************************************************************************************************/
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 4; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_8[3][k]
+                                                    * pi2_src[3 * src_strd]
+                                    + g_ai2_ihevc_trans_8[5][k]
+                                                    * pi2_src[5 * src_strd]
+                                    + g_ai2_ihevc_trans_8[7][k]
+                                                    * pi2_src[7 * src_strd];
+                }
+
+                eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd]
+                                + g_ai2_ihevc_trans_8[6][0] * pi2_src[6 * src_strd];
+                eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd]
+                                + g_ai2_ihevc_trans_8[6][1] * pi2_src[6 * src_strd];
+                ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_src[0]
+                                + g_ai2_ihevc_trans_8[4][0] * pi2_src[4 * src_strd];
+                ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_src[0]
+                                + g_ai2_ihevc_trans_8[4][1] * pi2_src[4 * src_strd];
+
+                /* Combining the e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                e[0] = ee[0] + eo[0];
+                e[3] = ee[0] - eo[0];
+                e[1] = ee[1] + eo[1];
+                e[2] = ee[1] - eo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 4] =
+                                    CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+                }
+            }
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+
+        if((zero_rows_2nd_stage & 0xF0) == 0xF0) /* Only the first 4 rows of the 1st stage output are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 4; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_8[3][k]
+                                                    * pi2_tmp[3 * trans_size];
+                }
+                eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size];
+                eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size];
+                ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0];
+                ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0];
+
+                /* Combining the e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                e[0] = ee[0] + eo[0];
+                e[3] = ee[0] - eo[0];
+                e[1] = ee[1] + eo[1];
+                e[2] = ee[1] - eo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+                    itrans_out =
+                                    CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+                    pu1_dst[(k + 4) * 2] =
+                                    CLIP_U8((itrans_out + pu1_pred[(k + 4) * 2]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* All 8 rows of the 1st stage output may be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 4; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_8[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_8[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_8[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+
+                eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size]
+                                + g_ai2_ihevc_trans_8[6][0] * pi2_tmp[6 * trans_size];
+                eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size]
+                                + g_ai2_ihevc_trans_8[6][1] * pi2_tmp[6 * trans_size];
+                ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_8[4][0] * pi2_tmp[4 * trans_size];
+                ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_8[4][1] * pi2_tmp[4 * trans_size];
+
+                /* Combining the e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                e[0] = ee[0] + eo[0];
+                e[3] = ee[0] - eo[0];
+                e[1] = ee[1] + eo[1];
+                e[2] = ee[1] - eo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+                    itrans_out =
+                                    CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+                    pu1_dst[(k + 4) * 2] =
+                                    CLIP_U8((itrans_out + pu1_pred[(k + 4) * 2]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_8x8******************************************/
+        /************************************************************************************************/
+    }
+}
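+
+/* Illustrative sketch (hypothetical helper, not part of the library): bit j
+* of zero_cols is set when column j of pi2_src is entirely zero, which is
+* why (zero_cols & 0xF0) == 0xF0 above means only the first four columns
+* need the 1st-stage transform. A caller could build the mask like this: */
+#if 0
+static WORD32 build_zero_cols_demo(const WORD16 *pi2_src, WORD32 src_strd)
+{
+    WORD32 j, k, mask = 0;
+    for(j = 0; j < TRANS_SIZE_8; j++)
+    {
+        WORD32 all_zero = 1;
+        for(k = 0; k < TRANS_SIZE_8; k++)
+            if(pi2_src[k * src_strd + j] != 0)
+                all_zero = 0;
+        mask |= all_zero << j;
+    }
+    return mask;
+}
+#endif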
diff --git a/common/ihevc_chroma_recon.c b/common/ihevc_chroma_recon.c
new file mode 100644
index 0000000..4a1e9ee
--- /dev/null
+++ b/common/ihevc_chroma_recon.c
@@ -0,0 +1,308 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_chroma_recon.c
+ *
+ * @brief
+ *  Function definitions for reconstruction of chroma interleaved data.
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_chroma_recon_4x4()
+ *  - ihevc_chroma_recon_8x8()
+ *  - ihevc_chroma_recon_16x16()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of interleaved data, depending on the pointer passed to them */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the functions operate on the U component */
+/* If the pointer points to the second byte of the above stream (V), the functions operate on the V component */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs reconstruction for  4x4 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of the 4x4 input block by adding prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_recon_4x4(WORD16 *pi2_src,
+                            UWORD8 *pu1_pred,
+                            UWORD8 *pu1_dst,
+                            WORD32 src_strd,
+                            WORD32 pred_strd,
+                            WORD32 dst_strd,
+                            WORD32 zero_cols)
+{
+    WORD32 i, j;
+    WORD32 trans_size;
+
+    trans_size = TRANS_SIZE_4;
+
+    /* Reconstruction */
+
+    for(i = 0; i < trans_size; i++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+            }
+        }
+        else
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] =
+                                CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+            }
+        }
+        pi2_src++;
+        pu1_dst += 2;
+        pu1_pred += 2;
+        zero_cols = zero_cols >> 1;
+    }
+}
+
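+/* Usage sketch (hypothetical caller, not part of the library; assumes a
+* coefficient stride of 4 for a 4x4 block): since the function steps across
+* the interleaved stream in pairs (pu1_dst += 2 and pu1_pred += 2 above),
+* the same routine serves both components; only the base pointers differ by
+* one byte: */
+#if 0
+static void recon_uv_demo(WORD16 *pi2_src_u, WORD16 *pi2_src_v,
+                          UWORD8 *pu1_pred, UWORD8 *pu1_dst,
+                          WORD32 strd, WORD32 zero_cols_u, WORD32 zero_cols_v)
+{
+    /* U component: pointer at the first byte of the UVUV... stream */
+    ihevc_chroma_recon_4x4(pi2_src_u, pu1_pred, pu1_dst,
+                           4, strd, strd, zero_cols_u);
+    /* V component: same stream, base pointers offset by one byte */
+    ihevc_chroma_recon_4x4(pi2_src_v, pu1_pred + 1, pu1_dst + 1,
+                           4, strd, strd, zero_cols_v);
+}
+#endif
+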
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs reconstruction for 8x8 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of the 8x8 input block by adding prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 8x8 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ *  Output 8x8 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_recon_8x8(WORD16 *pi2_src,
+                            UWORD8 *pu1_pred,
+                            UWORD8 *pu1_dst,
+                            WORD32 src_strd,
+                            WORD32 pred_strd,
+                            WORD32 dst_strd,
+                            WORD32 zero_cols)
+{
+    WORD32 i, j;
+    WORD32 trans_size;
+
+    trans_size = TRANS_SIZE_8;
+
+    /* Reconstruction */
+
+    for(i = 0; i < trans_size; i++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+            }
+        }
+        else
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] =
+                                CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+            }
+        }
+        pi2_src++;
+        pu1_dst += 2;
+        pu1_pred += 2;
+        zero_cols = zero_cols >> 1;
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs reconstruction for  16x16 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of the 16x16 input block by adding prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 16x16 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 16x16 block
+ *
+ * @param[out] pu1_dst
+ *  Output 16x16 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_recon_16x16(WORD16 *pi2_src,
+                              UWORD8 *pu1_pred,
+                              UWORD8 *pu1_dst,
+                              WORD32 src_strd,
+                              WORD32 pred_strd,
+                              WORD32 dst_strd,
+                              WORD32 zero_cols)
+{
+    WORD32 i, j;
+    WORD32 trans_size;
+
+    trans_size = TRANS_SIZE_16;
+
+    /* Reconstruction */
+
+    for(i = 0; i < trans_size; i++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+            }
+        }
+        else
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] =
+                                CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+            }
+        }
+        pi2_src++;
+        pu1_dst += 2;
+        pu1_pred += 2;
+        zero_cols = zero_cols >> 1;
+    }
+}
+
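+/* Worked example of the reconstruction above: with a prediction sample of
+* 250 and a residue of +20, CLIP_U8(250 + 20) saturates to 255; with a
+* residue of -20 the output is simply 230. */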
diff --git a/common/ihevc_chroma_recon.h b/common/ihevc_chroma_recon.h
new file mode 100644
index 0000000..b4ece06
--- /dev/null
+++ b/common/ihevc_chroma_recon.h
@@ -0,0 +1,99 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_chroma_recon.h
+*
+* @brief
+*  Function declarations for reconstruction of chroma interleaved data.
+*
+* @author
+*  Ittiam
+*
+* @par List of Functions:
+*  - ihevc_chroma_recon_4x4()
+*  - ihevc_chroma_recon_8x8()
+*  - ihevc_chroma_recon_16x16()
+*  - ihevc_hbd_chroma_recon_4x4()
+*  - ihevc_hbd_chroma_recon_8x8()
+*  - ihevc_hbd_chroma_recon_16x16()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IHEVC_CHROMA_RECON_H_
+#define _IHEVC_CHROMA_RECON_H_
+
+typedef void ihevc_chroma_recon_4x4_ft(WORD16 *pi2_src,
+                                       UWORD8 *pu1_pred,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 src_strd,
+                                       WORD32 pred_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_recon_4x4_ft(WORD16 *pi2_src,
+                                           UWORD16 *pu2_pred,
+                                           UWORD16 *pu2_dst,
+                                           WORD32 src_strd,
+                                           WORD32 pred_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 zero_cols,
+                                           UWORD8 bit_depth);
+typedef void ihevc_chroma_recon_8x8_ft(WORD16 *pi2_src,
+                                       UWORD8 *pu1_pred,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 src_strd,
+                                       WORD32 pred_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_recon_8x8_ft(WORD16 *pi2_src,
+                                           UWORD16 *pu2_pred,
+                                           UWORD16 *pu2_dst,
+                                           WORD32 src_strd,
+                                           WORD32 pred_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 zero_cols,
+                                           UWORD8 bit_depth);
+typedef void ihevc_chroma_recon_16x16_ft(WORD16 *pi2_src,
+                                         UWORD8 *pu1_pred,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 pred_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_recon_16x16_ft(WORD16 *pi2_src,
+                                             UWORD16 *pu2_pred,
+                                             UWORD16 *pu2_dst,
+                                             WORD32 src_strd,
+                                             WORD32 pred_strd,
+                                             WORD32 dst_strd,
+                                             WORD32 zero_cols,
+                                             UWORD8 bit_depth);
+
+ihevc_chroma_recon_4x4_ft ihevc_chroma_recon_4x4;
+ihevc_hbd_chroma_recon_4x4_ft ihevc_hbd_chroma_recon_4x4;
+ihevc_chroma_recon_8x8_ft ihevc_chroma_recon_8x8;
+ihevc_hbd_chroma_recon_8x8_ft ihevc_hbd_chroma_recon_8x8;
+ihevc_chroma_recon_16x16_ft ihevc_chroma_recon_16x16;
+ihevc_hbd_chroma_recon_16x16_ft ihevc_hbd_chroma_recon_16x16;
+
+#endif /*_IHEVC_CHROMA_RECON_H_*/
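+
+/* Usage sketch (hypothetical, not part of this header): the *_ft typedefs
+* let a decoder context hold one function pointer per kernel and bind it to
+* the plain C or to a SIMD variant during initialisation: */
+#if 0
+typedef struct
+{
+    ihevc_chroma_recon_4x4_ft *pf_chroma_recon_4x4;
+} recon_ftable_demo_t;
+
+static void init_recon_ftable_demo(recon_ftable_demo_t *ps_tbl)
+{
+    /* default to the portable C kernel; arch-specific init may overwrite */
+    ps_tbl->pf_chroma_recon_4x4 = &ihevc_chroma_recon_4x4;
+}
+#endif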
diff --git a/common/ihevc_common_tables.c b/common/ihevc_common_tables.c
new file mode 100644
index 0000000..7927497
--- /dev/null
+++ b/common/ihevc_common_tables.c
@@ -0,0 +1,549 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_common_tables.c
+*
+* @brief
+*  Contains common global tables
+*
+* @author
+*  Harish M
+*
+* @par List of Tables:
+* gai4_ihevc_max_luma_pic_size
+* gai4_ihevc_max_wd_ht
+* gai4_ihevc_min_wd_ht
+* gai4_ihevc_ang_table
+* col_for_intra_luma
+* col_for_intra_chroma
+* idx_neg_vals_3_9
+* idx_neg_idx_3_9
+* idx_neg_idx_chroma_3_9
+* idx_neg_idx_11_17
+* idx_neg_idx_chroma_11_17
+* gai4_ihevc_inv_ang_table
+* gau1_ihevc_invscan8x8
+* gau1_ihevc_invscan4x4
+* gau1_ihevc_invscan2x2
+* gau1_ihevc_scan8x8
+* gau1_ihevc_scan4x4
+* gau1_ihevc_scan2x2
+* *gapv_ihevc_scan
+* *gapv_ihevc_invscan
+* gau1_ihevc_chroma_qp_scale
+* gai1_ihevc_chroma_qp_scale
+* gau1_ihevc_planar_factor
+* gau1_ihevc_planar_factor_1
+* gai4_ihevc_ang_table_chroma
+* gai4_ihevc_inv_ang_table_chroma
+* gau1_ihevc_planar_factor_chroma
+* gau1_intra_pred_ref_filter
+* gi1_table_edge_idx
+* gu1_table_band_idx
+* gu2_table_band_idx
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_common_tables.h"
+
+/*****************************************************************************/
+/* Level specific tables                                                     */
+/*****************************************************************************/
+
+/**
+ * Array giving the maximum number of luma samples in a picture for a given level
+ */
+const WORD32 gai4_ihevc_max_luma_pic_size[] =
+{
+    /* Level 1 */
+    36864,
+    /* Level 2 */
+    122880,
+    /* Level 2.1 */
+    245760,
+    /* Level 3 */
+    552960,
+    /* Level 3.1 */
+    983040,
+    /* Level 4 */
+    2228224,
+    /* Level 4.1 */
+    2228224,
+    /* Level 5 */
+    8912896,
+    /* Level 5.1 */
+    8912896,
+    /* Level 5.2 */
+    8912896,
+    /* Level 6 */
+    33423360,
+    /* Level 6.1 */
+    33423360,
+    /* Level 6.2 */
+    33423360
+};
+/** Max width and height allowed for a given level */
+/** This is derived as SQRT(8 * gai4_ihevc_max_luma_pic_size[]) */
+const WORD32 gai4_ihevc_max_wd_ht[] =
+{
+    /* Level 1 */
+    543,
+    /* Level 2 */
+    991,
+    /* Level 2.1 */
+    1402,
+    /* Level 3 */
+    2103,
+    /* Level 3.1 */
+    2804,
+    /* Level 4 */
+    4222,
+    /* Level 4.1 */
+    4222,
+    /* Level 5 */
+    8444,
+    /* Level 5.1 */
+    8444,
+    /* Level 5.2 */
+    8444,
+    /* Level 6 */
+    16888,
+    /* Level 6.1 */
+    16888,
+    /* Level 6.2 */
+    16888
+};
+
+/** Min width and height allowed for a given level */
+/** This is derived as gai4_ihevc_max_luma_pic_size[]/gai4_ihevc_max_wd_ht[] */
+const WORD32 gai4_ihevc_min_wd_ht[] =
+{
+    /* Level 1 */
+    67,
+    /* Level 2 */
+    123,
+    /* Level 2.1 */
+    175,
+    /* Level 3 */
+    262,
+    /* Level 3.1 */
+    350,
+    /* Level 4 */
+    527,
+    /* Level 4.1 */
+    527,
+    /* Level 5 */
+    1055,
+    /* Level 5.1 */
+    1055,
+    /* Level 5.2 */
+    1055,
+    /* Level 6 */
+    2111,
+    /* Level 6.1 */
+    2111,
+    /* Level 6.2 */
+    2111
+};
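+/* Worked example of the two derivations above: for Level 3.1 the max luma
+* picture size is 983040 samples, so the max dimension is
+* SQRT(8 * 983040) = SQRT(7864320) ~= 2804 and the min dimension is
+* 983040 / 2804 ~= 350, matching the table entries. */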
+/*****************************************************************************/
+/* Intra prediction tables                                                   */
+/*****************************************************************************/
+/**
+ * Intra pred angles
+ */
+/* g_ang_table = tan(actual angle) in Q5 format for all 33 modes */
+const WORD32 gai4_ihevc_ang_table[35] =
+    { 0, 0, 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
+                    -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+
+const WORD8 col_for_intra_luma[32] =
+    { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+                    29, 30, 31, 32 };
+
+const WORD8 col_for_intra_chroma[32] =
+    { 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16 };
+
+const WORD8 idx_neg_vals_3_9[7] =
+    { 26, 21, 17, 13, 9, 5, 2 };
+
+const WORD32 idx_neg_idx_3_9[28] =
+    { 6, 13, 19, 26, 5, 10, 15, 21, 4, 8, 12, 17, 3, 6, 9, 13, 2, 4, 6, 9,
+                    1, 2, 3, 5, 0, 0, 1, 2 };
+
+
+const WORD32 idx_neg_idx_chroma_3_9[28] =
+  { 3, 6, 9, 13,
+    2, 5, 7, 10,
+    2, 4, 6, 8,
+    1, 3, 4, 6,
+    1, 2, 3, 4,
+    0, 1, 1, 2,
+    0, 0, 0, 1 };
+const WORD32 idx_neg_idx_11_17[28] =
+    { -1, -1, -2, -2, -2, -3, -4, -5, -3, -5, -7, -9, -4, -7, -10, -13, -5, -9, -13, -17, -6, -11,
+                    -16, -21, -7, -13, -20, -26 };
+
+const WORD32 idx_neg_idx_chroma_11_17[28] =
+  { -1, -1, -1, -1,
+    -1, -2, -2, -3,
+    -2, -3, -4, -5,
+    -2, -4, -5, -7,
+    -3, -5, -7, -9,
+    -3, -6, -8, -11,
+    -4, -7, -10, -13 };
+
+/**
+ * Intra pred inverse angles
+ */
+/* g_invAngTable = Inverse angle in Q5 format, required for negative angles */
+const WORD32 gai4_ihevc_inv_ang_table[14] =
+    { 4096, 1638, 910, 630, 482, 390, 315, 315, 390, 482, 630, 910, 1638, 4096 };
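+/* Worked example: each entry above is approximately 8192 / |angle| for the
+* corresponding negative-angle mode, e.g. 8192 / 2 = 4096 and
+* 8192 / 26 ~= 315, which is why the table is symmetric about its centre. */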
+
+/*****************************************************************************/
+/* Scan matrices                                                             */
+/*****************************************************************************/
+/**
+ * Inverse Scan matrix for 8x8 Section 6.5.3
+ */
+const UWORD8  gau1_ihevc_invscan8x8[][64] =
+{
+    /* Upright diagonal */
+    {
+        0,  8,  1,  16, 9,  2,  24, 17,
+        10, 3,  32, 25, 18, 11, 4,  40,
+        33, 26, 19, 12, 5,  48, 41, 34,
+        27, 20, 13, 6,  56, 49, 42, 35,
+        28, 21, 14, 7,  57, 50, 43, 36,
+        29, 22, 15, 58, 51, 44, 37, 30,
+        23, 59, 52, 45, 38, 31, 60, 53,
+        46, 39, 61, 54, 47, 62, 55, 63
+    },
+    /* Horizontal */
+    {
+        0,  1,  2,  3,  4,  5,  6,  7,
+        8,  9,  10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47,
+        48, 49, 50, 51, 52, 53, 54, 55,
+        56, 57, 58, 59, 60, 61, 62, 63
+    },
+    /* Vertical */
+    {
+        0,  8,  16, 24, 32, 40, 48, 56,
+        1,  9,  17, 25, 33, 41, 49, 57,
+        2,  10, 18, 26, 34, 42, 50, 58,
+        3,  11, 19, 27, 35, 43, 51, 59,
+        4,  12, 20, 28, 36, 44, 52, 60,
+        5,  13, 21, 29, 37, 45, 53, 61,
+        6,  14, 22, 30, 38, 46, 54, 62,
+        7,  15, 23, 31, 39, 47, 55, 63
+    }
+};
+
+/**
+ * Inverse Scan matrix for 4x4 Section 6.5.3
+ */
+const UWORD8  gau1_ihevc_invscan4x4[][16] =
+{
+    /* Upright diagonal */
+    {
+        0, 4,  1,  8,
+        5, 2,  12, 9,
+        6, 3,  13, 10,
+        7, 14, 11, 15
+    },
+    /* Horizontal */
+    {
+        0,  1,  2,  3,
+        4,  5,  6,  7,
+        8,  9,  10, 11,
+        12, 13, 14, 15
+    },
+    /* Vertical */
+    {
+        0,  4,  8,  12,
+        1,  5,  9,  13,
+        2,  6,  10, 14,
+        3,  7,  11, 15
+    }
+};
+
+/**
+ * Inverse Scan matrix for 2x2 Section 6.5.3
+ */
+const UWORD8  gau1_ihevc_invscan2x2[][4] =
+{
+    /* Upright diagonal */
+    {
+        0,  2,
+        1,  3
+    },
+    /* Horizontal */
+    {
+        0,  1,
+        2,  3
+    },
+    /* Vertical */
+    {
+        0,  2,
+        1,  3,
+    }
+};
+
+/**
+ * Scan matrix for 8x8 Section 6.5.3
+ */
+
+const UWORD8  gau1_ihevc_scan8x8[][64] =
+{
+    /* Upright diagonal */
+    {
+        0,  2,  5,  9,  14, 20, 27, 35,
+        1,  4,  8,  13, 19, 26, 34, 42,
+        3,  7,  12, 18, 25, 33, 41, 48,
+        6,  11, 17, 24, 32, 40, 47, 53,
+        10, 16, 23, 31, 39, 46, 52, 57,
+        15, 22, 30, 38, 45, 51, 56, 60,
+        21, 29, 37, 44, 50, 55, 59, 62,
+        28, 36, 43, 49, 54, 58, 61, 63
+    },
+    /* Horizontal */
+    {
+        0,  1,  2,  3,  4,  5,  6,  7,
+        8,  9,  10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
+        32, 33, 34, 35, 36, 37, 38, 39,
+        40, 41, 42, 43, 44, 45, 46, 47,
+        48, 49, 50, 51, 52, 53, 54, 55,
+        56, 57, 58, 59, 60, 61, 62, 63
+    },
+    /* Vertical */
+    {
+        0,  8,  16, 24, 32, 40, 48, 56,
+        1,  9,  17, 25, 33, 41, 49, 57,
+        2,  10, 18, 26, 34, 42, 50, 58,
+        3,  11, 19, 27, 35, 43, 51, 59,
+        4,  12, 20, 28, 36, 44, 52, 60,
+        5,  13, 21, 29, 37, 45, 53, 61,
+        6,  14, 22, 30, 38, 46, 54, 62,
+        7,  15, 23, 31, 39, 47, 55, 63
+    }
+};
+
+/**
+ * Scan matrix for 4x4 Section 6.5.3
+ */
+const UWORD8  gau1_ihevc_scan4x4[][16] =
+{
+    /* Upright diagonal */
+    {
+        0,  2,  5,  9,
+        1,  4,  8,  12,
+        3,  7,  11, 14,
+        6,  10, 13, 15
+    },
+    /* Horizontal */
+    {
+        0,  1,  2,  3,
+        4,  5,  6,  7,
+        8,  9,  10, 11,
+        12, 13, 14, 15
+    },
+    /* Vertical */
+    {
+        0,  4,  8,  12,
+        1,  5,  9,  13,
+        2,  6,  10, 14,
+        3,  7,  11, 15
+    }
+};
+
+/**
+ * Scan matrix for 2x2 Section 6.5.3
+ */
+const UWORD8  gau1_ihevc_scan2x2[][4] =
+{
+    /* Upright diagonal */
+    {
+        0,  2,
+        1,  3
+    },
+    /* Horizontal */
+    {
+        0,  1,
+        2,  3
+    },
+    /* Vertical */
+    {
+        0,  2,
+        1,  3,
+    }
+};
+
+/**
+ * Table containing all the scan matrices
+ */
+const void *gapv_ihevc_scan[] =
+{
+    gau1_ihevc_scan2x2[0],
+    gau1_ihevc_scan4x4[0],
+    gau1_ihevc_scan8x8[0],
+
+    gau1_ihevc_scan2x2[1],
+    gau1_ihevc_scan4x4[1],
+    gau1_ihevc_scan8x8[1],
+
+    gau1_ihevc_scan2x2[2],
+    gau1_ihevc_scan4x4[2],
+    gau1_ihevc_scan8x8[2],
+
+};
+
+const void *gapv_ihevc_invscan[] =
+{
+    gau1_ihevc_invscan2x2[0],
+    gau1_ihevc_invscan4x4[0],
+    gau1_ihevc_invscan8x8[0],
+
+    gau1_ihevc_invscan2x2[1],
+    gau1_ihevc_invscan4x4[1],
+    gau1_ihevc_invscan8x8[1],
+
+    gau1_ihevc_invscan2x2[2],
+    gau1_ihevc_invscan4x4[2],
+    gau1_ihevc_invscan8x8[2],
+};
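+
+/* Usage sketch (hypothetical indexing, inferred from the grouping above:
+* three sizes per scan type, ordered 2x2, 4x4, 8x8): */
+#if 0
+static const UWORD8 *get_scan_demo(WORD32 scan_type, WORD32 log2_trans_size)
+{
+    /* scan_type: 0 = upright diagonal, 1 = horizontal, 2 = vertical */
+    return (const UWORD8 *)gapv_ihevc_scan[scan_type * 3 + log2_trans_size - 1];
+}
+#endif
+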
+/**
+ * Table for luma to chroma qp conversion
+ */
+
+// FOR MAIN branch encoder (8 bit)
+const UWORD8 gau1_ihevc_chroma_qp_scale[58] =
+{
+
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
+    17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32,
+    33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44,
+    45, 46, 47, 48, 49, 50, 51
+};
+
+// FOR HBD branch encoder (8 and 10 bit)
+const WORD8 gai1_ihevc_chroma_qp_scale[70] =  //EXTENDED for 10 bit
+{
+
+    -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1,
+    0,   1,   2,   3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+    17,  18,  19,  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32,
+    33,  33,  34,  34, 35, 35, 36, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44,
+    45,  46,  47,  48, 49, 50, 51
+};
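+
+/* Worked example: the 8 bit mapping is identity up to a luma QP of 29 and
+* then compresses, e.g. gau1_ihevc_chroma_qp_scale[40] gives a chroma QP of
+* 36. */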
+
+
+/** constant planar factor values table */
+const UWORD8 gau1_ihevc_planar_factor[65] = {    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+    11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+    21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+    31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+    41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+    51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+    61, 62, 63, 64 };
+const UWORD8 gau1_ihevc_planar_factor_1[32] = {    1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9,
+    17, 17, 17, 17, 17, 17, 17, 17, 25, 25, 25, 25, 25, 25,
+    25, 25 };
+
+/** g_ang_table = tan(actual angle) in Q5 format for all 33 modes */
+const WORD32 gai4_ihevc_ang_table_chroma[35] = { 0, 0, 32, 26, 21, 17, 13, 9, 5, 2, 0, -2,  -5, -9,
+    -13, -17, -21, -26, -32, -26, -21, -17, -13, -9, -5,
+    -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+/** g_invAngTable = Inverse angle in Q5 format, required for negative angles */
+const WORD32 gai4_ihevc_inv_ang_table_chroma[14] = { 4096, 1638, 910, 630, 482, 390, 315,
+    315, 390, 482, 630, 910, 1638, 4096 };
+
+
+/** constant planar factor values table */
+const UWORD8 gau1_ihevc_planar_factor_chroma[33] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+    11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+    21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+    31, 32  };
+
+
+
+/** Filter flag values for intra pred reference filtering - the intra pred mode is the index.
+*   The flag for nt = 4 is Bit 0, nt = 8 is Bit 1, nt = 16 is Bit 2 and nt = 32 is Bit 3
+*/
+const UWORD8 gau1_intra_pred_ref_filter[] =
+{
+    14,  0, 14, 12, 12, 12, 12,
+    12, 12,  8,  0,  8, 12, 12,
+    12, 12, 12, 12, 14, 12, 12,
+    12, 12, 12, 12,  8,  0,  8,
+    12, 12, 12, 12, 12, 12, 14
+};
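+
+/* Worked example of the bit layout above: the entry for mode 0 (planar) is
+* 14 = binary 1110, i.e. the reference is filtered for nt = 8, 16 and 32 but
+* not for nt = 4, while an entry of 0 (e.g. mode 1, DC) disables filtering
+* for all sizes. */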
+
+
+const WORD8 gi1_table_edge_idx[8] = { 1, 2, 0, 3, 4, 0, 0, 0 }; /* First 5 values are valid. The last 3 dummy values are added to help SIMD loads */
+
+const UWORD8 gu1_table_band_idx[32] = {  0,  1,  2,  3,  4,  5,  6,  7,
+                                         8,  9, 10, 11, 12, 13, 14, 15,
+                                        16, 17, 18, 19, 20, 21, 22, 23,
+                                        24, 25, 26, 27, 28, 29, 30, 31
+};
+
+const UWORD16 gu2_table_band_idx[32] = {  0,  1,  2,  3,  4,  5,  6,  7,
+                                         8,  9, 10, 11, 12, 13, 14, 15,
+                                        16, 17, 18, 19, 20, 21, 22, 23,
+                                        24, 25, 26, 27, 28, 29, 30, 31
+};
+
+#ifdef ENABLE_SSE4_1_INTR
+/* Used as a lookup table to emulate the popcnt instruction on SSE4.1 platforms.
+Each entry gives the number of 1 bits in the binary representation of its index.
+*/
+const WORD8   gi1_popcnt_byte_table[] =
+{
+    0,  1,  1,  2,  1,  2,  2,  3,  1,  2,  2,  3,  2,  3,  3,  4,
+    1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
+    1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
+    2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
+    1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
+    2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
+    2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
+    3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
+    1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
+    2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
+    2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
+    3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
+    2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
+    3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
+    3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
+    4,  5,  5,  6,  5,  6,  6,  7,  5,  6,  6,  7,  6,  7,  7,  8
+};
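+/* Worked example: gi1_popcnt_byte_table[0xB3] is 5, since 0xB3 = 10110011b
+* contains five set bits. */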
+#endif
diff --git a/common/ihevc_common_tables.h b/common/ihevc_common_tables.h
new file mode 100644
index 0000000..ff7e438
--- /dev/null
+++ b/common/ihevc_common_tables.h
@@ -0,0 +1,75 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_common_tables.h
+*
+* @brief
+*  Common tables
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVC_COMMON_TABLES_H_
+#define _IHEVC_COMMON_TABLES_H_
+
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_max_luma_pic_size[];
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_max_wd_ht[];
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_min_wd_ht[];
+
+
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_ang_table[35];
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_inv_ang_table[14];
+
+extern MEM_ALIGN16 const UWORD8  gau1_ihevc_scan8x8[][64];
+extern MEM_ALIGN16 const UWORD8  gau1_ihevc_scan4x4[][16];
+extern MEM_ALIGN16 const UWORD8  gau1_ihevc_scan2x2[][4];
+
+extern MEM_ALIGN16 const UWORD8  gau1_ihevc_invscan8x8[][64];
+extern MEM_ALIGN16 const UWORD8  gau1_ihevc_invscan4x4[][16];
+extern MEM_ALIGN16 const UWORD8  gau1_ihevc_invscan2x2[][4];
+
+extern MEM_ALIGN16 const void   *gapv_ihevc_scan[];
+extern MEM_ALIGN16 const void   *gapv_ihevc_invscan[];
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_chroma_qp_scale[];
+extern MEM_ALIGN16 const WORD8 gai1_ihevc_chroma_qp_scale[];
+
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_ang_table_chroma[35];
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_inv_ang_table_chroma[14];
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_planar_factor_chroma[33];
+
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_planar_factor[65];
+
+extern MEM_ALIGN16 const UWORD8 gau1_intra_pred_ref_filter[];
+
+extern MEM_ALIGN16 const WORD8 gi1_table_edge_idx[8];
+
+extern MEM_ALIGN16 const UWORD8 gu1_table_band_idx[32];
+
+extern MEM_ALIGN16 const UWORD16 gu2_table_band_idx[32];
+
+#endif /*_IHEVC_COMMON_TABLES_H_*/
diff --git a/common/ihevc_deblk.h b/common/ihevc_deblk.h
new file mode 100644
index 0000000..cd4c8c8
--- /dev/null
+++ b/common/ihevc_deblk.h
@@ -0,0 +1,173 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_deblk.h
+*
+* @brief
+*  Declarations for the functions defined in ihevc_deblk.c
+*
+* @author
+*  Srinivas T
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_DEBLK_H_
+#define _IHEVC_DEBLK_H_
+
+/*****************************************************************************/
+/* Function Declarations                                                     */
+/*****************************************************************************/
+
+typedef void ihevc_deblk_luma_vert_ft(
+                UWORD8 *pu1_src,
+                WORD32 src_strd,
+                WORD32 bs,
+                WORD32 quant_param_p,
+                WORD32 quant_param_q,
+                WORD32 beta_offset_div2,
+                WORD32 tc_offset_div2,
+                WORD32 filter_flag_p,
+                WORD32 filter_flag_q);
+
+typedef void ihevc_deblk_luma_horz_ft(
+                UWORD8 *pu1_src,
+                WORD32 src_strd,
+                WORD32 bs,
+                WORD32 quant_param_p,
+                WORD32 quant_param_q,
+                WORD32 beta_offset_div2,
+                WORD32 tc_offset_div2,
+                WORD32 filter_flag_p,
+                WORD32 filter_flag_q);
+
+typedef void ihevc_deblk_chroma_vert_ft(
+                UWORD8 *pu1_src,
+                WORD32 src_strd,
+                WORD32 quant_param_p,
+                WORD32 quant_param_q,
+                WORD32 qp_offset_u,
+                WORD32 qp_offset_v,
+                WORD32 tc_offset_div2,
+                WORD32 filter_flag_p,
+                WORD32 filter_flag_q);
+
+typedef void ihevc_deblk_chroma_horz_ft(
+                UWORD8 *pu1_src,
+                WORD32 src_strd,
+                WORD32 quant_param_p,
+                WORD32 quant_param_q,
+                WORD32 qp_offset_u,
+                WORD32 qp_offset_v,
+                WORD32 tc_offset_div2,
+                WORD32 filter_flag_p,
+                WORD32 filter_flag_q);
+
+typedef void ihevc_hbd_deblk_luma_vert_ft(
+                UWORD16 *pu2_src,
+                WORD32 src_strd,
+                WORD32 bs,
+                WORD32 quant_param_p,
+                WORD32 quant_param_q,
+                WORD32 beta_offset_div2,
+                WORD32 tc_offset_div2,
+                WORD32 filter_flag_p,
+                WORD32 filter_flag_q,
+                UWORD8 bit_depth);
+
+typedef void ihevc_hbd_deblk_luma_horz_ft(
+                UWORD16 *pu2_src,
+                WORD32 src_strd,
+                WORD32 bs,
+                WORD32 quant_param_p,
+                WORD32 quant_param_q,
+                WORD32 beta_offset_div2,
+                WORD32 tc_offset_div2,
+                WORD32 filter_flag_p,
+                WORD32 filter_flag_q,
+                UWORD8 bit_depth);
+
+typedef void ihevc_hbd_deblk_chroma_vert_ft(
+                UWORD16 *pu2_src,
+                WORD32 src_strd,
+                WORD32 quant_param_p,
+                WORD32 quant_param_q,
+                WORD32 qp_offset_u,
+                WORD32 qp_offset_v,
+                WORD32 tc_offset_div2,
+                WORD32 filter_flag_p,
+                WORD32 filter_flag_q,
+                UWORD8 bit_depth);
+
+typedef void ihevc_hbd_deblk_chroma_horz_ft(
+                UWORD16 *pu2_src,
+                WORD32 src_strd,
+                WORD32 quant_param_p,
+                WORD32 quant_param_q,
+                WORD32 qp_offset_u,
+                WORD32 qp_offset_v,
+                WORD32 tc_offset_div2,
+                WORD32 filter_flag_p,
+                WORD32 filter_flag_q,
+                UWORD8 bit_depth);
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz;
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_a9q;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_a9q;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert_a9q;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz_a9q;
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_a9a;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_a9a;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert_a9a;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz_a9a;
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_neonintr;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_neonintr;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert_neonintr;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz_neonintr;
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_ssse3;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_ssse3;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert_ssse3;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz_ssse3;
+
+ihevc_hbd_deblk_luma_vert_ft ihevc_hbd_deblk_luma_vert;
+ihevc_hbd_deblk_luma_horz_ft ihevc_hbd_deblk_luma_horz;
+ihevc_hbd_deblk_chroma_vert_ft ihevc_hbd_deblk_chroma_vert;
+ihevc_hbd_deblk_chroma_horz_ft ihevc_hbd_deblk_chroma_horz;
+
+ihevc_hbd_deblk_luma_vert_ft ihevc_hbd_deblk_luma_vert_sse42;
+ihevc_hbd_deblk_luma_horz_ft ihevc_hbd_deblk_luma_horz_sse42;
+ihevc_hbd_deblk_chroma_vert_ft ihevc_hbd_deblk_chroma_vert_sse42;
+ihevc_hbd_deblk_chroma_horz_ft ihevc_hbd_deblk_chroma_horz_sse42;
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_av8;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_av8;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert_av8;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz_av8;
+
+#endif /*_IHEVC_DEBLK_H_*/
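+
+/* Usage sketch (hypothetical init code, not part of this header): each
+* kernel above has one prototype per flavour (a9q/a9a = ARM NEON assembly,
+* neonintr = NEON intrinsics, ssse3/sse42 = x86 SIMD, av8 = ARMv8), so an
+* init routine can select one at runtime: */
+#if 0
+static ihevc_deblk_luma_vert_ft *select_deblk_demo(WORD32 have_neon)
+{
+    return have_neon ? &ihevc_deblk_luma_vert_a9q : &ihevc_deblk_luma_vert;
+}
+#endif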
diff --git a/common/ihevc_deblk_edge_filter.c b/common/ihevc_deblk_edge_filter.c
new file mode 100644
index 0000000..8b6e6ea
--- /dev/null
+++ b/common/ihevc_deblk_edge_filter.c
@@ -0,0 +1,1510 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_deblk_edge_filter.c
+*
+* @brief
+*  Contains function definitions for deblocking filters
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*   - ihevc_deblk_luma_vert()
+*   - ihevc_deblk_luma_horz()
+*   - ihevc_deblk_chroma_vert()
+*   - ihevc_deblk_chroma_horz()
+*   - ihevc_hbd_deblk_luma_vert()
+*   - ihevc_hbd_deblk_luma_horz()
+*   - ihevc_hbd_deblk_chroma_vert()
+*   - ihevc_hbd_deblk_chroma_horz()
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_deblk.h"
+#include "ihevc_deblk_tables.h"
+#include "ihevc_debug.h"
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Decision process and filtering for the luma block vertical edge.
+*
+* @par Description:
+*     The decision process for the luma block vertical edge is carried out and
+*     an appropriate filter is applied. The boundary filter strength, bs, should
+*     be greater than 0. The pcm flags and the transquant bypass flags should
+*     be taken care of by the calling function.
+*
+* @param[in] pu1_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] bs
+*  Boundary filter strength of q(0,0)
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] beta_offset_div2
+*  beta offset from the slice header, divided by 2
+*
+* @param[in] tc_offset_div2
+*  tc offset from the slice header, divided by 2
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_deblk_luma_vert(UWORD8 *pu1_src,
+                           WORD32 src_strd,
+                           WORD32 bs,
+                           WORD32 quant_param_p,
+                           WORD32 quant_param_q,
+                           WORD32 beta_offset_div2,
+                           WORD32 tc_offset_div2,
+                           WORD32 filter_flag_p,
+                           WORD32 filter_flag_q)
+{
+    WORD32 qp_luma, beta_indx, tc_indx;
+    WORD32 beta, tc;
+    WORD32 dp0, dp3, dq0, dq3, d0, d3, dp, dq, d;
+    WORD32 d_sam0, d_sam3;
+    WORD32 de, dep, deq;
+    WORD32 row;
+    WORD32 tmp_p0, tmp_p1, tmp_p2, tmp_q0, tmp_q1, tmp_q2;
+    WORD32 delta, delta_p, delta_q;
+
+    ASSERT((bs > 0) && (bs <= 3));
+    ASSERT(filter_flag_p || filter_flag_q);
+
+    qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+    beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+    /* BS, based on this implementation, can take the value 3 for an intra/inter edge */
+    /* based on BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
+    /* for BS = 1 the adding factor is (0*2), for BS = 2 or 3 it is (1*2)              */
+    /* the desired functionality above is achieved by doing (2*(bs>>1))                */
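+    /* Worked example: with bs = 2, qp_luma = 30 and tc_offset_div2 = 1,           */
+    /* tc_indx = CLIP3(30 + 2 * (2 >> 1) + 2, 0, 53) = 34                          */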
+
+    tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
+
+    beta = gai4_ihevc_beta_table[beta_indx];
+    tc = gai4_ihevc_tc_table[tc_indx];
+    if(0 == tc)
+    {
+        return;
+    }
+
+    dq0 = ABS(pu1_src[2] - 2 * pu1_src[1] + pu1_src[0]);
+    dq3 = ABS(pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]
+                    + pu1_src[3 * src_strd + 0]);
+    dp0 = ABS(pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1]);
+    dp3 = ABS(pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]
+                    + pu1_src[3 * src_strd - 1]);
+
+    d0 = dp0 + dq0;
+    d3 = dp3 + dq3;
+
+    dp = dp0 + dp3;
+    dq = dq0 + dq3;
+
+    d = d0 + d3;
+
+    de = 0;
+    dep = 0;
+    deq = 0;
+
+    if(d < beta)
+    {
+        d_sam0 = 0;
+        if((2 * d0 < (beta >> 2))
+                        && (ABS(pu1_src[3] - pu1_src[0]) + ABS(pu1_src[-1] - pu1_src[-4])
+                                        < (beta >> 3))
+                        && ABS(pu1_src[0] - pu1_src[-1]) < ((5 * tc + 1) >> 1))
+        {
+            d_sam0 = 1;
+        }
+
+        pu1_src += 3 * src_strd;
+        d_sam3 = 0;
+        if((2 * d3 < (beta >> 2))
+                        && (ABS(pu1_src[3] - pu1_src[0]) + ABS(pu1_src[-1] - pu1_src[-4])
+                                        < (beta >> 3))
+                        && ABS(pu1_src[0] - pu1_src[-1]) < ((5 * tc + 1) >> 1))
+        {
+            d_sam3 = 1;
+        }
+        pu1_src -= 3 * src_strd;
+
+        de = (d_sam0 == 1 && d_sam3 == 1) ? 2 : 1;
+        dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
+        deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
+        if(tc <= 1)
+        {
+            dep = 0;
+            deq = 0;
+        }
+    }
+
+    if(de != 0)
+    {
+        for(row = 0; row < 4; row++)
+        {
+            tmp_p0 = pu1_src[-1];
+            tmp_p1 = pu1_src[-2];
+            tmp_p2 = pu1_src[-3];
+
+            tmp_q0 = pu1_src[0];
+            tmp_q1 = pu1_src[1];
+            tmp_q2 = pu1_src[2];
+
+            if(de == 2)
+            {
+                tmp_q0 = CLIP3((pu1_src[2] + 2 * pu1_src[1] +
+                                2 * pu1_src[0] + 2 * pu1_src[-1] +
+                                pu1_src[-2] + 4) >> 3,
+                                pu1_src[0] - 2 * tc,
+                                pu1_src[0] + 2 * tc);
+
+                tmp_q1 = CLIP3((pu1_src[2] + pu1_src[1] + pu1_src[0] +
+                                pu1_src[-1] + 2) >> 2,
+                                pu1_src[1] - 2 * tc,
+                                pu1_src[1] + 2 * tc);
+
+                tmp_q2 = CLIP3((2 * pu1_src[3] + 3 * pu1_src[2] +
+                                pu1_src[1] + pu1_src[0] +
+                                pu1_src[-1] + 4) >> 3,
+                                pu1_src[2] - 2 * tc,
+                                pu1_src[2] + 2 * tc);
+
+                tmp_p0 = CLIP3((pu1_src[1] + 2 * pu1_src[0] +
+                                2 * pu1_src[-1] + 2 * pu1_src[-2] +
+                                pu1_src[-3] + 4) >> 3,
+                                pu1_src[-1] - 2 * tc,
+                                pu1_src[-1] + 2 * tc);
+
+                tmp_p1 = CLIP3((pu1_src[0] + pu1_src[-1] +
+                                pu1_src[-2] + pu1_src[-3] + 2) >> 2,
+                                pu1_src[-2] - 2 * tc,
+                                pu1_src[-2] + 2 * tc);
+
+                tmp_p2 = CLIP3((pu1_src[0] + pu1_src[-1] +
+                                pu1_src[-2] + 3 * pu1_src[-3] +
+                                2 * pu1_src[-4] + 4) >> 3,
+                                pu1_src[-3] - 2 * tc,
+                                pu1_src[-3] + 2 * tc);
+            }
+            else
+            {
+                delta = (9 * (pu1_src[0] - pu1_src[-1]) -
+                                3 * (pu1_src[1] - pu1_src[-2]) + 8) >> 4;
+                if(ABS(delta) < 10 * tc)
+                {
+                    delta = CLIP3(delta, -tc, tc);
+
+                    tmp_p0 = CLIP_U8(pu1_src[-1] + delta);
+                    tmp_q0 = CLIP_U8(pu1_src[0] - delta);
+
+                    if(dep == 1)
+                    {
+                        delta_p = CLIP3((((pu1_src[-3] + pu1_src[-1] + 1) >> 1)
+                                        - pu1_src[-2] + delta) >> 1,
+                                        -(tc >> 1),
+                                        (tc >> 1));
+                        tmp_p1 = CLIP_U8(pu1_src[-2] + delta_p);
+                    }
+
+                    if(deq == 1)
+                    {
+                        delta_q = CLIP3((((pu1_src[2] + pu1_src[0] + 1) >> 1)
+                                        - pu1_src[1] - delta) >> 1,
+                                        -(tc >> 1),
+                                        (tc >> 1));
+                        tmp_q1 = CLIP_U8(pu1_src[1] + delta_q);
+                    }
+                }
+            }
+
+            if(filter_flag_p != 0)
+            {
+                pu1_src[-3] = tmp_p2;
+                pu1_src[-2] = tmp_p1;
+                pu1_src[-1] = tmp_p0;
+            }
+
+            if(filter_flag_q != 0)
+            {
+                pu1_src[0] = tmp_q0;
+                pu1_src[1] = tmp_q1;
+                pu1_src[2] = tmp_q2;
+            }
+
+            pu1_src += src_strd;
+        }
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Decision process and filtering for the luma block vertical edge for high bit depth.
+*
+* @par Description:
+*     The decision process for the luma block vertical edge is  carried out and
+*     an appropriate filter is applied. The  boundary filter strength, bs should
+*     be greater than 0.  The pcm flags and the transquant bypass flags should
+*     be  taken care of by the calling function.
+*
+* @param[in] pu2_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] bs
+*  Boundary filter strength of q(0,0)
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] beta_offset_div2
+*  beta offset divided by 2, used while deriving the beta threshold
+*
+* @param[in] tc_offset_div2
+*  tc offset divided by 2, used while deriving the tc threshold
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @param[in] bit_depth
+*  bit depth of the source samples
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_hbd_deblk_luma_vert(UWORD16 *pu2_src,
+                               WORD32 src_strd,
+                               WORD32 bs,
+                               WORD32 quant_param_p,
+                               WORD32 quant_param_q,
+                               WORD32 beta_offset_div2,
+                               WORD32 tc_offset_div2,
+                               WORD32 filter_flag_p,
+                               WORD32 filter_flag_q,
+                               UWORD8 bit_depth)
+{
+    WORD32 qp_luma, beta_indx, tc_indx;
+    WORD32 beta, tc;
+    WORD32 dp0, dp3, dq0, dq3, d0, d3, dp, dq, d;
+    WORD32 d_sam0, d_sam3;
+    WORD32 de, dep, deq;
+    WORD32 row;
+    WORD32 tmp_p0, tmp_p1, tmp_p2, tmp_q0, tmp_q1, tmp_q2;
+    WORD32 delta, delta_p, delta_q;
+
+    ASSERT((bs > 0) && (bs <= 3));
+    ASSERT(filter_flag_p || filter_flag_q);
+
+    qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+    beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+    /* based on the implementation, BS can take the value 3 for an intra/inter edge  */
+    /* from BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset*/
+    /* for BS = 1 the adding factor is (0*2); for BS = 2 or 3 it is (1*2)            */
+    /* the above desired functionality is achieved by doing (2*(bs>>1))              */
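+    /* e.g. qp_luma = 30, bs = 2, tc_offset_div2 = 0 gives tc_indx = 32,             */
+    /* i.e. gai4_ihevc_tc_table[32] = 3 before the bit-depth scaling below           */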
+
+    tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
+
+    beta = gai4_ihevc_beta_table[beta_indx] * (1 << (bit_depth - 8));
+    tc = gai4_ihevc_tc_table[tc_indx] * (1 << (bit_depth - 8));
+    if(0 == tc)
+    {
+        return;
+    }
+
+    dq0 = ABS(pu2_src[2] - 2 * pu2_src[1] + pu2_src[0]);
+    dq3 = ABS(pu2_src[3 * src_strd + 2] - 2 * pu2_src[3 * src_strd + 1]
+                    + pu2_src[3 * src_strd + 0]);
+    dp0 = ABS(pu2_src[-3] - 2 * pu2_src[-2] + pu2_src[-1]);
+    dp3 = ABS(pu2_src[3 * src_strd - 3] - 2 * pu2_src[3 * src_strd - 2]
+                    + pu2_src[3 * src_strd - 1]);
+
+    d0 = dp0 + dq0;
+    d3 = dp3 + dq3;
+
+    dp = dp0 + dp3;
+    dq = dq0 + dq3;
+
+    d = d0 + d3;
+
+    de = 0;
+    dep = 0;
+    deq = 0;
+
+    if(d < beta)
+    {
+        d_sam0 = 0;
+        if((2 * d0 < (beta >> 2))
+                        && (ABS(pu2_src[3] - pu2_src[0]) + ABS(pu2_src[-1] - pu2_src[-4])
+                                        < (beta >> 3))
+                        && ABS(pu2_src[0] - pu2_src[-1]) < ((5 * tc + 1) >> 1))
+        {
+            d_sam0 = 1;
+        }
+
+        pu2_src += 3 * src_strd;
+        d_sam3 = 0;
+        if((2 * d3 < (beta >> 2))
+                        && (ABS(pu2_src[3] - pu2_src[0]) + ABS(pu2_src[-1] - pu2_src[-4])
+                                        < (beta >> 3))
+                        && ABS(pu2_src[0] - pu2_src[-1]) < ((5 * tc + 1) >> 1))
+        {
+            d_sam3 = 1;
+        }
+        pu2_src -= 3 * src_strd;
+
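+        /* de = 2 selects the strong filter for all four lines, de = 1 the weak */
+        /* filter; dep/deq additionally allow the weak filter to modify p1/q1   */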
+        de = (d_sam0 == 1 && d_sam3 == 1) ? 2 : 1;
+        dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+        deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+        if(tc <= 1)
+        {
+            dep = 0;
+            deq = 0;
+        }
+    }
+
+    if(de != 0)
+    {
+        for(row = 0; row < 4; row++)
+        {
+            tmp_p0 = pu2_src[-1];
+            tmp_p1 = pu2_src[-2];
+            tmp_p2 = pu2_src[-3];
+
+            tmp_q0 = pu2_src[0];
+            tmp_q1 = pu2_src[1];
+            tmp_q2 = pu2_src[2];
+
+            if(de == 2)
+            {
+                tmp_q0 = CLIP3((pu2_src[2] + 2 * pu2_src[1] +
+                                2 * pu2_src[0] + 2 * pu2_src[-1] +
+                                pu2_src[-2] + 4) >> 3,
+                                pu2_src[0] - 2 * tc,
+                                pu2_src[0] + 2 * tc);
+
+                tmp_q1 = CLIP3((pu2_src[2] + pu2_src[1] + pu2_src[0] +
+                                pu2_src[-1] + 2) >> 2,
+                                pu2_src[1] - 2 * tc,
+                                pu2_src[1] + 2 * tc);
+
+                tmp_q2 = CLIP3((2 * pu2_src[3] + 3 * pu2_src[2] +
+                                pu2_src[1] + pu2_src[0] +
+                                pu2_src[-1] + 4) >> 3,
+                                pu2_src[2] - 2 * tc,
+                                pu2_src[2] + 2 * tc);
+
+                tmp_p0 = CLIP3((pu2_src[1] + 2 * pu2_src[0] +
+                                2 * pu2_src[-1] + 2 * pu2_src[-2] +
+                                pu2_src[-3] + 4) >> 3,
+                                pu2_src[-1] - 2 * tc,
+                                pu2_src[-1] + 2 * tc);
+
+                tmp_p1 = CLIP3((pu2_src[0] + pu2_src[-1] +
+                                pu2_src[-2] + pu2_src[-3] + 2) >> 2,
+                                pu2_src[-2] - 2 * tc,
+                                pu2_src[-2] + 2 * tc);
+
+                tmp_p2 = CLIP3((pu2_src[0] + pu2_src[-1] +
+                                pu2_src[-2] + 3 * pu2_src[-3] +
+                                2 * pu2_src[-4] + 4) >> 3,
+                                pu2_src[-3] - 2 * tc,
+                                pu2_src[-3] + 2 * tc);
+            }
+            else
+            {
+                delta = (9 * (pu2_src[0] - pu2_src[-1]) -
+                                3 * (pu2_src[1] - pu2_src[-2]) + 8) >> 4;
+                if(ABS(delta) < 10 * tc)
+                {
+                    delta = CLIP3(delta, -tc, tc);
+
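+                    /* clip to the sample range of the current bit depth,
+                       e.g. 0..1023 for bit_depth = 10 */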
+                    tmp_p0 = CLIP3(pu2_src[-1] + delta, 0, ((1 << bit_depth) - 1));
+                    tmp_q0 = CLIP3(pu2_src[0] - delta, 0, ((1 << bit_depth) - 1));
+                    if(dep == 1)
+                    {
+                        delta_p = CLIP3((((pu2_src[-3] + pu2_src[-1] + 1) >> 1)
+                                        - pu2_src[-2] + delta) >> 1,
+                                        -(tc >> 1),
+                                        (tc >> 1));
+                        tmp_p1 = CLIP3(pu2_src[-2] + delta_p, 0, ((1 << bit_depth) - 1));
+                    }
+
+                    if(deq == 1)
+                    {
+                        delta_q = CLIP3((((pu2_src[2] + pu2_src[0] + 1) >> 1)
+                                        - pu2_src[1] - delta) >> 1,
+                                        -(tc >> 1),
+                                        (tc >> 1));
+                        tmp_q1 = CLIP3(pu2_src[1] + delta_q, 0, ((1 << bit_depth) - 1));
+                    }
+                }
+            }
+
+            if(filter_flag_p != 0)
+            {
+                pu2_src[-3] = tmp_p2;
+                pu2_src[-2] = tmp_p1;
+                pu2_src[-1] = tmp_p0;
+            }
+
+            if(filter_flag_q != 0)
+            {
+                pu2_src[0] = tmp_q0;
+                pu2_src[1] = tmp_q1;
+                pu2_src[2] = tmp_q2;
+            }
+
+            pu2_src += src_strd;
+        }
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*
+*     Decision process and filtering for the luma block horizontal edge
+*
+* @par Description:
+*     The decision process for the luma block horizontal edge  is carried out
+*    and an appropriate filter is applied. The  boundary filter strength, bs
+*    should be greater than 0.  The pcm flags and the transquant bypass flags
+*    should be  taken care of by the calling function.
+*
+* @param[in] pu1_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] bs
+*  Boundary filter strength of q(0,0)
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] beta_offset_div2
+*  beta offset divided by 2, used while deriving the beta threshold
+*
+* @param[in] tc_offset_div2
+*  tc offset divided by 2, used while deriving the tc threshold
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_deblk_luma_horz(UWORD8 *pu1_src,
+                           WORD32 src_strd,
+                           WORD32 bs,
+                           WORD32 quant_param_p,
+                           WORD32 quant_param_q,
+                           WORD32 beta_offset_div2,
+                           WORD32 tc_offset_div2,
+                           WORD32 filter_flag_p,
+                           WORD32 filter_flag_q)
+{
+    WORD32 qp_luma, beta_indx, tc_indx;
+    WORD32 beta, tc;
+    WORD32 dp0, dp3, dq0, dq3, d0, d3, dp, dq, d;
+    WORD32 d_sam0, d_sam3;
+    WORD32 de, dep, deq;
+    WORD32 col;
+    WORD32 tmp_p0, tmp_p1, tmp_p2, tmp_q0, tmp_q1, tmp_q2;
+    WORD32 delta, delta_p, delta_q;
+
+    ASSERT((bs > 0));
+    ASSERT(filter_flag_p || filter_flag_q);
+
+    qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+    beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+    /* based on the implementation, BS can take the value 3 for an intra/inter edge  */
+    /* from BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset*/
+    /* for BS = 1 the adding factor is (0*2); for BS = 2 or 3 it is (1*2)            */
+    /* the above desired functionality is achieved by doing (2*(bs>>1))              */
+
+    tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);
+
+    beta = gai4_ihevc_beta_table[beta_indx];
+    tc = gai4_ihevc_tc_table[tc_indx];
+    if(0 == tc)
+    {
+        return;
+    }
+
+    dq0 = ABS(pu1_src[2 * src_strd] - 2 * pu1_src[1 * src_strd] +
+                    pu1_src[0 * src_strd]);
+
+    dq3 = ABS(pu1_src[3 + 2 * src_strd] - 2 * pu1_src[3 + 1 * src_strd] +
+                    pu1_src[3 + 0 * src_strd]);
+
+    dp0 = ABS(pu1_src[-3 * src_strd] - 2 * pu1_src[-2 * src_strd] +
+                    pu1_src[-1 * src_strd]);
+
+    dp3 = ABS(pu1_src[3 - 3 * src_strd] - 2 * pu1_src[3 - 2 * src_strd] +
+                    pu1_src[3 - 1 * src_strd]);
+
+    d0 = dp0 + dq0;
+    d3 = dp3 + dq3;
+
+    dp = dp0 + dp3;
+    dq = dq0 + dq3;
+
+    d = d0 + d3;
+
+    de = 0;
+    dep = 0;
+    deq = 0;
+
+    if(d < beta)
+    {
+        d_sam0 = 0;
+        if((2 * d0 < (beta >> 2))
+                        && (ABS(pu1_src[3 * src_strd] - pu1_src[0 * src_strd]) +
+                                        ABS(pu1_src[-1 * src_strd] - pu1_src[-4 * src_strd])
+                                        < (beta >> 3))
+                        && ABS(pu1_src[0 * src_strd] - pu1_src[-1 * src_strd])
+                        < ((5 * tc + 1) >> 1))
+        {
+            d_sam0 = 1;
+        }
+
+        pu1_src += 3;
+        d_sam3 = 0;
+        if((2 * d3 < (beta >> 2))
+                        && (ABS(pu1_src[3 * src_strd] - pu1_src[0 * src_strd]) +
+                                        ABS(pu1_src[-1 * src_strd] - pu1_src[-4 * src_strd])
+                                        < (beta >> 3))
+                        && ABS(pu1_src[0 * src_strd] - pu1_src[-1 * src_strd])
+                        < ((5 * tc + 1) >> 1))
+        {
+            d_sam3 = 1;
+        }
+        pu1_src -= 3;
+
+        de = (d_sam0 == 1 && d_sam3 == 1) ? 2 : 1;
+        dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+        deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+        if(tc <= 1)
+        {
+            dep = 0;
+            deq = 0;
+        }
+    }
+
+    if(de != 0)
+    {
+        for(col = 0; col < 4; col++)
+        {
+            tmp_p0 = pu1_src[-1 * src_strd];
+            tmp_p1 = pu1_src[-2 * src_strd];
+            tmp_p2 = pu1_src[-3 * src_strd];
+
+            tmp_q0 = pu1_src[0 * src_strd];
+            tmp_q1 = pu1_src[1 * src_strd];
+            tmp_q2 = pu1_src[2 * src_strd];
+            if(de == 2)
+            {
+                tmp_q0 = CLIP3((pu1_src[2 * src_strd] +
+                                2 * pu1_src[1 * src_strd] +
+                                2 * pu1_src[0 * src_strd] +
+                                2 * pu1_src[-1 * src_strd] +
+                                pu1_src[-2 * src_strd] + 4) >> 3,
+                                pu1_src[0 * src_strd] - 2 * tc,
+                                pu1_src[0 * src_strd] + 2 * tc);
+
+                tmp_q1 = CLIP3((pu1_src[2 * src_strd] +
+                                pu1_src[1 * src_strd] +
+                                pu1_src[0 * src_strd] +
+                                pu1_src[-1 * src_strd] + 2) >> 2,
+                                pu1_src[1 * src_strd] - 2 * tc,
+                                pu1_src[1 * src_strd] + 2 * tc);
+
+                tmp_q2 = CLIP3((2 * pu1_src[3 * src_strd] +
+                                3 * pu1_src[2 * src_strd] +
+                                pu1_src[1 * src_strd] +
+                                pu1_src[0 * src_strd] +
+                                pu1_src[-1 * src_strd] + 4) >> 3,
+                                pu1_src[2 * src_strd] - 2 * tc,
+                                pu1_src[2 * src_strd] + 2 * tc);
+
+                tmp_p0 = CLIP3((pu1_src[1 * src_strd] +
+                                2 * pu1_src[0 * src_strd] +
+                                2 * pu1_src[-1 * src_strd] +
+                                2 * pu1_src[-2 * src_strd] +
+                                pu1_src[-3 * src_strd] + 4) >> 3,
+                                pu1_src[-1 * src_strd] - 2 * tc,
+                                pu1_src[-1 * src_strd] + 2 * tc);
+
+                tmp_p1 = CLIP3((pu1_src[0 * src_strd] +
+                                pu1_src[-1 * src_strd] +
+                                pu1_src[-2 * src_strd] +
+                                pu1_src[-3 * src_strd] + 2) >> 2,
+                                pu1_src[-2 * src_strd] - 2 * tc,
+                                pu1_src[-2 * src_strd] + 2 * tc);
+
+                tmp_p2 = CLIP3((pu1_src[0 * src_strd] +
+                                pu1_src[-1 * src_strd] +
+                                pu1_src[-2 * src_strd] +
+                                3 * pu1_src[-3 * src_strd] +
+                                2 * pu1_src[-4 * src_strd] + 4) >> 3,
+                                pu1_src[-3 * src_strd] - 2 * tc,
+                                pu1_src[-3 * src_strd] + 2 * tc);
+            }
+            else
+            {
+                delta = (9 * (pu1_src[0 * src_strd] - pu1_src[-1 * src_strd]) -
+                                3 * (pu1_src[1 * src_strd] - pu1_src[-2 * src_strd]) +
+                                8) >> 4;
+                if(ABS(delta) < 10 * tc)
+                {
+                    delta = CLIP3(delta, -tc, tc);
+
+                    tmp_p0 = CLIP_U8(pu1_src[-1 * src_strd] + delta);
+                    tmp_q0 = CLIP_U8(pu1_src[0 * src_strd] - delta);
+
+                    if(dep == 1)
+                    {
+                        delta_p = CLIP3((((pu1_src[-3 * src_strd] +
+                                        pu1_src[-1 * src_strd] + 1) >> 1) -
+                                        pu1_src[-2 * src_strd] + delta) >> 1,
+                                        -(tc >> 1),
+                                        (tc >> 1));
+                        tmp_p1 = CLIP_U8(pu1_src[-2 * src_strd] + delta_p);
+                    }
+
+                    if(deq == 1)
+                    {
+                        delta_q = CLIP3((((pu1_src[2 * src_strd] +
+                                        pu1_src[0 * src_strd] + 1) >> 1) -
+                                        pu1_src[1 * src_strd] - delta) >> 1,
+                                        -(tc >> 1),
+                                        (tc >> 1));
+                        tmp_q1 = CLIP_U8(pu1_src[1 * src_strd] + delta_q);
+                    }
+                }
+            }
+
+            if(filter_flag_p != 0)
+            {
+                pu1_src[-3 * src_strd] = tmp_p2;
+                pu1_src[-2 * src_strd] = tmp_p1;
+                pu1_src[-1 * src_strd] = tmp_p0;
+            }
+
+            if(filter_flag_q != 0)
+            {
+                pu1_src[0 * src_strd] = tmp_q0;
+                pu1_src[1 * src_strd] = tmp_q1;
+                pu1_src[2 * src_strd] = tmp_q2;
+            }
+
+            pu1_src += 1;
+        }
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*
+*     Decision process and filtering for the luma block horizontal edge for high bit depth
+*
+* @par Description:
+*     The decision process for the luma block horizontal edge  is carried out
+*    and an appropriate filter is applied. The  boundary filter strength, bs
+*    should be greater than 0.  The pcm flags and the transquant bypass flags
+*    should be  taken care of by the calling function.
+*
+* @param[in] pu2_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] bs
+*  Boundary filter strength of q(0,0)
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] beta_offset_div2
+*  beta offset divided by 2, used while deriving the beta threshold
+*
+* @param[in] tc_offset_div2
+*  tc offset divided by 2, used while deriving the tc threshold
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @param[in] bit_depth
+*  bit depth of the source samples
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_hbd_deblk_luma_horz(UWORD16 *pu2_src,
+                               WORD32 src_strd,
+                               WORD32 bs,
+                               WORD32 quant_param_p,
+                               WORD32 quant_param_q,
+                               WORD32 beta_offset_div2,
+                               WORD32 tc_offset_div2,
+                               WORD32 filter_flag_p,
+                               WORD32 filter_flag_q,
+                               UWORD8 bit_depth)
+{
+    WORD32 qp_luma, beta_indx, tc_indx;
+    WORD32 beta, tc;
+    WORD32 dp0, dp3, dq0, dq3, d0, d3, dp, dq, d;
+    WORD32 d_sam0, d_sam3;
+    WORD32 de, dep, deq;
+    WORD32 col;
+    WORD32 tmp_p0, tmp_p1, tmp_p2, tmp_q0, tmp_q1, tmp_q2;
+    WORD32 delta, delta_p, delta_q;
+
+    ASSERT((bs > 0));
+    ASSERT(filter_flag_p || filter_flag_q);
+
+    qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+    beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+    /* based on the implementation, BS can take the value 3 for an intra/inter edge  */
+    /* from BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset*/
+    /* for BS = 1 the adding factor is (0*2); for BS = 2 or 3 it is (1*2)            */
+    /* the above desired functionality is achieved by doing (2*(bs>>1))              */
+
+    tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);
+
+    beta = gai4_ihevc_beta_table[beta_indx] * (1 << (bit_depth - 8));
+    tc = gai4_ihevc_tc_table[tc_indx] * (1 << (bit_depth - 8));
+    if(0 == tc)
+    {
+        return;
+    }
+
+    dq0 = ABS(pu2_src[2 * src_strd] - 2 * pu2_src[1 * src_strd] +
+                    pu2_src[0 * src_strd]);
+
+    dq3 = ABS(pu2_src[3 + 2 * src_strd] - 2 * pu2_src[3 + 1 * src_strd] +
+                    pu2_src[3 + 0 * src_strd]);
+
+    dp0 = ABS(pu2_src[-3 * src_strd] - 2 * pu2_src[-2 * src_strd] +
+                    pu2_src[-1 * src_strd]);
+
+    dp3 = ABS(pu2_src[3 - 3 * src_strd] - 2 * pu2_src[3 - 2 * src_strd] +
+                    pu2_src[3 - 1 * src_strd]);
+
+    d0 = dp0 + dq0;
+    d3 = dp3 + dq3;
+
+    dp = dp0 + dp3;
+    dq = dq0 + dq3;
+
+    d = d0 + d3;
+
+    de = 0;
+    dep = 0;
+    deq = 0;
+
+    if(d < beta)
+    {
+        d_sam0 = 0;
+        if((2 * d0 < (beta >> 2))
+                        && (ABS(pu2_src[3 * src_strd] - pu2_src[0 * src_strd]) +
+                                        ABS(pu2_src[-1 * src_strd] - pu2_src[-4 * src_strd])
+                                        < (beta >> 3))
+                        && ABS(pu2_src[0 * src_strd] - pu2_src[-1 * src_strd])
+                        < ((5 * tc + 1) >> 1))
+        {
+            d_sam0 = 1;
+        }
+
+        pu2_src += 3;
+        d_sam3 = 0;
+        if((2 * d3 < (beta >> 2))
+                        && (ABS(pu2_src[3 * src_strd] - pu2_src[0 * src_strd]) +
+                                        ABS(pu2_src[-1 * src_strd] - pu2_src[-4 * src_strd])
+                                        < (beta >> 3))
+                        && ABS(pu2_src[0 * src_strd] - pu2_src[-1 * src_strd])
+                        < ((5 * tc + 1) >> 1))
+        {
+            d_sam3 = 1;
+        }
+        pu2_src -= 3;
+
+        de = (d_sam0 == 1 && d_sam3 == 1) ? 2 : 1;
+        dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+        deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+        if(tc <= 1)
+        {
+            dep = 0;
+            deq = 0;
+        }
+    }
+
+    if(de != 0)
+    {
+        for(col = 0; col < 4; col++)
+        {
+            tmp_p0 = pu2_src[-1 * src_strd];
+            tmp_p1 = pu2_src[-2 * src_strd];
+            tmp_p2 = pu2_src[-3 * src_strd];
+
+            tmp_q0 = pu2_src[0 * src_strd];
+            tmp_q1 = pu2_src[1 * src_strd];
+            tmp_q2 = pu2_src[2 * src_strd];
+            if(de == 2)
+            {
+                tmp_q0 = CLIP3((pu2_src[2 * src_strd] +
+                                2 * pu2_src[1 * src_strd] +
+                                2 * pu2_src[0 * src_strd] +
+                                2 * pu2_src[-1 * src_strd] +
+                                pu2_src[-2 * src_strd] + 4) >> 3,
+                                pu2_src[0 * src_strd] - 2 * tc,
+                                pu2_src[0 * src_strd] + 2 * tc);
+
+                tmp_q1 = CLIP3((pu2_src[2 * src_strd] +
+                                pu2_src[1 * src_strd] +
+                                pu2_src[0 * src_strd] +
+                                pu2_src[-1 * src_strd] + 2) >> 2,
+                                pu2_src[1 * src_strd] - 2 * tc,
+                                pu2_src[1 * src_strd] + 2 * tc);
+
+                tmp_q2 = CLIP3((2 * pu2_src[3 * src_strd] +
+                                3 * pu2_src[2 * src_strd] +
+                                pu2_src[1 * src_strd] +
+                                pu2_src[0 * src_strd] +
+                                pu2_src[-1 * src_strd] + 4) >> 3,
+                                pu2_src[2 * src_strd] - 2 * tc,
+                                pu2_src[2 * src_strd] + 2 * tc);
+
+                tmp_p0 = CLIP3((pu2_src[1 * src_strd] +
+                                2 * pu2_src[0 * src_strd] +
+                                2 * pu2_src[-1 * src_strd] +
+                                2 * pu2_src[-2 * src_strd] +
+                                pu2_src[-3 * src_strd] + 4) >> 3,
+                                pu2_src[-1 * src_strd] - 2 * tc,
+                                pu2_src[-1 * src_strd] + 2 * tc);
+
+                tmp_p1 = CLIP3((pu2_src[0 * src_strd] +
+                                pu2_src[-1 * src_strd] +
+                                pu2_src[-2 * src_strd] +
+                                pu2_src[-3 * src_strd] + 2) >> 2,
+                                pu2_src[-2 * src_strd] - 2 * tc,
+                                pu2_src[-2 * src_strd] + 2 * tc);
+
+                tmp_p2 = CLIP3((pu2_src[0 * src_strd] +
+                                pu2_src[-1 * src_strd] +
+                                pu2_src[-2 * src_strd] +
+                                3 * pu2_src[-3 * src_strd] +
+                                2 * pu2_src[-4 * src_strd] + 4) >> 3,
+                                pu2_src[-3 * src_strd] - 2 * tc,
+                                pu2_src[-3 * src_strd] + 2 * tc);
+            }
+            else
+            {
+                delta = (9 * (pu2_src[0 * src_strd] - pu2_src[-1 * src_strd]) -
+                                3 * (pu2_src[1 * src_strd] - pu2_src[-2 * src_strd]) +
+                                8) >> 4;
+                if(ABS(delta) < 10 * tc)
+                {
+                    delta = CLIP3(delta, -tc, tc);
+                    tmp_p0 = CLIP3(pu2_src[-1 * src_strd] + delta, 0, ((1 << bit_depth) - 1));
+                    tmp_q0 = CLIP3(pu2_src[0 * src_strd] - delta, 0, ((1 << bit_depth) - 1));
+                    if(dep == 1)
+                    {
+                        delta_p = CLIP3((((pu2_src[-3 * src_strd] +
+                                        pu2_src[-1 * src_strd] + 1) >> 1) -
+                                        pu2_src[-2 * src_strd] + delta) >> 1,
+                                        -(tc >> 1),
+                                        (tc >> 1));
+                        tmp_p1 = CLIP3(pu2_src[-2 * src_strd] + delta_p, 0, ((1 << bit_depth) - 1));
+                    }
+
+                    if(deq == 1)
+                    {
+                        delta_q = CLIP3((((pu2_src[2 * src_strd] +
+                                        pu2_src[0 * src_strd] + 1) >> 1) -
+                                        pu2_src[1 * src_strd] - delta) >> 1,
+                                        -(tc >> 1),
+                                        (tc >> 1));
+                        tmp_q1 = CLIP3(pu2_src[1 * src_strd] + delta_q, 0, ((1 << bit_depth) - 1));
+                    }
+                }
+            }
+
+            if(filter_flag_p != 0)
+            {
+                pu2_src[-3 * src_strd] = tmp_p2;
+                pu2_src[-2 * src_strd] = tmp_p1;
+                pu2_src[-1 * src_strd] = tmp_p0;
+            }
+
+            if(filter_flag_q != 0)
+            {
+                pu2_src[0 * src_strd] = tmp_q0;
+                pu2_src[1 * src_strd] = tmp_q1;
+                pu2_src[2 * src_strd] = tmp_q2;
+            }
+
+            pu2_src += 1;
+        }
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Filtering for the chroma block vertical edge.
+*
+* @par Description:
+*     Filter for chroma vertical edge. The  boundary filter strength, bs
+*    should be greater than 1.  The pcm flags and the transquant bypass flags
+*    should be  taken care of by the calling function.
+*
+* @param[in] pu1_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] qp_offset_u
+*  chroma QP offset for the U (Cb) component
+*
+* @param[in] qp_offset_v
+*  chroma QP offset for the V (Cr) component
+*
+* @param[in] tc_offset_div2
+*  tc offset divided by 2, used while deriving the tc threshold
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_deblk_chroma_vert(UWORD8 *pu1_src,
+                             WORD32 src_strd,
+                             WORD32 quant_param_p,
+                             WORD32 quant_param_q,
+                             WORD32 qp_offset_u,
+                             WORD32 qp_offset_v,
+                             WORD32 tc_offset_div2,
+                             WORD32 filter_flag_p,
+                             WORD32 filter_flag_q)
+{
+    WORD32 qp_indx_u, qp_chroma_u;
+    WORD32 qp_indx_v, qp_chroma_v;
+    WORD32 tc_indx_u, tc_u;
+    WORD32 tc_indx_v, tc_v;
+    WORD32 delta_u, tmp_p0_u, tmp_q0_u;
+    WORD32 delta_v, tmp_p0_v, tmp_q0_v;
+    WORD32 row;
+
+    ASSERT(filter_flag_p || filter_flag_q);
+
+    /* chroma processing is done only if BS is 2             */
+    /* this function is assumed to be called only if BS is 2 */
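+    /* chroma QP: gai4_ihevc_qp_table maps indices 0..57 (Table 8-9); above */
+    /* 57 QpC = qPi - 6, and negative indices are passed through unchanged  */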
+    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
+
+    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_u = gai4_ihevc_tc_table[tc_indx_u];
+
+    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_v = gai4_ihevc_tc_table[tc_indx_v];
+
+    if(0 == tc_u && 0 == tc_v)
+    {
+        return;
+    }
+
+    for(row = 0; row < 4; row++)
+    {
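+        /* interleaved semi-planar chroma: offsets -2/0 address U (Cb) and
+           offsets -1/1 address V (Cr) on either side of the vertical edge */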
+        delta_u = CLIP3((((pu1_src[0] - pu1_src[-2]) << 2) +
+                        pu1_src[-4] - pu1_src[2] + 4) >> 3,
+                        -tc_u, tc_u);
+
+        tmp_p0_u = CLIP_U8(pu1_src[-2] + delta_u);
+        tmp_q0_u = CLIP_U8(pu1_src[0] - delta_u);
+
+        delta_v = CLIP3((((pu1_src[1] - pu1_src[-1]) << 2) +
+                        pu1_src[-3] - pu1_src[3] + 4) >> 3,
+                        -tc_v, tc_v);
+
+        tmp_p0_v = CLIP_U8(pu1_src[-1] + delta_v);
+        tmp_q0_v = CLIP_U8(pu1_src[1] - delta_v);
+
+        if(filter_flag_p != 0)
+        {
+            pu1_src[-2] = tmp_p0_u;
+            pu1_src[-1] = tmp_p0_v;
+        }
+
+        if(filter_flag_q != 0)
+        {
+            pu1_src[0] = tmp_q0_u;
+            pu1_src[1] = tmp_q0_v;
+        }
+
+        pu1_src += src_strd;
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Filtering for the chroma block vertical edge for high bit depth.
+*
+* @par Description:
+*     Filter for chroma vertical edge. The  boundary filter strength, bs
+*    should be greater than 1.  The pcm flags and the transquant bypass flags
+*    should be  taken care of by the calling function.
+*
+* @param[in] pu2_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] qp_offset_u
+*  chroma QP offset for the U (Cb) component
+*
+* @param[in] qp_offset_v
+*  chroma QP offset for the V (Cr) component
+*
+* @param[in] tc_offset_div2
+*  tc offset divided by 2, used while deriving the tc threshold
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @param[in] bit_depth
+*  bit depth of the source samples
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_hbd_deblk_chroma_vert(UWORD16 *pu2_src,
+                                 WORD32 src_strd,
+                                 WORD32 quant_param_p,
+                                 WORD32 quant_param_q,
+                                 WORD32 qp_offset_u,
+                                 WORD32 qp_offset_v,
+                                 WORD32 tc_offset_div2,
+                                 WORD32 filter_flag_p,
+                                 WORD32 filter_flag_q,
+                                 UWORD8 bit_depth)
+{
+    WORD32 qp_indx_u, qp_chroma_u;
+    WORD32 qp_indx_v, qp_chroma_v;
+    WORD32 tc_indx_u, tc_u;
+    WORD32 tc_indx_v, tc_v;
+    WORD32 delta_u, tmp_p0_u, tmp_q0_u;
+    WORD32 delta_v, tmp_p0_v, tmp_q0_v;
+    WORD32 row;
+
+    ASSERT(filter_flag_p || filter_flag_q);
+
+    /* chroma processing is done only if BS is 2             */
+    /* this function is assumed to be called only if BS is 2 */
+    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
+
+    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_u = gai4_ihevc_tc_table[tc_indx_u] * (1 << (bit_depth - 8));
+
+    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_v = gai4_ihevc_tc_table[tc_indx_v] * (1 << (bit_depth - 8));
+
+    if(0 == tc_u && 0 == tc_v)
+    {
+        return;
+    }
+
+    for(row = 0; row < 4; row++)
+    {
+        delta_u = CLIP3((((pu2_src[0] - pu2_src[-2]) << 2) +
+                        pu2_src[-4] - pu2_src[2] + 4) >> 3,
+                        -tc_u, tc_u);
+        tmp_p0_u = CLIP3(pu2_src[-2] + delta_u, 0, ((1 << bit_depth) - 1));
+        tmp_q0_u = CLIP3(pu2_src[0] - delta_u, 0, ((1 << bit_depth) - 1));
+
+        delta_v = CLIP3((((pu2_src[1] - pu2_src[-1]) << 2) +
+                        pu2_src[-3] - pu2_src[3] + 4) >> 3,
+                        -tc_v, tc_v);
+        tmp_p0_v = CLIP3(pu2_src[-1] + delta_v, 0, ((1 << bit_depth) - 1));
+        tmp_q0_v = CLIP3(pu2_src[1] - delta_v, 0, ((1 << bit_depth) - 1));
+        if(filter_flag_p != 0)
+        {
+            pu2_src[-2] = tmp_p0_u;
+            pu2_src[-1] = tmp_p0_v;
+        }
+
+        if(filter_flag_q != 0)
+        {
+            pu2_src[0] = tmp_q0_u;
+            pu2_src[1] = tmp_q0_v;
+        }
+
+        pu2_src += src_strd;
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Filtering for the chroma block horizontal edge.
+*
+* @par Description:
+*     Filter for chroma horizontal edge. The  boundary filter strength, bs
+*    should be greater than 1.  The pcm flags and the transquant bypass flags
+*    should be  taken care of by the calling function.
+*
+* @param[in] pu1_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] qp_offset_u
+*  chroma QP offset for the U (Cb) component
+*
+* @param[in] qp_offset_v
+*  chroma QP offset for the V (Cr) component
+*
+* @param[in] tc_offset_div2
+*  tc offset divided by 2, used while deriving the tc threshold
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_deblk_chroma_horz(UWORD8 *pu1_src,
+                             WORD32 src_strd,
+                             WORD32 quant_param_p,
+                             WORD32 quant_param_q,
+                             WORD32 qp_offset_u,
+                             WORD32 qp_offset_v,
+                             WORD32 tc_offset_div2,
+                             WORD32 filter_flag_p,
+                             WORD32 filter_flag_q)
+{
+    WORD32 qp_indx_u, qp_chroma_u;
+    WORD32 qp_indx_v, qp_chroma_v;
+    WORD32 tc_indx_u, tc_u;
+    WORD32 tc_indx_v, tc_v;
+    WORD32 tc;
+
+    WORD32 delta, tmp_p0, tmp_q0;
+    WORD32 col;
+
+    ASSERT(filter_flag_p || filter_flag_q);
+
+    /* chroma processing is done only if BS is 2             */
+    /* this function is assumed to be called only if BS is 2 */
+    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
+
+    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_u = gai4_ihevc_tc_table[tc_indx_u];
+
+    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_v = gai4_ihevc_tc_table[tc_indx_v];
+
+    if(0 == tc_u && 0 == tc_v)
+    {
+        return;
+    }
+
+    for(col = 0; col < 8; col++)
+    {
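+        /* 8 interleaved chroma columns: even columns are U (Cb), odd are V (Cr) */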
+        tc = (col & 1) ? tc_v : tc_u;
+        delta = CLIP3((((pu1_src[0 * src_strd] -
+                      pu1_src[-1 * src_strd]) << 2) +
+                      pu1_src[-2 * src_strd] -
+                      pu1_src[1 * src_strd] + 4) >> 3,
+                      -tc, tc);
+
+        tmp_p0 = CLIP_U8(pu1_src[-1 * src_strd] + delta);
+        tmp_q0 = CLIP_U8(pu1_src[0 * src_strd] - delta);
+
+        if(filter_flag_p != 0)
+        {
+            pu1_src[-1 * src_strd] = tmp_p0;
+        }
+
+        if(filter_flag_q != 0)
+        {
+            pu1_src[0 * src_strd] = tmp_q0;
+        }
+
+        pu1_src += 1;
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Filtering for the chroma block horizontal edge for high bit depth.
+*
+* @par Description:
+*     Filter for chroma horizontal edge. The  boundary filter strength, bs
+*    should be greater than 1.  The pcm flags and the transquant bypass flags
+*    should be  taken care of by the calling function.
+*
+* @param[in] pu2_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] qp_offset_u
+*  chroma QP offset for the U (Cb) component
+*
+* @param[in] qp_offset_v
+*  chroma QP offset for the V (Cr) component
+*
+* @param[in] tc_offset_div2
+*  tc offset divided by 2, used while deriving the tc threshold
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @param[in] bit_depth
+*  bit depth of the source samples
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_hbd_deblk_chroma_horz(UWORD16 *pu2_src,
+                                 WORD32 src_strd,
+                                 WORD32 quant_param_p,
+                                 WORD32 quant_param_q,
+                                 WORD32 qp_offset_u,
+                                 WORD32 qp_offset_v,
+                                 WORD32 tc_offset_div2,
+                                 WORD32 filter_flag_p,
+                                 WORD32 filter_flag_q,
+                                 UWORD8 bit_depth)
+{
+    WORD32 qp_indx_u, qp_chroma_u;
+    WORD32 qp_indx_v, qp_chroma_v;
+    WORD32 tc_indx_u, tc_u;
+    WORD32 tc_indx_v, tc_v;
+    WORD32 tc;
+
+    WORD32 delta, tmp_p0, tmp_q0;
+    WORD32 col;
+
+    ASSERT(filter_flag_p || filter_flag_q);
+
+    /* chroma processing is done only if BS is 2             */
+    /* this function is assumed to be called only if BS is 2 */
+    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
+
+    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_u = gai4_ihevc_tc_table[tc_indx_u] * (1 << (bit_depth - 8));
+
+    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_v = gai4_ihevc_tc_table[tc_indx_v] * (1 << (bit_depth - 8));
+
+    if(0 == tc_u && 0 == tc_v)
+    {
+        return;
+    }
+
+    for(col = 0; col < 8; col++)
+    {
+        tc = (col & 1) ? tc_v : tc_u;
+        delta = CLIP3((((pu2_src[0 * src_strd] -
+                      pu2_src[-1 * src_strd]) << 2) +
+                      pu2_src[-2 * src_strd] -
+                      pu2_src[1 * src_strd] + 4) >> 3,
+                      -tc, tc);
+        tmp_p0 = CLIP3(pu2_src[-1 * src_strd] + delta, 0, ((1 << bit_depth) - 1));
+        tmp_q0 = CLIP3(pu2_src[0 * src_strd] - delta, 0, ((1 << bit_depth) - 1));
+
+        if(filter_flag_p != 0)
+        {
+            pu2_src[-1 * src_strd] = tmp_p0;
+        }
+
+        if(filter_flag_q != 0)
+        {
+            pu2_src[0 * src_strd] = tmp_q0;
+        }
+
+        pu2_src += 1;
+    }
+
+}
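+
+/* A minimal usage sketch (illustrative only; the pointers, strides and QP
+ * variables below are hypothetical and would normally be supplied by the
+ * CTB-level deblocking loop):
+ *
+ *     // vertical luma edge at q(0,0): bs = 2, zero offsets, filter both sides
+ *     ihevc_deblk_luma_vert(pu1_luma_edge, luma_strd, 2,
+ *                           qp_p, qp_q, 0, 0, 1, 1);
+ *
+ *     // matching edge on the interleaved UV plane (called only when bs is 2)
+ *     ihevc_deblk_chroma_vert(pu1_chroma_edge, chroma_strd, qp_p, qp_q,
+ *                             cb_qp_offset, cr_qp_offset, 0, 1, 1);
+ */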
diff --git a/common/ihevc_deblk_tables.c b/common/ihevc_deblk_tables.c
new file mode 100644
index 0000000..6fd9e58
--- /dev/null
+++ b/common/ihevc_deblk_tables.c
@@ -0,0 +1,78 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_deblk_tables.c
+*
+* @brief
+*  Contains tables used for deblock filters
+*
+* @author
+*  Srinivas T
+*
+* @par List of Tables:
+* gai4_ihevc_beta_table
+* gai4_ihevc_tc_table
+* gai4_ihevc_qp_table
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#include "ihevc_typedefs.h"
+#include "ihevc_deblk_tables.h"
+
+/**
+ * Beta table for deblocking
+ * Table 8-10 - Derivation of threshold variables beta and tc from input Q
+ */
+const WORD32 gai4_ihevc_beta_table[52] =
+{    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+    16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38,
+    40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64
+};
+
+
+/**
+ * Tc table for deblocking
+ * Table 8-10 - Derivation of threshold variables beta and tc from input Q
+ */
+const WORD32 gai4_ihevc_tc_table[54] =
+{
+     0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,
+     1,  1,  1,  1,  1,  1,  1,  1,  1,
+     2,  2,  2,  2,  3,  3,  3,  3,  4,
+     4,  4,  5,  5,  6,  6,  7,  8,  9,
+    10, 11, 13, 14, 16, 18, 20, 22, 24
+};
+
+/**
+ * QP table for deblocking
+ * Table 8-9 - Specification of QpC as a function of qPi
+ */
+const WORD32 gai4_ihevc_qp_table[58] =
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
+    17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32,
+    33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44,
+    45, 46, 47, 48, 49, 50, 51
+};
diff --git a/common/ihevc_deblk_tables.h b/common/ihevc_deblk_tables.h
new file mode 100644
index 0000000..6387881
--- /dev/null
+++ b/common/ihevc_deblk_tables.h
@@ -0,0 +1,45 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_deblk_tables.h
+*
+* @brief
+*  Tables used for deblock filters
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_DEBLK_TABLES_H_
+#define _IHEVC_DEBLK_TABLES_H_
+
+extern const WORD32 gai4_ihevc_beta_table[52];
+
+extern const WORD32 gai4_ihevc_tc_table[54];
+
+extern const WORD32 gai4_ihevc_qp_table[58];
+
+#endif /*_IHEVC_DEBLK_TABLES_H_*/
diff --git a/common/ihevc_debug.h b/common/ihevc_debug.h
new file mode 100644
index 0000000..4b87e47
--- /dev/null
+++ b/common/ihevc_debug.h
@@ -0,0 +1,70 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_debug.h
+*
+* @brief
+*  Definitions for codec debugging
+*
+* @author
+*  Ittiam
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_DEBUG_H_
+#define _IHEVC_DEBUG_H_
+
+#ifdef DEBUG_PRINT
+
+#define DEBUG(...)                                                 \
+{                                                                           \
+    printf("\n[HEVC DBG] %s/%d:: ", __FUNCTION__, __LINE__);                \
+    printf(__VA_ARGS__);                                                    \
+}
+
+#else //DEBUG_PRINT
+
+#define DEBUG(...) {}
+
+#endif //DEBUG_PRINT
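+
+/* usage sketch (hypothetical variable): DEBUG("nal type %d", nal_type)
+ * prints "[HEVC DBG] <function>/<line>:: nal type ..." when DEBUG_PRINT
+ * is defined and expands to an empty block otherwise */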
+
+#if 1
+
+#define ASSERT(x) assert((x))
+//#define ASSERT(x) ihevcd_debug_assert((x))
+
+#else
+#define ASSERT(x)                        \
+{                                        \
+    if (!(x))                            \
+    {                                    \
+        printf("ASSERT %s %d\n", __FILE__, __LINE__);              \
+        exit(-1);                        \
+    }                                    \
+}
+#endif
+
+#endif /* _IHEVC_DEBUG_H_ */
+
diff --git a/common/ihevc_defs.h b/common/ihevc_defs.h
new file mode 100644
index 0000000..bd92d7d
--- /dev/null
+++ b/common/ihevc_defs.h
@@ -0,0 +1,457 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_defs.h
+*
+* @brief
+*  Definitions used in the codec
+*
+* @author
+*  Ittiam
+*
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_DEFS_H_
+#define _IHEVC_DEFS_H_
+
+/*****************************************************************************/
+/* Profile and Levels                                                        */
+/*****************************************************************************/
+enum
+{
+    IHEVC_PROFILE_MAIN = 0,
+};
+
+enum
+{
+    IHEVC_TIER_MAIN,
+    IHEVC_TIER_HIGH,
+};
+
+
+/* Slice type enums - Do not change the values */
+
+enum
+{
+    BSLICE = 0,
+    PSLICE = 1,
+    ISLICE = 2,
+};
+
+/** Enum for level: the spec multiplies the level by 30; since the API takes
+ * the level already multiplied by 10, only a further multiplication by 3 is
+ * needed here (e.g. level 4.1 is passed as 41 and stored as 41 * 3 = 123)
+ */
+enum
+{
+    IHEVC_LEVEL_10 = 10 * 3,
+    IHEVC_LEVEL_20 = 20 * 3,
+    IHEVC_LEVEL_21 = 21 * 3,
+    IHEVC_LEVEL_30 = 30 * 3,
+    IHEVC_LEVEL_31 = 31 * 3,
+    IHEVC_LEVEL_40 = 40 * 3,
+    IHEVC_LEVEL_41 = 41 * 3,
+    IHEVC_LEVEL_50 = 50 * 3,
+    IHEVC_LEVEL_51 = 51 * 3,
+    IHEVC_LEVEL_52 = 52 * 3,
+    IHEVC_LEVEL_60 = 60 * 3,
+    IHEVC_LEVEL_61 = 61 * 3,
+    IHEVC_LEVEL_62 = 62 * 3,
+};
+
+
+enum
+{
+    NAL_TRAIL_N     = 0,
+    NAL_TRAIL_R,
+    NAL_TSA_N,
+    NAL_TSA_R,
+    NAL_STSA_N,
+    NAL_STSA_R,
+    NAL_RADL_N,
+    NAL_RADL_R,
+    NAL_RASL_N,
+    NAL_RASL_R,
+    NAL_RSV_VCL_N10 = 10,
+    NAL_RSV_VCL_N12 = 12,
+    NAL_RSV_VCL_N14 = 14,
+    NAL_RSV_VCL_R11 = 11,
+    NAL_RSV_VCL_R13 = 13,
+    NAL_RSV_VCL_R15 = 15,
+
+    NAL_BLA_W_LP    = 16,
+    NAL_BLA_W_DLP,
+    NAL_BLA_N_LP,
+    NAL_IDR_W_LP,
+    NAL_IDR_N_LP,
+    NAL_CRA,
+    NAL_RSV_RAP_VCL22 = 22,
+    NAL_RSV_RAP_VCL23 = 23,
+    NAL_RSV_VCL24 = 24,
+    NAL_RSV_VCL31 = 31,
+    NAL_VPS       = 32,
+    NAL_SPS,
+    NAL_PPS,
+    NAL_AUD,
+    NAL_EOS,
+    NAL_EOB,
+    NAL_FD,
+    NAL_PREFIX_SEI = 39,
+    NAL_SUFFIX_SEI = 40,
+    NAL_RSV_NVCL41 = 41,
+    NAL_RSV_NVCL47 = 47 ,
+    NAL_UNSPEC48   = 48 ,
+    NAL_UNSPEC63   = 63,
+};
+
+enum
+{
+    CHROMA_FMT_IDC_MONOCHROME   = 0,
+    CHROMA_FMT_IDC_YUV420       = 1,
+    CHROMA_FMT_IDC_YUV422       = 2,
+    CHROMA_FMT_IDC_YUV444       = 3,
+    CHROMA_FMT_IDC_YUV444_PLANES = 4,
+};
+
+/* Pred Modes */
+/* Do not change enum values */
+enum
+{
+    PRED_MODE_INTER = 0,
+    PRED_MODE_INTRA = 1,
+    PRED_MODE_SKIP  = 2
+};
+
+/* Partition Modes */
+/* Do not change enum values */
+enum
+{
+    PART_2Nx2N  = 0,
+    PART_2NxN   = 1,
+    PART_Nx2N   = 2,
+    PART_NxN    = 3,
+    PART_2NxnU  = 4,
+    PART_2NxnD  = 5,
+    PART_nLx2N  = 6,
+    PART_nRx2N  = 7
+};
+
+/* Prediction list */
+/* Do not change enum values */
+enum
+{
+    PRED_L0 = 0,
+    PRED_L1 = 1,
+    PRED_BI = 2
+};
+
+/**
+ * Scan types
+ */
+enum
+{
+    SCAN_DIAG_UPRIGHT,
+    SCAN_HORZ,
+    SCAN_VERT
+};
+
+/**
+ * VUI aspect ratio indicator
+ */
+enum
+{
+    SAR_UNUSED = 0,
+    SAR_1_1 = 1,
+    SAR_12_11,
+    SAR_10_11,
+    SAR_16_11,
+    SAR_40_33,
+    SAR_24_11,
+    SAR_20_11,
+    SAR_32_11,
+    SAR_80_33,
+    SAR_18_11,
+    SAR_15_11,
+    SAR_64_33,
+    SAR_160_99,
+    SAR_4_3,
+    SAR_3_2,
+    SAR_2_1,
+    EXTENDED_SAR = 255
+};
+
+enum
+{
+    VID_FMT_COMPONENT = 0,
+    VID_FMT_PAL,
+    VID_FMT_NTSC,
+    VID_FMT_SECAM,
+    VID_FMT_MAC,
+    VID_FMT_UNSPECIFIED
+};
+
+#define BIT_DEPTH           8
+#define BIT_DEPTH_LUMA      BIT_DEPTH
+#define BIT_DEPTH_CHROMA    BIT_DEPTH
+/*****************************************************************************/
+/* Profile tier level defs                                                   */
+/*****************************************************************************/
+#define MAX_PROFILE_COMPATBLTY 32
+
+/*****************************************************************************/
+/* Reference frame defs                                                      */
+/*****************************************************************************/
+/* Maximum DPB size */
+#define MAX_DPB_SIZE 16
+
+
+/*****************************************************************************/
+/* VPS restrictions                                                          */
+/*****************************************************************************/
+
+/* Number of VPS allowed  in Main Profile */
+#define MAX_VPS_CNT         16
+
+/* Max sub layers in VPS */
+#define VPS_MAX_SUB_LAYERS  7
+
+/* Max number of HRD parameters */
+#define VPS_MAX_HRD_PARAMS  2
+
+/* Maximum number of operation point layers */
+#define VPS_MAX_OP_LAYERS 2
+
+
+/*****************************************************************************/
+/* Tile restrictions                                                         */
+/*****************************************************************************/
+/* Minimum tile width in Main Profile */
+#define MIN_TILE_WD  MIN_CTB_SIZE
+
+/* Minimum tile height in Main Profile */
+#define MIN_TILE_HT  MIN_CTB_SIZE
+
+/*****************************************************************************/
+/* SPS restrictions                                                          */
+/*****************************************************************************/
+
+/* Number of SPS allowed in Main Profile*/
+/* An extra buffer is allocated to write the parsed data
+ * It is copied to the appropriate location later */
+#define MAX_SPS_CNT         (16 + 1)
+
+/* Max sub layers in PPS */
+#define SPS_MAX_SUB_LAYERS  7
+
+/* Maximum long term reference pics */
+#define MAX_LTREF_PICS_SPS 16
+
+#define MAX_STREF_PICS_SPS 64
+
+/*****************************************************************************/
+/* PPS restrictions                                                          */
+/*****************************************************************************/
+
+/* Number of PPS allowed in Main Profile */
+/* An extra buffer is allocated to write the parsed data
+ * It is copied to the appropriate location later */
+#define MAX_PPS_CNT         (64 + 1)
+
+/*****************************************************************************/
+/* Macro definitions for sizes of CTB, PU, TU, CU                            */
+/*****************************************************************************/
+
+/* CTB Size Range */
+#define MAX_CTB_SIZE        64
+#define MIN_CTB_SIZE        16
+
+/* TU Size Range */
+#define MAX_TU_SIZE         32
+#define MIN_TU_SIZE         4
+
+/* Max Transform Size */
+#define MAX_TRANS_SIZE      (MAX_TU_SIZE*MAX_TU_SIZE)
+
+/* PU Size Range */
+#define MAX_PU_SIZE         64
+#define MIN_PU_SIZE         4
+
+/* CU Size Range */
+#define MAX_CU_SIZE         64
+#define MIN_CU_SIZE         8
+
+
+/* Number of max TU in a CTB row */
+#define MAX_TU_IN_CTB_ROW   ((MAX_CTB_SIZE / MIN_TU_SIZE))
+
+/* Number of max CU in a CTB row */
+#define MAX_CU_IN_CTB_ROW   ((MAX_CTB_SIZE / MIN_CU_SIZE))
+
+/* Number of max PU in a CTB row */
+#define MAX_PU_IN_CTB_ROW   ((MAX_CTB_SIZE / MIN_PU_SIZE))
+
+/* Number of max CU in a CTB */
+#define MAX_CU_IN_CTB       ((MAX_CTB_SIZE / MIN_CU_SIZE) * \
+                             (MAX_CTB_SIZE / MIN_CU_SIZE))
+
+/* Number of max PU in a CTB */
+/*****************************************************************************/
+/* Note though for 64 x 64 CTB, Max PU in CTB is 128, in order to store      */
+/*  intra pred info, 256 entries are needed                                  */
+/*****************************************************************************/
+#define MAX_PU_IN_CTB       ((MAX_CTB_SIZE / MIN_PU_SIZE) * \
+                             (MAX_CTB_SIZE / MIN_PU_SIZE))
+
+/* Number of max TU in a CTB */
+#define MAX_TU_IN_CTB       ((MAX_CTB_SIZE / MIN_TU_SIZE) * \
+                             (MAX_CTB_SIZE / MIN_TU_SIZE))
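+/* with MAX_CTB_SIZE = 64 and MIN_TU_SIZE = 4 this evaluates to 16 * 16 = 256 */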
+
+
+
+/**
+ * Maximum transform depths
+ */
+#define MAX_TRAFO_DEPTH 5
+
+
+
+
+/* Max number of deblocking edges */
+#define MAX_VERT_DEBLK_EDGES ((MAX_CTB_SIZE/8) * (MAX_CTB_SIZE/4))
+#define MAX_HORZ_DEBLK_EDGES ((MAX_CTB_SIZE/4) * (MAX_CTB_SIZE/8))
+
+/* QP cannot change below the 8x8 level */
+#define MAX_DEBLK_QP_CNT     ((MAX_CTB_SIZE/8) * (MAX_CTB_SIZE/8))
+
+/*****************************************************************************/
+/* Parsing related macros                                                    */
+/*****************************************************************************/
+#define SUBBLK_COEFF_CNT    16
+
+/* Quant and Trans defs */
+
+/*****************************************************************************/
+/* Sizes for Transform functions                                             */
+/*****************************************************************************/
+#define TRANS_SIZE_4   4
+#define TRANS_SIZE_8   8
+#define TRANS_SIZE_16 16
+#define TRANS_SIZE_32 32
+
+
+#define IT_SHIFT_STAGE_1 7
+#define IT_SHIFT_STAGE_2 12
+
+/**
+ * @brief  Maximum transform dynamic range (excluding sign bit)
+ */
+#define MAX_TR_DYNAMIC_RANGE  15
+
+/**
+ * @brief  Q(QP%6) * IQ(QP%6) = 2^20
+ */
+#define QUANT_IQUANT_SHIFT    20
+
+/**
+ * @brief Q factor for Qp%6 multiplication
+ */
+#define QUANT_SHIFT           14
+
+/**
+ * @brief Q shift factor for flat rescale matrix weights
+ */
+#define FLAT_RESCALE_MAT_Q_SHIFT    11
+
+/**
+ * @brief  Scaling matrix is represented in Q15 format
+ */
+#define SCALING_Q_SHIFT       15
+
+/**
+ * @brief  rounding factor for quantization represented in Q9 format
+ */
+#define QUANT_ROUND_FACTOR_Q   9
+
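+/* Worked example of the fixed-point relationships above: a forward scale of
+ * 16384 (unity in Q14, QUANT_SHIFT) pairs with an inverse scale of 64 (unity
+ * in Q6), and 16384 * 64 == 1 << 20 == 1 << QUANT_IQUANT_SHIFT, which is the
+ * identity Q(QP%6) * IQ(QP%6) = 2^20 stated above. */
+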
+/**
+ * @brief  Minimum qp supported in HEVC spec
+ */
+#define MIN_HEVC_QP 0
+
+/**
+ * @brief  Maximum qp supported in HEVC spec
+ */
+#define MAX_HEVC_QP 51  // For the 8-bit (Main branch) encoder
+
+#define MAX_HEVC_QP_10bit 63  // For the high bit depth (HBD branch) encoder
+
+
+/**
+ * @brief  Total number of transform sizes
+ * used for sizeID while getting scale matrix
+ */
+#define NUM_UNIQUE_TRANS_SIZE 4
+
+/*****************************************************************************/
+/* Number of scaling matrices for each transform size                        */
+/*****************************************************************************/
+#define SCALE_MAT_CNT_TRANS_SIZE_4    6
+#define SCALE_MAT_CNT_TRANS_SIZE_8    6
+#define SCALE_MAT_CNT_TRANS_SIZE_16   6
+#define SCALE_MAT_CNT_TRANS_SIZE_32   2
+
+/* Maximum number of scale matrices for a given transform size */
+#define SCALE_MAT_CNT_MAX_PER_TRANS_SIZE 6
+
+/* Total number of scale matrices */
+#define TOTAL_SCALE_MAT_COUNT   (SCALE_MAT_CNT_TRANS_SIZE_4     + \
+                                 SCALE_MAT_CNT_TRANS_SIZE_8     + \
+                                 SCALE_MAT_CNT_TRANS_SIZE_16    + \
+                                 SCALE_MAT_CNT_TRANS_SIZE_32)
+
+
+/*****************************************************************************/
+/* Intra pred Macros                                                         */
+/*****************************************************************************/
+/** Planar Intra prediction mode */
+#define INTRA_PLANAR             0
+
+/** DC Intra prediction mode */
+#define INTRA_DC                 1
+
+/** Gives angular mode for intra prediction */
+#define INTRA_ANGULAR(x) (x)
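+/* e.g. INTRA_ANGULAR(10) is the pure horizontal and INTRA_ANGULAR(26) the
+ * pure vertical direction among the HEVC angular modes 2 to 34 */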
+
+/** Following is used to signal no intra prediction in case of pcm blocks
+ */
+#define INTRA_PRED_NONE  63
+
+
+/** Following is used to signal no intra prediction is needed for first three
+ * 4x4 luma blocks in case of 4x4 TU sizes
+ * Also used in pcm cases
+ */
+#define INTRA_PRED_CHROMA_IDX_NONE  7
+
+
+
+#endif /*__IHEVC_DEFS_H_*/
diff --git a/common/ihevc_disp_mgr.c b/common/ihevc_disp_mgr.c
new file mode 100644
index 0000000..e52b2fc
--- /dev/null
+++ b/common/ihevc_disp_mgr.c
@@ -0,0 +1,188 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_disp_mgr.c
+*
+* @brief
+*  Contains function definitions for display management
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*   - ihevc_disp_mgr_init()
+*   - ihevc_disp_mgr_add()
+*   - ihevc_disp_mgr_get()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#include <stdlib.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_disp_mgr.h"
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Initialization function for display buffer manager
+*
+* @par Description:
+*    Initializes the display buffer management structure
+*
+* @param[in] ps_disp_mgr
+*  Pointer to the display buffer management structure
+*
+* @returns none
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_disp_mgr_init(
+                disp_mgr_t *ps_disp_mgr)
+{
+    WORD32 id;
+
+    ps_disp_mgr->u4_last_abs_poc = DEFAULT_POC;
+
+    for(id = 0; id < DISP_MGR_MAX_CNT; id++)
+    {
+        ps_disp_mgr->ai4_abs_poc[id] = DEFAULT_POC;
+        ps_disp_mgr->apv_ptr[id] = NULL;
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Adds a buffer to the display manager
+*
+* @par Description:
+*      Adds a buffer to the display buffer manager
+*
+* @param[in] ps_disp_mgr
+*  Pointer to the display buffer management structure
+*
+* @param[in] buf_id
+*  ID of the display buffer
+*
+* @param[in] abs_poc
+*  Absolute POC of the display buffer
+*
+* @param[in] pv_ptr
+*  Pointer to the display buffer
+*
+* @returns  0 if success, -1 otherwise
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+WORD32 ihevc_disp_mgr_add(disp_mgr_t *ps_disp_mgr,
+                          WORD32 buf_id,
+                          WORD32 abs_poc,
+                          void *pv_ptr)
+{
+    if(buf_id >= DISP_MGR_MAX_CNT)
+    {
+        return (-1);
+    }
+
+    if(ps_disp_mgr->apv_ptr[buf_id] != NULL)
+    {
+        return (-1);
+    }
+
+    ps_disp_mgr->apv_ptr[buf_id] = pv_ptr;
+    ps_disp_mgr->ai4_abs_poc[buf_id] = abs_poc;
+    return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Gets the next buffer
+*
+* @par Description:
+*  Gets the next display buffer
+*
+* @param[in] ps_disp_mgr
+*  Pointer to the display buffer structure
+*
+* @param[out]  pi4_buf_id
+*  Pointer to hold buffer id of the display buffer being returned
+*
+* @returns  Pointer to the next display buffer
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void* ihevc_disp_mgr_get(
+                disp_mgr_t *ps_disp_mgr,
+                WORD32 *pi4_buf_id)
+{
+    WORD32 id;
+    void *pv_ret_ptr;
+    WORD32 i4_min_poc;
+    WORD32 min_poc_id;
+
+
+    pv_ret_ptr = NULL;
+    i4_min_poc = 0x7FFFFFFF;
+    min_poc_id = -1;
+
+    /* Find minimum POC */
+    for(id = 0; id < DISP_MGR_MAX_CNT; id++)
+    {
+        if((DEFAULT_POC != ps_disp_mgr->ai4_abs_poc[id]) &&
+                        (ps_disp_mgr->ai4_abs_poc[id] <= i4_min_poc))
+        {
+            i4_min_poc = ps_disp_mgr->ai4_abs_poc[id];
+            min_poc_id = id;
+        }
+    }
+    *pi4_buf_id = min_poc_id;
+    /* If all pocs are still default_poc then return NULL */
+    if(-1 == min_poc_id)
+    {
+        return NULL;
+    }
+
+    pv_ret_ptr = ps_disp_mgr->apv_ptr[min_poc_id];
+
+    /* Set abs poc to default and apv_ptr to null so that the buffer is not returned again */
+    ps_disp_mgr->apv_ptr[min_poc_id] = NULL;
+    ps_disp_mgr->ai4_abs_poc[min_poc_id] = DEFAULT_POC;
+    return pv_ret_ptr;
+}
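+
+#if 0
+/* Usage sketch (not compiled into the library): buffers are added in decode
+ * order and handed back in POC order. All ids, POCs and buffers below are
+ * hypothetical. */
+static void example_disp_mgr_usage(void)
+{
+    disp_mgr_t s_disp_mgr;
+    WORD32 buf_id;
+    void *pv_buf;
+    UWORD8 au1_frames[3];
+
+    ihevc_disp_mgr_init(&s_disp_mgr);
+
+    /* Decode order: POC 2, POC 0, POC 1 */
+    ihevc_disp_mgr_add(&s_disp_mgr, 0, 2, &au1_frames[0]);
+    ihevc_disp_mgr_add(&s_disp_mgr, 1, 0, &au1_frames[1]);
+    ihevc_disp_mgr_add(&s_disp_mgr, 2, 1, &au1_frames[2]);
+
+    /* Display order: POC 0, 1, 2; a further call returns NULL, buf_id -1 */
+    pv_buf = ihevc_disp_mgr_get(&s_disp_mgr, &buf_id); /* au1_frames[1], id 1 */
+    pv_buf = ihevc_disp_mgr_get(&s_disp_mgr, &buf_id); /* au1_frames[2], id 2 */
+    pv_buf = ihevc_disp_mgr_get(&s_disp_mgr, &buf_id); /* au1_frames[0], id 0 */
+    (void)pv_buf;
+}
+#endif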
diff --git a/common/ihevc_disp_mgr.h b/common/ihevc_disp_mgr.h
new file mode 100644
index 0000000..aa5bd29
--- /dev/null
+++ b/common/ihevc_disp_mgr.h
@@ -0,0 +1,71 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_disp_mgr.h
+*
+* @brief
+*  Function declarations used for display management
+*
+* @author
+*  Srinivas T
+*
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _DISP_MGR_H_
+#define _DISP_MGR_H_
+
+#define DISP_MGR_MAX_CNT 64
+#define DEFAULT_POC 0x7FFFFFFF
+
+typedef struct
+{
+    /**
+     * last_abs_poc
+     */
+    UWORD32 u4_last_abs_poc;
+
+    /**
+     * ai4_abs_poc[DISP_MGR_MAX_CNT]
+     */
+    WORD32 ai4_abs_poc[DISP_MGR_MAX_CNT];
+
+    /**
+     * apv_ptr[DISP_MGR_MAX_CNT]
+     */
+    void    *apv_ptr[DISP_MGR_MAX_CNT];
+}disp_mgr_t;
+
+void ihevc_disp_mgr_init(
+                disp_mgr_t *ps_disp_mgr);
+
+WORD32 ihevc_disp_mgr_add(
+                disp_mgr_t *ps_disp_mgr,
+                WORD32 id,
+                WORD32 abs_poc,
+                void *pv_ptr);
+
+void* ihevc_disp_mgr_get(disp_mgr_t *ps_disp_mgr,
+                         WORD32 *pi4_buf_id);
+
+#endif  //_DISP_MGR_H_
diff --git a/common/ihevc_dpb_mgr.c b/common/ihevc_dpb_mgr.c
new file mode 100644
index 0000000..7a2e032
--- /dev/null
+++ b/common/ihevc_dpb_mgr.c
@@ -0,0 +1,506 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_dpb_mgr.c
+ *
+ * @brief
+ *  Function definitions used for decoded picture buffer management
+ *
+ * @author
+ *  Srinivas T
+ *
+ * @par List of Functions:
+ *   - ihevc_dpb_mgr_init()
+ *   - ihevc_dpb_mgr_del_lt()
+ *   - ihevc_dpb_mgr_insert_lt()
+ *   - ihevc_dpb_mgr_del_st_or_make_lt()
+ *   - ihevc_dpb_mgr_insert_st()
+ *   - ihevc_dpb_mgr_reset()
+ *   - ihevc_dpb_mgr_release_pics()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_structs.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  DPB manager initializer
+ *
+ * @par Description:
+ *  Initialises the DPB manager structure
+ *
+ * @param[in] ps_dpb_mgr
+ *  Pointer to the DPB manager structure
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+void ihevc_dpb_mgr_init(dpb_mgr_t *ps_dpb_mgr)
+{
+    UWORD32 i;
+    dpb_info_t *ps_dpb_info = ps_dpb_mgr->as_dpb_info;
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        ps_dpb_info[i].ps_prev_dpb = NULL;
+        ps_dpb_info[i].ps_pic_buf = NULL;
+
+    }
+
+    ps_dpb_mgr->u1_num_ref_bufs = 0;
+    ps_dpb_mgr->ps_dpb_head = NULL;
+
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Adds a reference picture into the linked list
+ *
+ * @par Description:
+ *  Adds the reference buffer with the given buffer id into the DPB manager
+ *
+ *
+ * @param[in] ps_dpb_mgr
+ *  Pointer to the DPB manager structure
+ *
+ * @param[in] ps_pic_buf
+ *  Pointer to the picture buffer
+ *
+ * @param[in] buf_id
+ *  buffer id of the picture buffer
+ *
+ * @returns  0 if successful, -1 otherwise
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+WORD32 ihevc_dpb_mgr_insert_ref(dpb_mgr_t *ps_dpb_mgr,
+                                pic_buf_t *ps_pic_buf,
+                                WORD32 buf_id)
+{
+    int i;
+    dpb_info_t *ps_dpb_info;
+
+    ps_dpb_info = ps_dpb_mgr->as_dpb_info;
+
+    /* Return error if buffer is already present in the DPB */
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        if((ps_dpb_info[i].ps_pic_buf == ps_pic_buf)
+                        && (ps_dpb_info[i].ps_pic_buf->u1_used_as_ref))
+        {
+            return (-1);
+        }
+
+
+    }
+
+    /* Find an unused DPB location */
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        if(NULL == ps_dpb_info[i].ps_pic_buf)
+        {
+            break;
+        }
+    }
+    if(i == MAX_DPB_BUFS)
+    {
+        return (-1);
+    }
+
+    /* Create DPB info */
+    ps_dpb_info[i].ps_pic_buf = ps_pic_buf;
+    ps_dpb_info[i].ps_prev_dpb = ps_dpb_mgr->ps_dpb_head;
+    ps_dpb_info[i].ps_pic_buf->u1_buf_id = buf_id;
+    ps_dpb_info[i].ps_pic_buf->u1_used_as_ref = SHORT_TERM_REF;
+
+    /* update the head node of linked list to point to the current picture */
+    ps_dpb_mgr->ps_dpb_head = ps_dpb_info + i;
+
+    /* Increment Short term buffer count */
+    ps_dpb_mgr->u1_num_ref_bufs++;
+
+    return 0;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Deletes a reference buffer from the dpb manager
+ *
+ * @par Description:
+ *  Delete short term reference with a given POC from the linked
+ *  list
+ *
+ * @param[in] ps_dpb_mgr
+ *  Pointer to DPB Manager structure
+ *
+ * @param[in] ps_buf_mgr
+ *  Pointer to buffer manager structure
+ *
+ * @param[in] i4_abs_poc
+ *  Absolute POC of the node to be deleted
+ *
+ *
+ * @returns  None; the call silently returns if no node with the given POC
+ *  is found
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+void ihevc_dpb_mgr_del_ref(dpb_mgr_t *ps_dpb_mgr,
+                           buf_mgr_t *ps_buf_mgr,
+                           WORD32 i4_abs_poc)
+{
+    int i;
+    dpb_info_t *ps_next_dpb;
+
+    dpb_info_t *ps_unmark_node;
+
+    /* Find the node with matching absolute POC */
+    ps_next_dpb = ps_dpb_mgr->ps_dpb_head;
+    if(NULL == ps_next_dpb)
+    {
+        return;
+    }
+
+    if(ps_next_dpb->ps_pic_buf->i4_abs_poc == i4_abs_poc)
+    {
+        ps_unmark_node = ps_next_dpb;
+    }
+    else
+    {
+        for(i = 1; i < ps_dpb_mgr->u1_num_ref_bufs; i++)
+        {
+            if(ps_next_dpb->ps_prev_dpb->ps_pic_buf->i4_abs_poc == i4_abs_poc)
+                break;
+            ps_next_dpb = ps_next_dpb->ps_prev_dpb;
+        }
+
+        if(i == ps_dpb_mgr->u1_num_ref_bufs)
+        {
+            return;
+        }
+        else
+            ps_unmark_node = ps_next_dpb->ps_prev_dpb;
+    }
+
+    if(ps_unmark_node == ps_dpb_mgr->ps_dpb_head)
+    {
+        ps_dpb_mgr->ps_dpb_head = ps_unmark_node->ps_prev_dpb;
+    }
+    else
+    {
+        ps_next_dpb->ps_prev_dpb = ps_unmark_node->ps_prev_dpb; //update link
+        ps_unmark_node->ps_prev_dpb = NULL;
+    }
+    ps_dpb_mgr->u1_num_ref_bufs--; //decrement buffer count
+
+    /* Release the physical buffer */
+    ihevc_buf_mgr_release((buf_mgr_t *)ps_buf_mgr, ps_unmark_node->ps_pic_buf->u1_buf_id,
+                          BUF_MGR_REF);
+    ps_unmark_node->ps_prev_dpb = NULL;
+    ps_unmark_node->ps_pic_buf = NULL;
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Gets the reference buffer whose POC is nearest to the current POC
+ *
+ * @par Description:
+ *  Searches the DPB for the reference picture closest to cur_abs_poc: first
+ *  among pictures with a smaller POC and, failing that, by smallest absolute
+ *  POC difference
+ *
+ * @param[in] ps_dpb_mgr
+ *  Pointer to DPB Manager structure
+ *
+ * @param[in] cur_abs_poc
+ *  Absolute POC of the current picture
+ *
+ * @returns
+ *  Pointer to the nearest reference picture buffer, NULL if none is found
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_nearest_poc(dpb_mgr_t *ps_dpb_mgr, WORD32 cur_abs_poc)
+{
+    WORD32 i;
+    WORD32 min_diff = 0x7FFFFFFF;
+    pic_buf_t *ps_pic_buf = NULL;
+
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        if((ps_dpb_mgr->as_dpb_info[i].ps_pic_buf) &&
+                        (ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref != UNUSED_FOR_REF))
+        {
+            WORD32 poc_diff = cur_abs_poc - ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->i4_abs_poc;
+            if((poc_diff > 0) && (poc_diff < min_diff))
+            {
+                min_diff = poc_diff;
+                ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+            }
+        }
+    }
+
+    if(NULL == ps_pic_buf)
+    {
+        min_diff = 0x7FFFFFFF;
+        for(i = 0; i < MAX_DPB_BUFS; i++)
+        {
+            if((ps_dpb_mgr->as_dpb_info[i].ps_pic_buf) &&
+                            (ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref != UNUSED_FOR_REF))
+            {
+                WORD32 poc_diff = cur_abs_poc - ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->i4_abs_poc;
+                if(ABS(poc_diff) < min_diff)
+                {
+                    min_diff = ABS(poc_diff);
+                    ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+                }
+            }
+        }
+    }
+
+    return ps_pic_buf;
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Gets the reference buffer with a given absolute POC
+ *
+ * @par Description:
+ *  Returns the pointer to the picture buffer whose POC is equal to abs_poc
+ *
+ * @param[in] ps_dpb_mgr
+ *  Pointer to DPB Manager structure
+ *
+ * @param[in] abs_poc
+ *  POC of the buffer to be returned
+ *
+ * @returns
+ *  Pointer to the matching picture buffer, NULL if none is found
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_poc(dpb_mgr_t *ps_dpb_mgr, WORD32 abs_poc)
+{
+    UWORD32 i;
+    dpb_info_t *ps_next_ref;
+    pic_buf_t *ps_pic_buf = NULL;
+
+
+    ps_next_ref = ps_dpb_mgr->ps_dpb_head;
+    for(i = 0; i < ps_dpb_mgr->u1_num_ref_bufs; i++)
+    {
+        if(ps_next_ref->ps_pic_buf->i4_abs_poc == abs_poc)
+        {
+            ps_pic_buf = ps_next_ref->ps_pic_buf;
+            break;
+        }
+
+        ps_next_ref = ps_next_ref->ps_prev_dpb;
+    }
+
+    if(i == ps_dpb_mgr->u1_num_ref_bufs)
+    {
+        ps_pic_buf = NULL;
+    }
+
+    return ps_pic_buf;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Gets the reference buffer with a given POC LSB
+ *
+ * @par Description:
+ *  Returns the pointer to the picture buffer whose poc_lsb is equal to
+ *  poc_lsb
+ *
+ * @param[in] ps_dpb_mgr
+ *  Pointer to DPB Manager structure
+ *
+ * @param[in] poc_lsb
+ *  poc_lsb of the buffer to be returned
+ *
+ * @returns
+ *  Pointer to the matching picture buffer, NULL if none is found
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_poc_lsb(dpb_mgr_t *ps_dpb_mgr, WORD32 poc_lsb)
+{
+    pic_buf_t *ps_pic_buf = NULL;
+    UWORD32 i;
+    dpb_info_t *ps_next_ref;
+
+    ps_next_ref = ps_dpb_mgr->ps_dpb_head;
+    for(i = 0; i < ps_dpb_mgr->u1_num_ref_bufs; i++)
+    {
+        if(ps_next_ref->ps_pic_buf->i4_poc_lsb == poc_lsb)
+        {
+            ps_pic_buf = ps_next_ref->ps_pic_buf;
+            break;
+        }
+
+        ps_next_ref = ps_next_ref->ps_prev_dpb;
+    }
+
+    if(i == ps_dpb_mgr->u1_num_ref_bufs)
+    {
+        ps_pic_buf = NULL;
+    }
+
+    return ps_pic_buf;
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Resets the DPB manager
+ *
+ * @par Description:
+ *  Re-initialises the DPB manager structure
+ *
+ * @param[in] ps_dpb_mgr
+ *  Pointer to DPB Manager structure
+ *
+ * @param[in] ps_buf_mgr
+ *  Pointer to buffer manager structure
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+void ihevc_dpb_mgr_reset(dpb_mgr_t *ps_dpb_mgr, buf_mgr_t *ps_buf_mgr)
+{
+    int i;
+    dpb_info_t *ps_dpb_info;
+
+    ps_dpb_info = ps_dpb_mgr->as_dpb_info;
+
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        if((NULL != ps_dpb_info[i].ps_pic_buf) &&
+           (ps_dpb_info[i].ps_pic_buf->u1_used_as_ref))
+        {
+            ps_dpb_info[i].ps_pic_buf->u1_used_as_ref = UNUSED_FOR_REF;
+            ps_dpb_info[i].ps_prev_dpb = NULL;
+            //Release physical buffer
+            ihevc_buf_mgr_release(ps_buf_mgr, ps_dpb_info[i].ps_pic_buf->u1_buf_id,
+                                  BUF_MGR_REF);
+
+            ps_dpb_info[i].ps_pic_buf = NULL;
+        }
+    }
+    ps_dpb_mgr->u1_num_ref_bufs = 0;
+    ps_dpb_mgr->ps_dpb_head = NULL;
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  deletes all pictures from DPB
+ *
+ * @par Description:
+ *  Deletes all pictures present in the DPB manager
+ *
+ * @param[in] ps_buf_mgr
+ *  Pointer to buffer manager structure
+ *
+ * @param[in] u1_disp_bufs
+ *  Number of buffers to be deleted
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+void ihevc_dpb_mgr_release_pics(buf_mgr_t *ps_buf_mgr, UWORD8 u1_disp_bufs)
+{
+    WORD8 i;
+    UWORD32 buf_status;
+
+    for(i = 0; i < u1_disp_bufs; i++)
+    {
+        buf_status = ihevc_buf_mgr_get_status(ps_buf_mgr, i);
+        if(0 != buf_status)
+        {
+            ihevc_buf_mgr_release((buf_mgr_t *)ps_buf_mgr, i, BUF_MGR_REF);
+        }
+    }
+}
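+
+#if 0
+/* Usage sketch (not compiled into the library): typical short term reference
+ * maintenance. The pic_buf_t and buf_mgr_t objects are assumed to be set up
+ * elsewhere; the POC and buffer id below are hypothetical. */
+static void example_dpb_mgr_usage(dpb_mgr_t *ps_dpb_mgr,
+                                  buf_mgr_t *ps_buf_mgr,
+                                  pic_buf_t *ps_pic)
+{
+    pic_buf_t *ps_ref;
+
+    ihevc_dpb_mgr_init(ps_dpb_mgr);
+
+    /* Mark the newly decoded picture (POC 8, buffer id 0) as a short
+     * term reference */
+    ps_pic->i4_abs_poc = 8;
+    ihevc_dpb_mgr_insert_ref(ps_dpb_mgr, ps_pic, 0);
+
+    /* Look it up again, e.g. while constructing a reference list */
+    ps_ref = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, 8);
+
+    /* Drop it once it is no longer needed as reference; this also releases
+     * the BUF_MGR_REF hold on the physical buffer */
+    ihevc_dpb_mgr_del_ref(ps_dpb_mgr, ps_buf_mgr, 8);
+    (void)ps_ref;
+}
+#endif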
diff --git a/common/ihevc_dpb_mgr.h b/common/ihevc_dpb_mgr.h
new file mode 100644
index 0000000..bf60413
--- /dev/null
+++ b/common/ihevc_dpb_mgr.h
@@ -0,0 +1,109 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_dpb_mgr.h
+ *
+ * @brief
+ *  Function declarations used for decoded picture buffer management
+ *
+ * @author
+ *  Srinivas T
+ *
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#ifndef _DPB_MANAGER_H
+#define _DPB_MANAGER_H
+
+/* Temporary definitions; to be finalized later */
+
+#define MAX_DPB_BUFS                (MAX_DPB_SIZE * 4)
+
+#define MARK_ST_PICNUM_AS_NONREF    1
+#define MARK_LT_INDEX_AS_NONREF     2
+#define MARK_ST_PICNUM_AS_LT_INDEX  3
+#define RESET_REF_PICTURES          5
+
+typedef struct dpb_info_t dpb_info_t;
+
+enum
+{
+    UNUSED_FOR_REF = 0,
+    LONG_TERM_REF,
+    SHORT_TERM_REF,
+};
+struct dpb_info_t
+{
+    /**
+     * Pointer to picture buffer structure
+     */
+    pic_buf_t *ps_pic_buf;
+
+    /**
+     * Link to the DPB buffer with previous pic Num
+     */
+    dpb_info_t *ps_prev_dpb;
+
+};
+
+typedef struct
+{
+    /**
+     * Pointer to the most recent pic Num
+     */
+    dpb_info_t *ps_dpb_head;
+
+    /**
+     * Physical storage for dpbInfo for ref bufs
+     */
+    dpb_info_t as_dpb_info[MAX_DPB_BUFS];
+
+    /**
+     * Number of reference buffers
+     */
+    UWORD8 u1_num_ref_bufs;
+
+}dpb_mgr_t;
+
+void ihevc_dpb_mgr_init(dpb_mgr_t *ps_dpb_mgr);
+
+WORD32 ihevc_dpb_mgr_insert_ref(dpb_mgr_t *ps_dpb_mgr,
+                                pic_buf_t *ps_pic_buf,
+                                WORD32 buf_id);
+
+void ihevc_dpb_mgr_del_ref(dpb_mgr_t *ps_dpb_mgr,
+                           buf_mgr_t *ps_buf_mgr,
+                           WORD32 i4_abs_poc);
+
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_nearest_poc(dpb_mgr_t *ps_dpb_mgr, WORD32 cur_abs_poc);
+
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_poc(dpb_mgr_t *ps_dpb_mgr, WORD32 abs_poc);
+
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_poc_lsb(dpb_mgr_t *ps_dpb_mgr, WORD32 poc_lsb);
+
+void ihevc_dpb_mgr_reset(dpb_mgr_t *ps_dpb_mgr, buf_mgr_t *ps_buf_mgr);
+
+void ihevc_dpb_mgr_release_pics(buf_mgr_t *ps_buf_mgr, UWORD8 u1_disp_bufs);
+
+#endif /*  _DPB_MANAGER_H */
diff --git a/common/ihevc_error.h b/common/ihevc_error.h
new file mode 100644
index 0000000..38eeccd
--- /dev/null
+++ b/common/ihevc_error.h
@@ -0,0 +1,65 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_error.h
+*
+* @brief
+*  Definitions related to error handling for common modules
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVC_ERROR_H_
+#define _IHEVC_ERROR_H_
+
+/**
+ * Enumerations for error codes used in the codec.
+ * Not all these are expected to be returned to the application.
+ * Only select few will be exported
+ */
+typedef enum
+{
+    /**
+     *  No error
+     */
+    IHEVC_SUCCESS = 0,
+    /**
+     *  Start error code for decoder
+     */
+    IHEVC_DEC_ERROR_START = 0x100,
+
+    /**
+     *  Start error code for encoder
+     */
+    IHEVC_ENC_ERROR_START = 0x200,
+    /**
+     * Generic failure
+     */
+    IHEVC_FAIL                             = 0x7FFFFFFF
+}IHEVC_ERROR_T;
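+
+/* Illustrative sketch: module-specific codes are meant to be carved out of
+ * the per-module ranges above; the enum below is hypothetical.
+ *
+ *   typedef enum
+ *   {
+ *       EXAMPLE_DEC_BITSTREAM_ERROR = IHEVC_DEC_ERROR_START + 1,
+ *   } EXAMPLE_DEC_ERROR_T;
+ */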
+#endif /* _IHEVC_ERROR_H_ */
diff --git a/common/ihevc_func_types.h b/common/ihevc_func_types.h
new file mode 100644
index 0000000..f5a2c44
--- /dev/null
+++ b/common/ihevc_func_types.h
@@ -0,0 +1,69 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_func_types.h
+*
+* @brief
+*  Defines the different types of function implementations, e.g. C,
+*  Cortex-A8 intrinsics, Neon assembly etc
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_FUNC_TYPES_H_
+#define _IHEVC_FUNC_TYPES_H_
+
+
+/* C Model : No platform specific intrinsics or inline assemblies */
+#define    C            0
+
+/* Cortex Ax intrinsics */
+#define    CXAINTR      10
+
+/* Neon intrinsics */
+#define    NEONINTR     11
+
+/* X86 intrinsics */
+#define    X86INTR      12
+
+/* X64 intrinsics */
+#define    X64INTR      13
+
+/* Atom intrinsics */
+#define    ATOMINTR     14
+
+/* Cortex Ax assembly */
+#define    CXAASM       20
+
+/* Neon assembly */
+#define    NEONASM      21
+
+/* X86 assembly */
+#define    X86ASM       22
+
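+/* Illustrative sketch: these IDs are intended for build-time dispatch against
+ * a selector macro; the macro name below is hypothetical.
+ *
+ *   #define IHEVC_FUNC_TYPE NEONASM
+ *   #if (IHEVC_FUNC_TYPE == NEONASM)
+ *       // bind the Neon assembly (_a9q) variants
+ *   #endif
+ */
+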
+
+#endif /* _IHEVC_FUNC_TYPES_H_ */
diff --git a/common/ihevc_inter_pred.h b/common/ihevc_inter_pred.h
new file mode 100644
index 0000000..b6cca71
--- /dev/null
+++ b/common/ihevc_inter_pred.h
@@ -0,0 +1,403 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_inter_pred.h
+*
+* @brief
+*  Declarations for the functions defined in ihevc_inter_pred_filters.c
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_INTER_PRED_H_
+#define _IHEVC_INTER_PRED_H_
+
+#define NTAPS_LUMA 8
+#define NTAPS_CHROMA 4
+#define SHIFT_14_MINUS_BIT_DEPTH (14 - BIT_DEPTH)
+#define OFFSET_14_MINUS_BIT_DEPTH (1 << (SHIFT_14_MINUS_BIT_DEPTH - 1))
+#define OFFSET14 (1 << (14 - 1))
+#define FILTER_PREC 6
+
+#define REF_WIDTH 1280
+#define REF_HEIGHT 720
+
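+/* Worked example of the constants above, assuming BIT_DEPTH == 8:
+ * SHIFT_14_MINUS_BIT_DEPTH == 6 and OFFSET_14_MINUS_BIT_DEPTH == 32, the
+ * rounding term for that shift; OFFSET14 == 8192 rounds a downshift from the
+ * 14-bit intermediate domain; FILTER_PREC == 6 matches the HEVC filter taps,
+ * which sum to 64. */
+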
+/*****************************************************************************/
+/* Function Declarations                                                     */
+/*****************************************************************************/
+
+typedef void ihevc_inter_pred_ft(
+                UWORD8 *pu1_src,
+                UWORD8 *pu1_dst,
+                WORD32 src_strd,
+                WORD32 dst_strd,
+                WORD8 *pi1_coeff,
+                WORD32 ht,
+                WORD32 wd);
+
+typedef void ihevc_inter_pred_w16out_ft(
+                UWORD8 *pu1_src,
+                WORD16 *pi2_dst,
+                WORD32 src_strd,
+                WORD32 dst_strd,
+                WORD8 *pi1_coeff,
+                WORD32 ht,
+                WORD32 wd);
+
+typedef void ihevc_inter_pred_w16inp_ft(
+                WORD16 *pi2_src,
+                UWORD8 *pu1_dst,
+                WORD32 src_strd,
+                WORD32 dst_strd,
+                WORD8 *pi1_coeff,
+                WORD32 ht,
+                WORD32 wd);
+
+typedef void ihevc_inter_pred_w16inp_w16out_ft(
+                WORD16 *pi2_src,
+                WORD16 *pi2_dst,
+                WORD32 src_strd,
+                WORD32 dst_strd,
+                WORD8 *pi1_coeff,
+                WORD32 ht,
+                WORD32 wd);
+
+
+typedef void ihevc_hbd_inter_pred_ft(UWORD16 *pu2_src,
+                                     UWORD16 *pu2_dst,
+                                     WORD32 src_strd,
+                                     WORD32 dst_strd,
+                                     WORD8 *pi1_coeff,
+                                     WORD32 ht,
+                                     WORD32 wd,
+                                     UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_inter_pred_w16out_ft(
+                UWORD16 *pu2_src,
+                WORD16 *pi2_dst,
+                WORD32 src_strd,
+                WORD32 dst_strd,
+                WORD8 *pi1_coeff,
+                WORD32 ht,
+                WORD32 wd,
+                UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_inter_pred_w16inp_ft(
+                WORD16 *pi2_src,
+                UWORD16 *pu2_dst,
+                WORD32 src_strd,
+                WORD32 dst_strd,
+                WORD8 *pi1_coeff,
+                WORD32 ht,
+                WORD32 wd,
+                UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_inter_pred_w16inp_w16out_ft(
+                WORD16 *pi2_src,
+                WORD16 *pi2_dst,
+                WORD32 src_strd,
+                WORD32 dst_strd,
+                WORD8 *pi1_coeff,
+                WORD32 ht,
+                WORD32 wd,
+                UWORD8 bit_depth);
+
+typedef void ihevc_hbd_weighted_pred_uni_ft(
+                WORD16 *pi2_src,
+                UWORD16 *pu2_dst,
+                WORD32 src_strd,
+                WORD32 dst_strd,
+                WORD32 wgt0,
+                WORD32 off0,
+                WORD32 shift,
+                WORD32 lvl_shift,
+                WORD32 ht,
+                WORD32 wd,
+                UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_weighted_pred_bi_ft(
+                WORD16 *pi2_src1,
+                WORD16 *pi2_src2,
+                UWORD16 *pu2_dst,
+                WORD32 src_strd1,
+                WORD32 src_strd2,
+                WORD32 dst_strd,
+                WORD32 wgt0,
+                WORD32 off0,
+                WORD32 wgt1,
+                WORD32 off1,
+                WORD32 shift,
+                WORD32 lvl_shift1,
+                WORD32 lvl_shift2,
+                WORD32 ht,
+                WORD32 wd,
+                UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_weighted_pred_bi_default_ft(
+                WORD16 *pi2_src1,
+                WORD16 *pi2_src2,
+                UWORD16 *pu2_dst,
+                WORD32 src_strd1,
+                WORD32 src_strd2,
+                WORD32 dst_strd,
+                WORD32 lvl_shift1,
+                WORD32 lvl_shift2,
+                WORD32 ht,
+                WORD32 wd,
+                UWORD8 bit_depth);
+typedef void ihevc_hbd_weighted_pred_chroma_uni_ft(WORD16 *pi2_src,
+                                                   UWORD16 *pu2_dst,
+                                                   WORD32 src_strd,
+                                                   WORD32 dst_strd,
+                                                   WORD32 wgt0_cb,
+                                                   WORD32 wgt0_cr,
+                                                   WORD32 off0_cb,
+                                                   WORD32 off0_cr,
+                                                   WORD32 shift,
+                                                   WORD32 lvl_shift,
+                                                   WORD32 ht,
+                                                   WORD32 wd,
+                                                   UWORD8 bit_depth);
+
+typedef void ihevc_hbd_weighted_pred_chroma_bi_ft(WORD16 *pi2_src1,
+                                                  WORD16 *pi2_src2,
+                                                  UWORD16 *pu2_dst,
+                                                  WORD32 src_strd1,
+                                                  WORD32 src_strd2,
+                                                  WORD32 dst_strd,
+                                                  WORD32 wgt0_cb,
+                                                  WORD32 wgt0_cr,
+                                                  WORD32 off0_cb,
+                                                  WORD32 off0_cr,
+                                                  WORD32 wgt1_cb,
+                                                  WORD32 wgt1_cr,
+                                                  WORD32 off1_cb,
+                                                  WORD32 off1_cr,
+                                                  WORD32 shift,
+                                                  WORD32 lvl_shift1,
+                                                  WORD32 lvl_shift2,
+                                                  WORD32 ht,
+                                                  WORD32 wd,
+                                                  UWORD8 bit_depth);
+
+typedef void ihevc_hbd_weighted_pred_chroma_bi_default_ft(WORD16 *pi2_src1,
+                                                          WORD16 *pi2_src2,
+                                                          UWORD16 *pu2_dst,
+                                                          WORD32 src_strd1,
+                                                          WORD32 src_strd2,
+                                                          WORD32 dst_strd,
+                                                          WORD32 lvl_shift1,
+                                                          WORD32 lvl_shift2,
+                                                          WORD32 ht,
+                                                          WORD32 wd,
+                                                          UWORD8 bit_depth);
+/* C function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out;
+
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_copy;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_horz;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_vert;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_copy_w16out;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_horz_w16out;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_vert_w16out;
+ihevc_hbd_inter_pred_w16inp_ft ihevc_hbd_inter_pred_luma_vert_w16inp;
+ihevc_hbd_inter_pred_w16inp_w16out_ft ihevc_hbd_inter_pred_luma_vert_w16inp_w16out;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_copy;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_horz;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_vert;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_copy_w16out;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_horz_w16out;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_vert_w16out;
+ihevc_hbd_inter_pred_w16inp_ft ihevc_hbd_inter_pred_chroma_vert_w16inp;
+ihevc_hbd_inter_pred_w16inp_w16out_ft ihevc_hbd_inter_pred_chroma_vert_w16inp_w16out;
+ihevc_hbd_weighted_pred_uni_ft ihevc_hbd_weighted_pred_uni;
+ihevc_hbd_weighted_pred_bi_ft ihevc_hbd_weighted_pred_bi;
+ihevc_hbd_weighted_pred_bi_default_ft ihevc_hbd_weighted_pred_bi_default;
+ihevc_hbd_weighted_pred_chroma_uni_ft ihevc_hbd_weighted_pred_chroma_uni;
+ihevc_hbd_weighted_pred_chroma_bi_ft ihevc_hbd_weighted_pred_chroma_bi;
+ihevc_hbd_weighted_pred_chroma_bi_default_ft ihevc_hbd_weighted_pred_chroma_bi_default;
+
+/* A9 Q function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_a9q;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_a9q;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_a9q;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_a9q;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_a9q;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_a9q;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_a9q;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_a9q;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_a9q;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q;
+
+/* A9 A function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_a9a;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_a9a;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_a9a;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_a9a;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_a9a;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_a9a;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_a9a;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_a9a;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_a9a;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_a9a;
+
+/* NEONINTR function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_neonintr;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_neonintr;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_neonintr;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_neonintr;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_neonintr;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_neonintr;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_neonintr;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_neonintr;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_neonintr;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_ssse3;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_ssse3;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_ssse3;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_ssse3;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_ssse3;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_ssse3;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_ssse3;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_ssse3;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3;
+
+/* SSE42 function declarations */
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_sse42;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_sse42;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_sse42;
+
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_copy_sse42;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_horz_sse42;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_vert_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_copy_w16out_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_horz_w16out_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_vert_w16out_sse42;
+ihevc_hbd_inter_pred_w16inp_ft ihevc_hbd_inter_pred_luma_vert_w16inp_sse42;
+ihevc_hbd_inter_pred_w16inp_w16out_ft ihevc_hbd_inter_pred_luma_vert_w16inp_w16out_sse42;
+
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_copy_sse42;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_horz_sse42;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_vert_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_copy_w16out_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_horz_w16out_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_vert_w16out_sse42;
+ihevc_hbd_inter_pred_w16inp_ft ihevc_hbd_inter_pred_chroma_vert_w16inp_sse42;
+ihevc_hbd_inter_pred_w16inp_w16out_ft ihevc_hbd_inter_pred_chroma_vert_w16inp_w16out_sse42;
+
+ihevc_hbd_weighted_pred_uni_ft ihevc_hbd_weighted_pred_uni_sse42;
+ihevc_hbd_weighted_pred_bi_ft ihevc_hbd_weighted_pred_bi_sse42;
+ihevc_hbd_weighted_pred_bi_default_ft ihevc_hbd_weighted_pred_bi_default_sse42;
+ihevc_hbd_weighted_pred_chroma_uni_ft ihevc_hbd_weighted_pred_chroma_uni_sse42;
+ihevc_hbd_weighted_pred_chroma_bi_ft ihevc_hbd_weighted_pred_chroma_bi_sse42;
+ihevc_hbd_weighted_pred_chroma_bi_default_ft ihevc_hbd_weighted_pred_chroma_bi_default_sse42;
+
+#ifndef DISABLE_AVX2
+/* AVX2 function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_avx2;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_avx2;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_avx2;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_avx2;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_avx2;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_avx2;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_avx2;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_avx2;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_avx2;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_avx2;
+#endif
+
+/* armv8 function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_av8;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_av8;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_av8;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_av8;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_av8;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_av8;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_av8;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_av8;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_av8;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_av8;
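+
+/* Illustrative sketch: the _ft typedefs above are meant for binding an
+ * ISA-specific variant through a function pointer at init time; the binding
+ * below is hypothetical.
+ *
+ *   ihevc_inter_pred_ft *pf_luma_horz = ihevc_inter_pred_luma_horz;
+ *   // on a Neon-capable ARM build one could instead bind:
+ *   // pf_luma_horz = ihevc_inter_pred_luma_horz_a9q;
+ */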
+#endif /*_IHEVC_INTER_PRED_H_*/
diff --git a/common/ihevc_inter_pred_filters.c b/common/ihevc_inter_pred_filters.c
new file mode 100644
index 0000000..717bb53
--- /dev/null
+++ b/common/ihevc_inter_pred_filters.c
@@ -0,0 +1,1214 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_inter_pred_filters.c
+*
+* @brief
+*  Contains function definitions for inter prediction  interpolation filters
+*
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*  - ihevc_inter_pred_luma_copy()
+*  - ihevc_inter_pred_luma_horz()
+*  - ihevc_inter_pred_luma_vert()
+*  - ihevc_inter_pred_luma_copy_w16out()
+*  - ihevc_inter_pred_luma_horz_w16out()
+*  - ihevc_inter_pred_luma_vert_w16out()
+*  - ihevc_inter_pred_luma_vert_w16inp()
+*  - ihevc_inter_pred_luma_vert_w16inp_w16out()
+*  - ihevc_inter_pred_chroma_copy()
+*  - ihevc_inter_pred_chroma_horz()
+*  - ihevc_inter_pred_chroma_vert()
+*  - ihevc_inter_pred_chroma_copy_w16out()
+*  - ihevc_inter_pred_chroma_horz_w16out()
+*  - ihevc_inter_pred_chroma_vert_w16out()
+*  - ihevc_inter_pred_chroma_vert_w16inp()
+*  - ihevc_inter_pred_chroma_vert_w16inp_w16out()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+
+#include "ihevc_inter_pred.h"
+/*****************************************************************************/
+/* Function Definitions                                                      */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Interprediction luma function for copy
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the location pointed
+*    by 'pu1_src' to the location pointed by 'pu1_dst'
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_copy(UWORD8 *pu1_src,
+                                UWORD8 *pu1_dst,
+                                WORD32 src_strd,
+                                WORD32 dst_strd,
+                                WORD8 *pi1_coeff,
+                                WORD32 ht,
+                                WORD32 wd)
+{
+    WORD32 row, col;
+    UNUSED(pi1_coeff);
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            pu1_dst[col] = pu1_src[col];
+        }
+
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Interprediction luma filter for horizontal input
+*
+* @par Description:
+*    Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+*    to the elements pointed by 'pu1_src' and writes to the location pointed
+*    by 'pu1_dst'. The output is downshifted by 6 and clipped to 8 bits
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_luma_horz(UWORD8 *pu1_src,
+                                UWORD8 *pu1_dst,
+                                WORD32 src_strd,
+                                WORD32 dst_strd,
+                                WORD8 *pi1_coeff,
+                                WORD32 ht,
+                                WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD16 i2_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            i2_tmp = 0;
+            for(i = 0; i < NTAPS_LUMA; i++)
+                i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 3)];
+
+            i2_tmp = (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+            i2_tmp = CLIP_U8(i2_tmp);
+
+            pu1_dst[col] = (UWORD8)i2_tmp;
+        }
+
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+
+}
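+
+#if 0
+/* Usage sketch (not compiled into the library): filtering an 8x8 block at the
+ * half-sample horizontal position. The coefficients are the standard HEVC
+ * 8-tap half-sample filter; the taps sum to 64 == 1 << FILTER_PREC, so a flat
+ * input is reproduced exactly after the rounding shift. */
+static void example_luma_horz(UWORD8 *pu1_ref, UWORD8 *pu1_pred, WORD32 strd)
+{
+    WORD8 ai1_half_pel_coeff[NTAPS_LUMA] = { -1, 4, -11, 40, 40, -11, 4, -1 };
+
+    ihevc_inter_pred_luma_horz(pu1_ref, pu1_pred, strd, strd,
+                               ai1_half_pel_coeff, 8, 8);
+}
+#endif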
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Interprediction luma filter for vertical input
+*
+* @par Description:
+*   Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+*   the elements pointed by 'pu1_src' and writes to the location pointed by
+*   'pu1_dst'. The output is downshifted by 6 and clipped to 8 bits
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_vert(UWORD8 *pu1_src,
+                                UWORD8 *pu1_dst,
+                                WORD32 src_strd,
+                                WORD32 dst_strd,
+                                WORD8 *pi1_coeff,
+                                WORD32 ht,
+                                WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD16 i2_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            i2_tmp = 0;
+            for(i = 0; i < NTAPS_LUMA; i++)
+                i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
+
+            i2_tmp = (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+            i2_tmp = CLIP_U8(i2_tmp);
+
+            pu1_dst[col] = (UWORD8)i2_tmp;
+        }
+
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Interprediction luma filter for copy with 16bit output
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the  location pointed
+*    by 'src' to the location pointed by 'dst' The output is upshifted by 6
+*    bits and is used as input for vertical filtering or weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_copy_w16out(UWORD8 *pu1_src,
+                                       WORD16 *pi2_dst,
+                                       WORD32 src_strd,
+                                       WORD32 dst_strd,
+                                       WORD8 *pi1_coeff,
+                                       WORD32 ht,
+                                       WORD32 wd)
+{
+    WORD32 row, col;
+    UNUSED(pi1_coeff);
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH);
+        }
+
+        pu1_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+
+}
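+
+/*
+* Worked example (assuming 8-bit content, so SHIFT_14_MINUS_BIT_DEPTH is
+* 14 - 8 == 6): an input pel of 200 is stored as 200 << 6 == 12800, which
+* places the plain copy in the same 16-bit intermediate domain that the
+* filtered w16out paths produce.
+*/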
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Interprediction luma filter for horizontal 16bit output
+*
+* @par Description:
+*    Applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
+*    to the elements pointed by 'pu1_src' and  writes to the location pointed
+*    by 'pi2_dst'  No downshifting or clipping is done and the output is  used
+*    as an input for vertical filtering or weighted  prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_horz_w16out(UWORD8 *pu1_src,
+                                       WORD16 *pi2_dst,
+                                       WORD32 src_strd,
+                                       WORD32 dst_strd,
+                                       WORD8 *pi1_coeff,
+                                       WORD32 ht,
+                                       WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD16 i2_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            i2_tmp = 0;
+            for(i = 0; i < NTAPS_LUMA; i++)
+                i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 3)];
+
+            pi2_dst[col] = i2_tmp;
+        }
+
+        pu1_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*      Interprediction luma filter for vertical 16bit output
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
+*    the elements pointed by 'pu1_src' and  writes to the location pointed by
+*    'pi2_dst'  No downshifting or clipping is done and the output is  used as
+*    an input for weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_vert_w16out(UWORD8 *pu1_src,
+                                       WORD16 *pi2_dst,
+                                       WORD32 src_strd,
+                                       WORD32 dst_strd,
+                                       WORD8 *pi1_coeff,
+                                       WORD32 ht,
+                                       WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD16 i2_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            i2_tmp = 0;
+            for(i = 0; i < NTAPS_LUMA; i++)
+                i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
+
+            pi2_dst[col] = i2_tmp;
+        }
+
+        pu1_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*
+*        Luma vertical filter for 16bit input.
+*
+* @par Description:
+*   Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
+*   the elements pointed by 'pi2_src' and  writes to the location pointed by
+*   'pu1_dst'  Input is 16 bits  The filter output is downshifted by 12 and
+*   clipped to lie  between 0 and 255
+*
+* @param[in] pi2_src
+*  WORD16 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_luma_vert_w16inp(WORD16 *pi2_src,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 src_strd,
+                                       WORD32 dst_strd,
+                                       WORD8 *pi1_coeff,
+                                       WORD32 ht,
+                                       WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD32 i4_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            i4_tmp = 0;
+            for(i = 0; i < NTAPS_LUMA; i++)
+                i4_tmp += pi1_coeff[i] * pi2_src[col + (i - 3) * src_strd];
+
+            i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+            i4_tmp = CLIP_U8(i4_tmp);
+
+            pu1_dst[col] = i4_tmp;
+        }
+
+        pi2_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+
+}
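+
+/*
+* Two-stage 2D interpolation sketch (illustrative, not part of the library):
+* for a fractional (x, y) position the horizontal pass keeps full 16-bit
+* precision via ihevc_inter_pred_luma_horz_w16out() and the w16inp vertical
+* pass above applies the combined downshift of 12 (twice
+* SHIFT_14_MINUS_BIT_DEPTH for 8-bit content). The scratch buffer carries
+* NTAPS_LUMA - 1 extra rows so that the vertical taps at rows -3..+4 are
+* available; an 8x8 block is assumed below.
+*/
+#if 0
+static void example_luma_2d(UWORD8 *pu1_ref, WORD32 ref_strd,
+                            UWORD8 *pu1_out, WORD32 out_strd,
+                            WORD8 *pi1_coeff_horz, WORD8 *pi1_coeff_vert)
+{
+    WORD16 ai2_scratch[(8 + NTAPS_LUMA - 1) * 8];
+
+    /* Horizontal pass over 8 + 7 rows, starting 3 rows above the block */
+    ihevc_inter_pred_luma_horz_w16out(pu1_ref - 3 * ref_strd, ai2_scratch,
+                                      ref_strd, 8, pi1_coeff_horz,
+                                      8 + NTAPS_LUMA - 1, 8);
+
+    /* Vertical pass reads rows -3..+4 around each output row */
+    ihevc_inter_pred_luma_vert_w16inp(ai2_scratch + 3 * 8, pu1_out,
+                                      8, out_strd, pi1_coeff_vert, 8, 8);
+}
+#endif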
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*      Luma prediction filter for vertical 16bit input & output
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
+*    the elements pointed by 'pi2_src' and  writes to the location pointed by
+*    'pi2_dst'  Input is 16 bits  The filter output is downshifted by 6 and
+*    8192 is  subtracted to store it as a 16 bit number  The output is used as
+*    an input to weighted prediction
+*
+* @param[in] pi2_src
+*  WORD16 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_vert_w16inp_w16out(WORD16 *pi2_src,
+                                              WORD16 *pi2_dst,
+                                              WORD32 src_strd,
+                                              WORD32 dst_strd,
+                                              WORD8 *pi1_coeff,
+                                              WORD32 ht,
+                                              WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD32 i4_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            i4_tmp = 0;
+            for(i = 0; i < NTAPS_LUMA; i++)
+                i4_tmp += pi1_coeff[i] * pi2_src[col + (i - 3) * src_strd];
+
+            i4_tmp = (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14;
+
+            pi2_dst[col] = i4_tmp;
+        }
+
+        pi2_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+
+}
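+
+/*
+* Range note (assuming OFFSET14 == 8192 == 1 << 13, matching the "8192 is
+* subtracted" description above): after the downshift by 6 the two-stage sum
+* is nominally in [0, 16383] for 8-bit content, so subtracting 8192 recenters
+* it around zero and keeps it within the signed WORD16 range expected by
+* weighted prediction.
+*/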
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*      Chroma interprediction filter for copy
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the  location pointed
+*    by 'src' to the location pointed by 'dst'
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_copy(UWORD8 *pu1_src,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 src_strd,
+                                  WORD32 dst_strd,
+                                  WORD8 *pi1_coeff,
+                                  WORD32 ht,
+                                  WORD32 wd)
+{
+    WORD32 row, col;
+    UNUSED(pi1_coeff);
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col++)
+        {
+            pu1_dst[col] = pu1_src[col];
+        }
+
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Chroma interprediction filter for horizontal input
+*
+* @par Description:
+*    Applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
+*    to the elements pointed by 'pu1_src' and  writes to the location pointed
+*    by 'pu1_dst'  The output is downshifted by 6 and clipped to 8 bits
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_horz(UWORD8 *pu1_src,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 src_strd,
+                                  WORD32 dst_strd,
+                                  WORD8 *pi1_coeff,
+                                  WORD32 ht,
+                                  WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD16 i2_tmp_u, i2_tmp_v;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col += 2)
+        {
+            i2_tmp_u = 0;
+            i2_tmp_v = 0;
+            for(i = 0; i < NTAPS_CHROMA; i++)
+            {
+                i2_tmp_u += pi1_coeff[i] * pu1_src[col + (i - 1) * 2];
+                i2_tmp_v += pi1_coeff[i] * pu1_src[col + 1 + (i - 1) * 2];
+            }
+
+            i2_tmp_u = (i2_tmp_u + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+            i2_tmp_u = CLIP_U8(i2_tmp_u);
+            i2_tmp_v = (i2_tmp_v + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+            i2_tmp_v = CLIP_U8(i2_tmp_v);
+
+
+            pu1_dst[col] = (UWORD8)i2_tmp_u;
+            pu1_dst[col + 1] = (UWORD8)i2_tmp_v;
+        }
+
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
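+
+/*
+* Usage sketch (illustrative, not part of the library): the chroma kernels
+* expect semi-planar data, i.e. Cb and Cr interleaved as CbCrCbCr..., so 'wd'
+* counts pels per chroma plane while each row walks 2 * wd bytes. The 4-tap
+* half-sample coefficients from the HEVC spec are { -4, 36, 36, -4 }.
+*/
+#if 0
+static void example_chroma_horz(UWORD8 *pu1_ref, WORD32 ref_strd,
+                                UWORD8 *pu1_out, WORD32 out_strd)
+{
+    WORD8 ai1_hpel_coeff[NTAPS_CHROMA] = { -4, 36, 36, -4 };
+
+    /* Filter a 4x4 block of each chroma plane (8 interleaved bytes per row) */
+    ihevc_inter_pred_chroma_horz(pu1_ref, pu1_out, ref_strd, out_strd,
+                                 ai1_hpel_coeff, 4, 4);
+}
+#endif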
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Chroma interprediction filter for vertical input
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
+*    the elements pointed by 'pu1_src' and  writes to the location pointed by
+*    'pu1_dst'  The output is downshifted by 6 and clipped to 8 bits
+*
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_vert(UWORD8 *pu1_src,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 src_strd,
+                                  WORD32 dst_strd,
+                                  WORD8 *pi1_coeff,
+                                  WORD32 ht,
+                                  WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD16 i2_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col++)
+        {
+            i2_tmp = 0;
+            for(i = 0; i < NTAPS_CHROMA; i++)
+            {
+                i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 1) * src_strd];
+            }
+
+            i2_tmp = (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+            i2_tmp = CLIP_U8(i2_tmp);
+
+            pu1_dst[col] = (UWORD8)i2_tmp;
+        }
+
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Chroma interprediction filter for copy with 16bit output
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the  location pointed
+*    by 'src' to the location pointed by 'dst' The output is upshifted by 6
+*    bits and is used as input for vertical filtering or weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_copy_w16out(UWORD8 *pu1_src,
+                                         WORD16 *pi2_dst,
+                                         WORD32 src_strd,
+                                         WORD32 dst_strd,
+                                         WORD8 *pi1_coeff,
+                                         WORD32 ht,
+                                         WORD32 wd)
+{
+    WORD32 row, col;
+    UNUSED(pi1_coeff);
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col++)
+        {
+            pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH);
+        }
+
+        pu1_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Chroma interprediction filter to store horizontal 16bit output
+*
+* @par Description:
+*    Applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
+*    to the elements pointed by 'pu1_src' and  writes to the location pointed
+*    by 'pi2_dst'  No downshifting or clipping is done and the output is  used
+*    as an input for vertical filtering or weighted  prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_horz_w16out(UWORD8 *pu1_src,
+                                         WORD16 *pi2_dst,
+                                         WORD32 src_strd,
+                                         WORD32 dst_strd,
+                                         WORD8 *pi1_coeff,
+                                         WORD32 ht,
+                                         WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD16 i2_tmp_u, i2_tmp_v;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col += 2)
+        {
+            i2_tmp_u = 0;
+            i2_tmp_v = 0;
+            for(i = 0; i < NTAPS_CHROMA; i++)
+            {
+                i2_tmp_u += pi1_coeff[i] * pu1_src[col + (i - 1) * 2];
+                i2_tmp_v += pi1_coeff[i] * pu1_src[col + 1 + (i - 1) * 2];
+            }
+
+            pi2_dst[col] = i2_tmp_u;
+            pi2_dst[col + 1] = i2_tmp_v;
+        }
+
+        pu1_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Interprediction chroma filter to store vertical 16bit output
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
+*    the elements pointed by 'pu1_src' and  writes to the location pointed by
+*    'pi2_dst'  No downshifting or clipping is done and the output is  used as
+*    an input for weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_vert_w16out(UWORD8 *pu1_src,
+                                         WORD16 *pi2_dst,
+                                         WORD32 src_strd,
+                                         WORD32 dst_strd,
+                                         WORD8 *pi1_coeff,
+                                         WORD32 ht,
+                                         WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD16 i2_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col++)
+        {
+            i2_tmp = 0;
+            for(i = 0; i < NTAPS_CHROMA; i++)
+            {
+                i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 1) * src_strd];
+            }
+
+            pi2_dst[col] = i2_tmp;
+        }
+
+        pu1_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Chroma interprediction filter for vertical 16bit input
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
+*    the elements pointed by 'pi2_src' and  writes to the location pointed by
+*    'pu1_dst'  Input is 16 bits  The filter output is downshifted by 12 and
+*    clipped to lie  between 0 and 255
+*
+* @param[in] pi2_src
+*  WORD16 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_chroma_vert_w16inp(WORD16 *pi2_src,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 dst_strd,
+                                         WORD8 *pi1_coeff,
+                                         WORD32 ht,
+                                         WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD32 i4_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col++)
+        {
+            i4_tmp = 0;
+            for(i = 0; i < NTAPS_CHROMA; i++)
+            {
+                i4_tmp += pi1_coeff[i] * pi2_src[col + (i - 1) * src_strd];
+            }
+
+            i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+            i4_tmp = CLIP_U8(i4_tmp);
+
+            pu1_dst[col] = i4_tmp;
+        }
+
+        pi2_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*
+*      Chroma interprediction filter for 16bit vertical input and output.
+*
+* @par Description:
+*       Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
+*       the elements pointed by 'pi2_src' and  writes to the location pointed by
+*       'pi2_dst'  Input is 16 bits  The filter output is downshifted by 6 and
+*       stored as a 16 bit number (unlike the luma variant, no 8192 offset is
+*       subtracted here)  The output is used as an input to weighted prediction
+*
+* @param[in] pi2_src
+*  WORD16 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_chroma_vert_w16inp_w16out(WORD16 *pi2_src,
+                                                WORD16 *pi2_dst,
+                                                WORD32 src_strd,
+                                                WORD32 dst_strd,
+                                                WORD8 *pi1_coeff,
+                                                WORD32 ht,
+                                                WORD32 wd)
+{
+    WORD32 row, col, i;
+    WORD32 i4_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col++)
+        {
+            i4_tmp = 0;
+            for(i = 0; i < NTAPS_CHROMA; i++)
+            {
+                i4_tmp += pi1_coeff[i] * pi2_src[col + (i - 1) * src_strd];
+            }
+
+            i4_tmp = (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH);
+
+            pi2_dst[col] = i4_tmp;
+        }
+
+        pi2_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+
+}
+
+
diff --git a/common/ihevc_intra_pred.h b/common/ihevc_intra_pred.h
new file mode 100644
index 0000000..a29e99d
--- /dev/null
+++ b/common/ihevc_intra_pred.h
@@ -0,0 +1,410 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_intra_pred.h
+*
+* @brief
+*  Declarations for the functions defined in ihevc_intra_pred_filters
+*
+* @author
+*  Mamatha
+*
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVC_INTRA_PRED_H_
+#define IHEVC_INTRA_PRED_H_
+
+
+/*****************************************************************************/
+/* Macro definitions                                                         */
+/*****************************************************************************/
+#define look_up_trailing_zeros(x) (0 == (x) ? 8 : CTZ(x))
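+
+/*
+* Example (assuming CTZ(x) counts trailing zero bits): for a 4-bit
+* availability mask, look_up_trailing_zeros(0x4) == 2 means the two lowest
+* neighbor units are unavailable, while look_up_trailing_zeros(0) == 8 is a
+* sentinel larger than the 4 mask bits, which callers clamp after scaling.
+*/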
+
+/*****************************************************************************/
+/* Function Declarations                                                     */
+/*****************************************************************************/
+typedef void ihevc_intra_pred_luma_planar_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+
+typedef void ihevc_intra_pred_luma_dc_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_luma_horz_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_ver_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode2_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode_18_34_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode_3_to_9_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode_11_to_17_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode_19_to_25_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode_27_to_33_ft(
+                UWORD8 *pu1_ref,
+                WORD32 src_strd,
+                UWORD8 *pu1_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode);
+
+typedef void ihevc_intra_pred_luma_ref_substitution_ft(UWORD8 *pu1_top_left,
+                                                       UWORD8 *pu1_top,
+                                                       UWORD8 *pu1_left,
+                                                       WORD32 src_strd,
+                                                       WORD32 nt,
+                                                       WORD32 nbr_flags,
+                                                       UWORD8 *pu1_dst,
+                                                       WORD32 dst_strd);
+
+
+typedef void ihevc_intra_pred_luma_ref_subst_all_avlble_ft(UWORD8 *pu1_top_left,
+                                                           UWORD8 *pu1_top,
+                                                           UWORD8 *pu1_left,
+                                                           WORD32 src_strd,
+                                                           WORD32 nt,
+                                                           WORD32 nbr_flags,
+                                                           UWORD8 *pu1_dst,
+                                                           WORD32 dst_strd);
+
+typedef void ihevc_intra_pred_ref_filtering_ft(UWORD8 *pu1_src,
+                                               WORD32 nt,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 mode,
+                                               WORD32 strong_intra_smoothing_enable_flag);
+
+typedef void ihevc_hbd_intra_pred_luma_planar_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode,
+                UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_dc_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode,
+                UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_horz_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode,
+                UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_ver_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode,
+                UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_mode2_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode,
+                UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_mode_18_34_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode,
+                UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_mode_3_to_9_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode,
+                UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_mode_11_to_17_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode,
+                UWORD8 bit_depth);
+
+typedef void ihevc_hbd_intra_pred_luma_mode_19_to_25_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode,
+                UWORD8 bit_depth);
+
+
+
+typedef void ihevc_hbd_intra_pred_luma_mode_27_to_33_ft(
+                UWORD16 *pu2_ref,
+                WORD32 src_strd,
+                UWORD16 *pu2_dst,
+                WORD32 dst_strd,
+                WORD32 nt,
+                WORD32 mode,
+                UWORD8 bit_depth);
+
+typedef void ihevc_hbd_intra_pred_luma_ref_substitution_ft(UWORD16 *pu2_top_left,
+                                                           UWORD16 *pu2_top,
+                                                           UWORD16 *pu2_left,
+                                                           WORD32 src_strd,
+                                                           WORD32 nt,
+                                                           WORD32 nbr_flags,
+                                                           UWORD16 *pu2_dst,
+                                                           WORD32 dst_strd,
+                                                           UWORD8 bit_depth);
+
+
+
+typedef void ihevc_hbd_intra_pred_ref_filtering_ft(UWORD16 *pu2_src,
+                                                   WORD32 nt,
+                                                   UWORD16 *pu2_dst,
+                                                   WORD32 mode,
+                                                   WORD32 strong_intra_smoothing_enable_flag,
+                                                   UWORD8 bit_depth);
+
+/* C function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution;
+ihevc_intra_pred_luma_ref_subst_all_avlble_ft ihevc_intra_pred_luma_ref_subst_all_avlble;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering;
+
+ihevc_hbd_intra_pred_luma_planar_ft ihevc_hbd_intra_pred_luma_planar;
+ihevc_hbd_intra_pred_luma_dc_ft ihevc_hbd_intra_pred_luma_dc;
+ihevc_hbd_intra_pred_luma_horz_ft ihevc_hbd_intra_pred_luma_horz;
+ihevc_hbd_intra_pred_luma_ver_ft ihevc_hbd_intra_pred_luma_ver;
+ihevc_hbd_intra_pred_luma_mode2_ft ihevc_hbd_intra_pred_luma_mode2;
+ihevc_hbd_intra_pred_luma_mode_18_34_ft ihevc_hbd_intra_pred_luma_mode_18_34;
+ihevc_hbd_intra_pred_luma_mode_3_to_9_ft ihevc_hbd_intra_pred_luma_mode_3_to_9;
+ihevc_hbd_intra_pred_luma_mode_11_to_17_ft ihevc_hbd_intra_pred_luma_mode_11_to_17;
+ihevc_hbd_intra_pred_luma_mode_19_to_25_ft ihevc_hbd_intra_pred_luma_mode_19_to_25;
+ihevc_hbd_intra_pred_luma_mode_27_to_33_ft ihevc_hbd_intra_pred_luma_mode_27_to_33;
+ihevc_hbd_intra_pred_luma_ref_substitution_ft ihevc_hbd_intra_pred_luma_ref_substitution;
+ihevc_hbd_intra_pred_ref_filtering_ft ihevc_hbd_intra_pred_ref_filtering;
+
+
+/* A9Q function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar_a9q;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_a9q;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_a9q;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_a9q;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2_a9q;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_a9q;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_a9q;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_a9q;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_a9q;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_a9q;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution_a9q;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_a9q;
+
+/* A9 A function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar_a9a;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_a9a;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_a9a;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_a9a;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2_a9a;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_a9a;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_a9a;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_a9a;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_a9a;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_a9a;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution_a9a;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_a9a;
+
+/* NEONINTR function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar_neonintr;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_neonintr;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_neonintr;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_neonintr;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2_neonintr;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_neonintr;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_neonintr;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_neonintr;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_neonintr;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_neonintr;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution_neonintr;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar_ssse3;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_ssse3;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_ssse3;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_ssse3;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2_ssse3;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_ssse3;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_ssse3;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_ssse3;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_ssse3;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_ssse3;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution_ssse3;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_ssse3;
+
+/* SSE42 function declarations */
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_sse42;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_sse42;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_sse42;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_sse42;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_sse42;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_sse42;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_sse42;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_sse42;
+ihevc_hbd_intra_pred_luma_planar_ft ihevc_hbd_intra_pred_luma_planar_sse42;
+ihevc_hbd_intra_pred_luma_dc_ft ihevc_hbd_intra_pred_luma_dc_sse42;
+ihevc_hbd_intra_pred_luma_horz_ft ihevc_hbd_intra_pred_luma_horz_sse42;
+ihevc_hbd_intra_pred_luma_ver_ft ihevc_hbd_intra_pred_luma_ver_sse42;
+ihevc_hbd_intra_pred_luma_mode2_ft ihevc_hbd_intra_pred_luma_mode2_sse42;
+ihevc_hbd_intra_pred_luma_mode_18_34_ft ihevc_hbd_intra_pred_luma_mode_18_34_sse42;
+ihevc_hbd_intra_pred_luma_mode_3_to_9_ft ihevc_hbd_intra_pred_luma_mode_3_to_9_sse42;
+ihevc_hbd_intra_pred_luma_mode_11_to_17_ft ihevc_hbd_intra_pred_luma_mode_11_to_17_sse42;
+ihevc_hbd_intra_pred_luma_mode_19_to_25_ft ihevc_hbd_intra_pred_luma_mode_19_to_25_sse42;
+ihevc_hbd_intra_pred_luma_mode_27_to_33_ft ihevc_hbd_intra_pred_luma_mode_27_to_33_sse42;
+ihevc_hbd_intra_pred_luma_ref_substitution_ft ihevc_hbd_intra_pred_luma_ref_substitution_sse42;
+ihevc_hbd_intra_pred_ref_filtering_ft ihevc_hbd_intra_pred_ref_filtering_sse42;
+
+/* AVX function declaration*/
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_avx;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_avx;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_avx;
+
+ihevc_hbd_intra_pred_luma_dc_ft ihevc_hbd_intra_pred_luma_dc_avx;
+ihevc_hbd_intra_pred_luma_mode_18_34_ft ihevc_hbd_intra_pred_luma_mode_18_34_avx;
+ihevc_hbd_intra_pred_luma_ver_ft ihevc_hbd_intra_pred_luma_ver_avx;
+ihevc_hbd_intra_pred_ref_filtering_ft ihevc_hbd_intra_pred_ref_filtering_avx;
+
+/* armv8 function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar_av8;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_av8;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_av8;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_av8;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2_av8;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_av8;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_av8;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_av8;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_av8;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_av8;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution_av8;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_av8;
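+
+/*
+* Dispatch sketch (illustrative only; the struct and init routine below are
+* hypothetical, not part of this header): every architecture-specific variant
+* shares the generic typedef of its kernel, so a decoder can keep one function
+* pointer per kernel and bind it once at init time.
+*/
+#if 0
+typedef struct
+{
+    ihevc_intra_pred_luma_dc_ft *pf_intra_pred_luma_dc;
+    ihevc_intra_pred_ref_filtering_ft *pf_intra_pred_ref_filtering;
+} example_intra_fxns_t;
+
+static void example_bind_intra_fxns(example_intra_fxns_t *ps_fxns, WORD32 has_neon)
+{
+    /* Fall back to the plain C kernels when NEON is absent */
+    ps_fxns->pf_intra_pred_luma_dc = has_neon ?
+                    ihevc_intra_pred_luma_dc_a9q : ihevc_intra_pred_luma_dc;
+    ps_fxns->pf_intra_pred_ref_filtering = has_neon ?
+                    ihevc_intra_pred_ref_filtering_a9q : ihevc_intra_pred_ref_filtering;
+}
+#endif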
+#endif /* IHEVC_INTRA_PRED_H_ */
diff --git a/common/ihevc_intra_pred_filters.c b/common/ihevc_intra_pred_filters.c
new file mode 100644
index 0000000..d6bc2ab
--- /dev/null
+++ b/common/ihevc_intra_pred_filters.c
@@ -0,0 +1,1553 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_intra_pred_filters.c
+*
+* @brief
+*  Contains function Definition for intra prediction  interpolation filters
+*
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*  - ihevc_intra_pred_luma_planar()
+*  - ihevc_intra_pred_luma_dc()
+*  - ihevc_intra_pred_luma_horz()
+*  - ihevc_intra_pred_luma_ver()
+*  - ihevc_intra_pred_luma_mode2()
+*  - ihevc_intra_pred_luma_mode_18_34()
+*  - ihevc_intra_pred_luma_mode_3_to_9()
+*  - ihevc_intra_pred_luma_mode_11_to_17()
+*  - ihevc_intra_pred_luma_mode_19_to_25()
+*  - ihevc_intra_pred_luma_mode_27_to_33()
+*  - ihevc_intra_pred_luma_ref_substitution()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+#include <assert.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_defs.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_debug.h"
+
+/****************************************************************************/
+/* Constant Macros                                                          */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+
+
+/****************************************************************************/
+/* Function Macros                                                          */
+/****************************************************************************/
+#define GET_BITS(y,x) (((y) & (1 << (x))) && (1 << (x)))
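+/*
+* Example: GET_BITS(0x5, 2) tests bit 2 of binary 101 and yields a non-zero
+* value, while GET_BITS(0x5, 1) yields 0; the kernels below only compare the
+* result against zero.
+*/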
+
+/*****************************************************************************/
+/* global tables Definition                                                  */
+/*****************************************************************************/
+
+
+/*****************************************************************************/
+/* Function Definition                                                      */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for pu1_ref substitution
+*
+*
+* @par Description:
+*    Reference substitution process for samples unavailable  for prediction
+*    Refer to section 8.4.4.2.2
+*
+* @param[in] pu1_top_left
+*  UWORD8 pointer to the top-left
+*
+* @param[in] pu1_top
+*  UWORD8 pointer to the top
+*
+* @param[in] pu1_left
+*  UWORD8 pointer to the left
+*
+* @param[in] src_strd
+*  WORD32 Source stride
+*
+* @param[in] nbr_flags
+*  WORD32 neighbor availability flags
+*
+* @param[in] nt
+*  WORD32 transform Block size
+*
+* @param[in] dst_strd
+*  WORD32 Destination stride
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_intra_pred_luma_ref_subst_all_avlble(UWORD8 *pu1_top_left,
+                                                UWORD8 *pu1_top,
+                                                UWORD8 *pu1_left,
+                                                WORD32 src_strd,
+                                                WORD32 nt,
+                                                WORD32 nbr_flags,
+                                                UWORD8 *pu1_dst,
+                                                WORD32 dst_strd)
+{
+
+    WORD32 i;
+    WORD32 two_nt = 2 * nt;
+    UNUSED(nbr_flags);
+    UNUSED(dst_strd);
+
+    /* Neighbor Flag Structure*/
+    /* MSB ---> LSB */
+    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
+              1         4         4     4         4
+     */
+    ASSERT((nbr_flags == 0x11188) || (nbr_flags == 0x133CC) || (nbr_flags == 0x1FFFF));
+    {
+
+        if(nt == 4)
+        {
+            /* All neighbors are available, so fill the samples directly */
+            pu1_dst[two_nt] = *pu1_top_left;
+            //if(left)
+            {
+                for(i = 0; i < nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+//            if(bot_left)
+            {
+                for(i = nt; i < two_nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+//            if(top)
+            {
+                ihevc_memcpy(&pu1_dst[two_nt + 1], pu1_top, nt);
+            }
+//            if(tp_right)
+            {
+                ihevc_memcpy(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+            }
+
+
+        }
+        else
+
+        {
+
+            /* Fill the corresponding samples */
+            ASSERT((nt == 8) || (nt == 16) || (nt == 32));
+            pu1_dst[two_nt] = *pu1_top_left;
+
+            for(i = 0; i < nt; i++)
+                pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+
+            for(i = nt; i < two_nt; i++)
+                pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+
+            ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1], pu1_top, nt);
+
+            ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+        }
+
+    }
+}
+
+
+void ihevc_intra_pred_luma_ref_substitution(UWORD8 *pu1_top_left,
+                                            UWORD8 *pu1_top,
+                                            UWORD8 *pu1_left,
+                                            WORD32 src_strd,
+                                            WORD32 nt,
+                                            WORD32 nbr_flags,
+                                            UWORD8 *pu1_dst,
+                                            WORD32 dst_strd)
+{
+    UWORD8 pu1_ref;
+    WORD32 dc_val, i;
+    WORD32 total_samples = (4 * nt) + 1;
+    WORD32 two_nt = 2 * nt;
+
+    WORD32 three_nt = 3 * nt;
+    WORD32 get_bits;
+    WORD32 next;
+    WORD32 bot_left, left, top, tp_right, tp_left;
+
+    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+    UNUSED(dst_strd);
+    /*dc_val = 1 << (BIT_DEPTH - 1);*/
+    dc_val = 1 << (8 - 1);
+
+
+    /* Neighbor Flag Structure*/
+    /* MSB ---> LSB */
+    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
+              1         4         4     4         4
+     */
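+    /* Worked example: nbr_flags == 0x11188, the all-available pattern for
+       nt == 4, sets bit 16 (top-left), bit 12 (top-right), bit 8 (top),
+       bit 7 (left) and bit 3 (bottom-left), exactly the bits extracted
+       for nt <= 8 below */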
+    /* If no neighbor flags are present, fill the neighbor samples with DC value */
+    if(nbr_flags == 0)
+    {
+        for(i = 0; i < total_samples; i++)
+        {
+            pu1_dst[i] = dc_val;
+        }
+    }
+    else
+    {
+        if(nt <= 8)
+        {
+            /* 1 bit extraction for all the neighboring blocks */
+            tp_left = (nbr_flags & 0x10000) >> 16;
+            bot_left = (nbr_flags & 0x8) >> 3;
+            left = (nbr_flags & 0x80) >> 7;
+            top = (nbr_flags & 0x100) >> 8;
+            tp_right = (nbr_flags & 0x1000) >> 12;
+
+            /* Else fill the corresponding samples */
+            if(tp_left)
+                pu1_dst[two_nt] = *pu1_top_left;
+            else
+                pu1_dst[two_nt] = 0;
+
+
+            if(left)
+            {
+                for(i = 0; i < nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+            }
+
+
+            if(bot_left)
+            {
+                for(i = nt; i < two_nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+            }
+
+
+            if(top)
+            {
+                ihevc_memcpy(&pu1_dst[two_nt + 1], pu1_top, nt);
+            }
+            else
+            {
+                ihevc_memset(&pu1_dst[two_nt + 1], 0, nt);
+            }
+
+            if(tp_right)
+            {
+                ihevc_memcpy(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+            }
+            else
+            {
+                ihevc_memset(&pu1_dst[two_nt + 1 + nt], 0, nt);
+            }
+            next = 1;
+
+            /* If bottom-left is not available, do the reverse substitution process */
+            if(bot_left == 0)
+            {
+                WORD32 a_nbr_flag[5];
+                a_nbr_flag[0] = bot_left;
+                a_nbr_flag[1] = left;
+                a_nbr_flag[2] = tp_left;
+                a_nbr_flag[3] = top;
+                a_nbr_flag[4] = tp_right;
+
+                /* Check for the 1st available sample from bottom-left*/
+                while(!a_nbr_flag[next])
+                    next++;
+
+                /* If Left, top-left are available*/
+                if(next <= 2)
+                {
+                    idx = nt * next;
+                    pu1_ref = pu1_dst[idx];
+                    for(i = 0; i < idx; i++)
+                        pu1_dst[i] = pu1_ref;
+                }
+                else /* If top, top-right are available */
+                {
+                    /* idx is changed to copy 1 pixel value for top-left, if top-left is not available */
+                    idx = (nt * (next - 1)) + 1;
+                    pu1_ref = pu1_dst[idx];
+                    for(i = 0; i < idx; i++)
+                        pu1_dst[i] = pu1_ref;
+                }
+            }
+
+            /* Forward Substitution Process */
+            /* If left is Unavailable, copy the last bottom-left value */
+            if(left == 0)
+            {
+                ihevc_memset(&pu1_dst[nt], pu1_dst[nt - 1], nt);
+
+            }
+            /* If top-left is Unavailable, copy the last left value */
+            if(tp_left == 0)
+                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
+            /* If top is Unavailable, copy the last top-left value */
+            if(top == 0)
+            {
+                ihevc_memset(&pu1_dst[two_nt + 1], pu1_dst[two_nt], nt);
+            }
+            /* If top-right is Unavailable, copy the last top value */
+            if(tp_right == 0)
+            {
+                ihevc_memset(&pu1_dst[three_nt + 1], pu1_dst[three_nt], nt);
+
+            }
+        }
+
+        if(nt == 16)
+        {
+            WORD32 nbr_flags_temp = 0;
+            nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
+                            + ((nbr_flags & 0x300) >> 4)
+                            + ((nbr_flags & 0x3000) >> 6)
+                            + ((nbr_flags & 0x10000) >> 8);
+
+            /* Else fill the corresponding samples */
+            if(nbr_flags & 0x10000)
+                pu1_dst[two_nt] = *pu1_top_left;
+            else
+                pu1_dst[two_nt] = 0;
+
+            if(nbr_flags & 0xC0)
+            {
+                for(i = 0; i < nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset_mul_8(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+            }
+
+            if(nbr_flags & 0xC)
+            {
+                for(i = nt; i < two_nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset_mul_8(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+            }
+
+
+            if(nbr_flags & 0x300)
+            {
+                ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1], pu1_top, nt);
+            }
+            else
+            {
+                ihevc_memset_mul_8(&pu1_dst[two_nt + 1], 0, nt);
+            }
+
+            if(nbr_flags & 0x3000)
+            {
+                ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+            }
+            else
+            {
+                ihevc_memset_mul_8(&pu1_dst[two_nt + 1 + nt], 0, nt);
+            }
+#endif
+            /* Compute trailing zeros in nbr_flags for the below-left substitution process (see the standard) */
+            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
+
+                if(nbr_id_from_bl == 64)
+                    nbr_id_from_bl = 32;
+
+                if(nbr_id_from_bl == 32)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags_temp >> 8) & 0x1))
+                    {
+                        nbr_id_from_bl++;
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right;  8 pels per nbr bit */
+                        //nbr_id_from_bl += idx * 8;
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref = pu1_dst[nbr_id_from_bl];
+                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+                    {
+                        pu1_dst[i] = pu1_ref;
+                    }
+                }
+            }
+
+            /* Loop over the 4*nt + 1 reference pixels (excluding those already filled by reverse substitution) */
+            while(nbr_id_from_bl < ((T16_4NT)+1))
+            {
+                /* Obtain the next unavailable idx flag after reverse neighbor substitution */
+                /* Divide by 8 to obtain the original flag index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T16_4NT / 2))
+                {
+                    get_bits = GET_BITS(nbr_flags_temp, 8);
+
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+                }
+                else
+                {
+                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        /* 8 pel substitution (other than TL) */
+                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+                        ihevc_memset_mul_8(pu1_dst + nbr_id_from_bl, pu1_ref, 8);
+
+
+                    }
+
+                }
+                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
+            }
+
+
+        }
+
+        if(nt == 32)
+        {
+#if 1
+            /* Fill the corresponding samples based on the availability flags */
+            if(nbr_flags & 0x10000)
+                pu1_dst[two_nt] = *pu1_top_left;
+            else
+                pu1_dst[two_nt] = 0;
+
+            if(nbr_flags & 0xF0)
+            {
+                for(i = 0; i < nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset_mul_8(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+            }
+
+            if(nbr_flags & 0xF)
+            {
+                for(i = nt; i < two_nt; i++)
+                    pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+            }
+            else
+            {
+                ihevc_memset_mul_8(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+            }
+
+
+            if(nbr_flags & 0xF00)
+            {
+                ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1], pu1_top, nt);
+            }
+            else
+            {
+                ihevc_memset_mul_8(&pu1_dst[two_nt + 1], 0, nt);
+            }
+
+            if(nbr_flags & 0xF000)
+            {
+                ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+            }
+            else
+            {
+                ihevc_memset_mul_8(&pu1_dst[two_nt + 1 + nt], 0, nt);
+            }
+#endif
+            /* Compute trailing zeros in nbr_flags for the below-left substitution process (see the standard) */
+            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
+
+                if(nbr_id_from_bl == 64)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags >> 16) & 0x1))
+                    {
+                        /* top left not available */
+                        nbr_id_from_bl++;
+                        /* top and top right;  8 pels per nbr bit */
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref = pu1_dst[nbr_id_from_bl];
+                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+                        pu1_dst[i] = pu1_ref;
+                }
+            }
+
+            /* Loop over the 4*nt + 1 reference pixels (excluding those already filled by reverse substitution) */
+            while(nbr_id_from_bl < ((T32_4NT)+1))
+            {
+                /* Obtain the next unavailable idx flag after reverse neighbor substitution */
+                /* Divide by 8 to obtain the original flag index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T32_4NT / 2))
+                {
+                    get_bits = GET_BITS(nbr_flags, 16);
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+                }
+                else
+                {
+                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        /* 8 pel substitution (other than TL) */
+                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+                        ihevc_memset_mul_8(&pu1_dst[nbr_id_from_bl], pu1_ref, 8);
+
+                    }
+
+                }
+                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
+            }
+        }
+
+    }
+}
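+
+/* For reference, a plain-C sketch of the trailing-zero count that the
+ * substitution logic above relies on. This is an assumption about the
+ * behavior of look_up_trailing_zeros() (defined elsewhere in this change):
+ * it returns the number of trailing zero bits in a flag byte, i.e. 8 when
+ * no flag bit is set, so that multiplying by 8 yields the index of the
+ * first available pixel. Kept compiled out; the decoder's own helper is
+ * what the functions above call. */
+#if 0
+static WORD32 look_up_trailing_zeros_ref(WORD32 flags)
+{
+    WORD32 count = 0;
+    if(0 == (flags & 0xFF))
+        return 8; /* no available neighbor in this flag byte */
+    while(0 == (flags & 1))
+    {
+        count++;
+        flags >>= 1;
+    }
+    return count;
+}
+#endif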
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Reference sample filtering for intra prediction
+*
+*
+* @par Description:
+*    Filtering of neighboring reference samples, dependent on TU size and
+*    mode. Refer to section 8.4.4.2.3 in the standard
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @param[in] strong_intra_smoothing_enable_flag
+*  integer flag enabling strong (bilinear) smoothing of 32x32 reference samples
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_ref_filtering(UWORD8 *pu1_src,
+                                    WORD32 nt,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 mode,
+                                    WORD32 strong_intra_smoothing_enable_flag)
+{
+    WORD32 filter_flag;
+    WORD32 i; /* Generic indexing variable */
+    WORD32 four_nt = 4 * nt;
+    UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1];
+    WORD32 bi_linear_int_flag = 0;
+    WORD32 abs_cond_left_flag = 0;
+    WORD32 abs_cond_top_flag = 0;
+    /*WORD32 dc_val = 1 << (BIT_DEPTH - 5);*/
+    WORD32 dc_val = 1 << (8 - 5);
+    //WORD32 strong_intra_smoothing_enable_flag  = 1;
+
+    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
+    if(0 == filter_flag)
+    {
+        if(pu1_src == pu1_dst)
+        {
+            return;
+        }
+        else
+        {
+            for(i = 0; i < (four_nt + 1); i++)
+                pu1_dst[i] = pu1_src[i];
+        }
+    }
+
+    else
+    {
+        /* If strong intra smoothing is enabled and the transform size is 32 */
+        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
+        {
+            /* Strong Intra Filtering */
+            abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
+                            - (2 * pu1_src[3 * nt]))) < dc_val;
+            abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
+                            - (2 * pu1_src[nt]))) < dc_val;
+
+            bi_linear_int_flag = ((1 == abs_cond_left_flag)
+                            && (1 == abs_cond_top_flag));
+        }
+        /* Extremities Untouched*/
+        au1_flt[0] = pu1_src[0];
+        au1_flt[4 * nt] = pu1_src[4 * nt];
+
+        /* Strong filtering of reference samples */
+        if(1 == bi_linear_int_flag)
+        {
+            au1_flt[2 * nt] = pu1_src[2 * nt];
+
+            for(i = 1; i < (2 * nt); i++)
+                au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6;
+
+            for(i = 1; i < (2 * nt); i++)
+                au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
+
+        }
+        else
+        {
+            /* Default [1,2,1] smoothing filter for the reference samples */
+            for(i = 0; i < (four_nt - 1); i++)
+            {
+                au1_flt[i + 1] = (pu1_src[i] + 2 * pu1_src[i + 1]
+                                + pu1_src[i + 2] + 2) >> 2;
+            }
+        }
+
+
+        for(i = 0; i < (four_nt + 1); i++)
+            pu1_dst[i] = au1_flt[i];
+    }
+
+}
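+
+/* Usage sketch with hypothetical buffers: smooth a 4*nt + 1 sample
+ * reference array in place for a 16x16 TU in angular mode 6 with strong
+ * intra smoothing disabled (au1_ref is assumed to hold the substituted
+ * reference samples produced by the function above):
+ *
+ *     ihevc_intra_pred_ref_filtering(au1_ref, 16, au1_ref, 6, 0);
+ *
+ * The in-place call is safe because the filtered samples are staged in
+ * the local au1_flt array before being copied back. */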
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma planar
+*
+* @par Description:
+*    Planar Intraprediction with reference neighboring samples location
+*    pointed by 'pu1_ref' to the TU block location  pointed by 'pu1_dst'  Refer
+*    to section 8.4.4.2.4 in the standard
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_planar(UWORD8 *pu1_ref,
+                                  WORD32 src_strd,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 dst_strd,
+                                  WORD32 nt,
+                                  WORD32 mode)
+{
+
+
+    WORD32 row, col;
+    WORD32 log2nt = 5;
+    WORD32 two_nt, three_nt;
+    UNUSED(src_strd);
+    UNUSED(mode);
+    switch(nt)
+    {
+        case 32:
+            log2nt = 5;
+            break;
+        case 16:
+            log2nt = 4;
+            break;
+        case 8:
+            log2nt = 3;
+            break;
+        case 4:
+            log2nt = 2;
+            break;
+        default:
+            break;
+    }
+    two_nt = 2 * nt;
+    three_nt = 3 * nt;
+    /* Planar filtering */
+    for(row = 0; row < nt; row++)
+    {
+        for(col = 0; col < nt; col++)
+        {
+            pu1_dst[row * dst_strd + col] = ((nt - 1 - col)
+                            * pu1_ref[two_nt - 1 - row]
+                            + (col + 1) * pu1_ref[three_nt + 1]
+                            + (nt - 1 - row) * pu1_ref[two_nt + 1 + col]
+                            + (row + 1) * pu1_ref[nt - 1] + nt) >> (log2nt + 1);
+        }
+    }
+}
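+
+/* Worked instance of the planar expression above for nt = 4 (log2nt = 2),
+ * row = 0, col = 0:
+ *
+ *     pu1_dst[0] = (3 * pu1_ref[7]     // left neighbor
+ *                 + 1 * pu1_ref[13]    // top-right corner
+ *                 + 3 * pu1_ref[9]     // top neighbor
+ *                 + 1 * pu1_ref[3]     // bottom-left corner
+ *                 + 4) >> 3;
+ *
+ * i.e. a blend of the horizontal and vertical linear interpolations,
+ * rounded by nt and normalized by 2 * nt. */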
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma dc
+*
+* @par Description:
+*   Intraprediction for DC mode with reference neighboring  samples location
+*   pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
+*   to section 8.4.4.2.5 in the standard
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_dc(UWORD8 *pu1_ref,
+                              WORD32 src_strd,
+                              UWORD8 *pu1_dst,
+                              WORD32 dst_strd,
+                              WORD32 nt,
+                              WORD32 mode)
+{
+
+    WORD32 acc_dc;
+    WORD32 dc_val, two_dc_val, three_dc_val;
+    WORD32 i;
+    WORD32 row, col;
+    WORD32 log2nt = 5;
+    WORD32 two_nt, three_nt;
+    UNUSED(mode);
+    UNUSED(src_strd);
+    switch(nt)
+    {
+        case 32:
+            log2nt = 5;
+            break;
+        case 16:
+            log2nt = 4;
+            break;
+        case 8:
+            log2nt = 3;
+            break;
+        case 4:
+            log2nt = 2;
+            break;
+        default:
+            break;
+    }
+    two_nt = 2 * nt;
+    three_nt = 3 * nt;
+
+    acc_dc = 0;
+    /* Calculate DC value for the transform block */
+    for(i = nt; i < two_nt; i++)
+        acc_dc += pu1_ref[i];
+
+    for(i = (two_nt + 1); i <= three_nt; i++)
+        acc_dc += pu1_ref[i];
+
+    dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+    two_dc_val = 2 * dc_val;
+    three_dc_val = 3 * dc_val;
+
+
+    if(nt == 32)
+    {
+        for(row = 0; row < nt; row++)
+            for(col = 0; col < nt; col++)
+                pu1_dst[(row * dst_strd) + col] = dc_val;
+    }
+    else
+    {
+        /* DC filtering for the first top row and first left column */
+        pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+                        >> 2);
+
+        for(col = 1; col < nt; col++)
+            pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2;
+
+        for(row = 1; row < nt; row++)
+            pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+                            >> 2;
+
+        /* Fill the remaining rows with DC value*/
+        for(row = 1; row < nt; row++)
+            for(col = 1; col < nt; col++)
+                pu1_dst[(row * dst_strd) + col] = dc_val;
+    }
+}
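+
+/* Worked instance of the DC derivation above for nt = 8 (log2nt = 3):
+ * acc_dc sums the 8 left neighbors pu1_ref[8..15] and the 8 top
+ * neighbors pu1_ref[17..24]; with every neighbor equal to 100,
+ * dc_val = (1600 + 8) >> 4 = 100, after which the first row and column
+ * are blended towards the reference samples by the >> 2 filters. */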
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Intra prediction interpolation filter for the luma horizontal mode.
+*
+* @par Description:
+*      Horizontal intraprediction(mode 10) with reference  samples location
+*      pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
+*      to section 8.4.4.2.6 in the standard (Special case)
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_horz(UWORD8 *pu1_ref,
+                                WORD32 src_strd,
+                                UWORD8 *pu1_dst,
+                                WORD32 dst_strd,
+                                WORD32 nt,
+                                WORD32 mode)
+{
+
+    WORD32 row, col;
+    WORD32 two_nt;
+    WORD16 s2_predpixel;
+    UNUSED(mode);
+    UNUSED(src_strd);
+    two_nt = 2 * nt;
+
+    if(nt == 32)
+    {
+        for(row = 0; row < nt; row++)
+            for(col = 0; col < nt; col++)
+                pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];
+    }
+    else
+    {
+        /*Filtering done for the 1st row */
+        for(col = 0; col < nt; col++)
+        {
+            s2_predpixel = pu1_ref[two_nt - 1]
+                            + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1);
+            pu1_dst[col] = CLIP_U8(s2_predpixel);
+        }
+
+        /* Replication to next rows*/
+        for(row = 1; row < nt; row++)
+            for(col = 0; col < nt; col++)
+                pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];
+    }
+}
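+
+/* The first-row filter above adds half of the top-row gradient
+ * (pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) to the left reference
+ * sample and clips the result to 8 bits, smoothing the seam between the
+ * horizontal prediction and the top neighbors; it is skipped for
+ * nt == 32, where plain replication is used. */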
+
+
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Intra prediction interpolation filter for the luma vertical mode.
+*
+* @par Description:
+*    Vertical intraprediction with reference neighboring samples location
+*    pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
+*    to section 8.4.4.2.6 in the standard (Special case)
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_ver(UWORD8 *pu1_ref,
+                               WORD32 src_strd,
+                               UWORD8 *pu1_dst,
+                               WORD32 dst_strd,
+                               WORD32 nt,
+                               WORD32 mode)
+{
+    WORD32 row, col;
+    WORD16 s2_predpixel;
+    WORD32 two_nt = 2 * nt;
+    UNUSED(mode);
+    UNUSED(src_strd);
+
+    if(nt == 32)
+    {
+        /* Replication to next columns*/
+        for(row = 0; row < nt; row++)
+            for(col = 0; col < nt; col++)
+                pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col];
+    }
+    else
+    {
+        /*Filtering done for the 1st column */
+        for(row = 0; row < nt; row++)
+        {
+            s2_predpixel = pu1_ref[two_nt + 1]
+                            + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
+            pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel);
+        }
+
+        /* Replication to next columns*/
+        for(row = 0; row < nt; row++)
+            for(col = 1; col < nt; col++)
+                pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col];
+    }
+}
+
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Intra prediction interpolation filter for luma mode2.
+*
+* @par Description:
+*    Intraprediction for mode 2 (sw angle) with reference  neighboring samples
+*    location pointed by 'pu1_ref' to the  TU block location pointed by
+*    'pu1_dst'  Refer to section 8.4.4.2.6 in the standard
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode2(UWORD8 *pu1_ref,
+                                 WORD32 src_strd,
+                                 UWORD8 *pu1_dst,
+                                 WORD32 dst_strd,
+                                 WORD32 nt,
+                                 WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 two_nt = 2 * nt;
+    WORD32 intra_pred_ang = 32;
+    WORD32 idx = 0;
+    UNUSED(mode);
+    UNUSED(src_strd);
+    /* For the angle 45, replication is done from the corresponding angle */
+    /* intra_pred_ang = tan(angle) in q5 format */
+    for(col = 0; col < nt; col++)
+    {
+        idx = ((col + 1) * intra_pred_ang) >> 5; /* For angle 32 this is just col + 1; could be replaced with idx++ */
+
+        for(row = 0; row < nt; row++)
+            pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt - row - idx - 1];
+    }
+
+}
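+
+/* With intra_pred_ang fixed at 32, idx = ((col + 1) * 32) >> 5 reduces to
+ * col + 1, so each successive column reads one sample further along the
+ * left reference array: a pure 45-degree copy with no interpolation,
+ * e.g. pu1_dst[0] = pu1_ref[two_nt - 2] for row = 0, col = 0. */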
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma mode 18 & mode 34.
+*
+* @par Description:
+*    Intraprediction for mode 34 (ne angle) and  mode 18 (nw angle) with
+*    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
+*    block location pointed by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_18_34(UWORD8 *pu1_ref,
+                                      WORD32 src_strd,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 dst_strd,
+                                      WORD32 nt,
+                                      WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 intra_pred_ang;
+    WORD32 idx = 0;
+    WORD32 two_nt = 2 * nt;
+    UNUSED(src_strd);
+    intra_pred_ang = 32;    /*Default value*/
+
+    /* For mode 18, the angle is -45 degrees */
+    if(mode == 18)
+        intra_pred_ang = -32;
+    /* For mode 34, the angle is 45 degrees */
+    else if(mode == 34)
+        intra_pred_ang = 32;
+    /* For the angles 45 and -45, samples are replicated along the corresponding direction */
+    /* No interpolation is needed for 45 degrees */
+    for(row = 0; row < nt; row++)
+    {
+        idx = ((row + 1) * intra_pred_ang) >> 5;
+#if OPT
+        if(mode == 18)
+            idx--;
+        if(mode == 34)
+            idx++;
+#endif
+        for(col = 0; col < nt; col++)
+            pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1];
+
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma mode 3 to mode 9
+*
+* @par Description:
+*    Intraprediction for mode 3 to 9  (positive angle, horizontal mode ) with
+*    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
+*    block location pointed by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_3_to_9(UWORD8 *pu1_ref,
+                                       WORD32 src_strd,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 dst_strd,
+                                       WORD32 nt,
+                                       WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 two_nt = 2 * nt;
+    WORD32 intra_pred_ang;
+    WORD32 idx, ref_main_idx;
+    WORD32 pos, fract;
+    UNUSED(src_strd);
+    /* Intra Pred Angle according to the mode */
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, dependent on the distance, to obtain the destination sample */
+
+    for(col = 0; col < nt; col++)
+    {
+        pos = ((col + 1) * intra_pred_ang);
+        idx = pos >> 5;
+        fract = pos & (31);
+
+        // Do linear filtering
+        for(row = 0; row < nt; row++)
+        {
+            ref_main_idx = two_nt - row - idx - 1;
+            pu1_dst[col + (row * dst_strd)] = (((32 - fract)
+                            * pu1_ref[ref_main_idx]
+                            + fract * pu1_ref[ref_main_idx - 1] + 16) >> 5);
+        }
+
+    }
+
+}
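+
+/* Worked instance, assuming the standard HEVC angle table in which mode 7
+ * maps to intra_pred_ang = 9 (an assumption about gai4_ihevc_ang_table):
+ * for col = 0, pos = 9, so idx = 0 and fract = 9, giving
+ *
+ *     pu1_dst[row * dst_strd] = (23 * pu1_ref[two_nt - row - 1]
+ *                              +  9 * pu1_ref[two_nt - row - 2] + 16) >> 5;
+ *
+ * i.e. a 32-phase linear blend of the two nearest left reference samples. */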
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Intra prediction interpolation filter for luma mode 11 to mode 17
+*
+* @par Description:
+*    Intraprediction for mode 11 to 17  (negative angle, horizontal mode )
+*    with reference  neighboring samples location pointed by 'pu1_ref' to the
+*    TU block location pointed by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_11_to_17(UWORD8 *pu1_ref,
+                                         WORD32 src_strd,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 dst_strd,
+                                         WORD32 nt,
+                                         WORD32 mode)
+{
+    /* This function and ihevc_intra_pred_luma_mode_19_to_25 are the same except */
+    /* for the ref main & side samples assignment; they can be combined for */
+    /* optimization */
+
+    WORD32 row, col, k;
+    WORD32 two_nt;
+    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+    WORD32 idx, ref_main_idx, ref_idx;
+    WORD32 pos, fract;
+
+    UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
+    UWORD8 *ref_main;
+    UNUSED(src_strd);
+    inv_ang_sum = 128;
+    two_nt    = 2 * nt;
+
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+    /* Intermediate reference samples for negative angle modes */
+    /* This has to be removed during optimization */
+    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+
+    ref_main = ref_temp + nt - 1;
+    for(k = 0; k < nt + 1; k++)
+        ref_temp[k + nt - 1] = pu1_ref[two_nt - k];
+
+    ref_idx = (nt * intra_pred_ang) >> 5;
+
+    /* SIMD Optimization can be done using look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+    for(k = -1; k > ref_idx; k--)
+    {
+        inv_ang_sum += inv_ang;
+        ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
+    }
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, dependent on the distance, to obtain the destination sample */
+    for(col = 0; col < nt; col++)
+    {
+        pos = ((col + 1) * intra_pred_ang);
+        idx = pos >> 5;
+        fract = pos & (31);
+
+        // Do linear filtering
+        for(row = 0; row < nt; row++)
+        {
+            ref_main_idx = row + idx + 1;
+            pu1_dst[col + (dst_strd * row)] = (UWORD8)(((32 - fract)
+                            * ref_main[ref_main_idx]
+                            + fract * ref_main[ref_main_idx + 1] + 16) >> 5);
+
+        }
+
+    }
+
+}
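+
+/* Projection sketch, assuming the usual HEVC inverse-angle value of 4096
+ * for intra_pred_ang = -2, i.e. mode 11 (an assumption about
+ * gai4_ihevc_inv_ang_table): for nt = 32, ref_idx = (32 * -2) >> 5 = -2,
+ * so the loop above runs once with k = -1, inv_ang_sum = 128 + 4096 = 4224,
+ * and ref_main[-1] = pu1_ref[two_nt + (4224 >> 8)] = pu1_ref[two_nt + 16]:
+ * a single sample projected from the top (side) reference row. */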
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Intra prediction interpolation filter for luma mode 19 to mode 25
+*
+* @par Description:
+*    Intraprediction for mode 19 to 25  (negative angle, vertical mode ) with
+*    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
+*    block location pointed by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_19_to_25(UWORD8 *pu1_ref,
+                                         WORD32 src_strd,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 dst_strd,
+                                         WORD32 nt,
+                                         WORD32 mode)
+{
+
+    WORD32 row, col, k;
+    WORD32 two_nt, intra_pred_ang, idx;
+    WORD32 inv_ang, inv_ang_sum, pos, fract;
+    WORD32 ref_main_idx, ref_idx;
+    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];
+    UWORD8 *ref_main;
+    UNUSED(src_strd);
+    two_nt = 2 * nt;
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+    inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
+
+    /* Intermediate reference samples for negative angle modes */
+    /* This has to be removed during optimization */
+    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+    ref_main = ref_temp + nt - 1;
+    for(k = 0; k < (nt + 1); k++)
+        ref_temp[k + nt - 1] = pu1_ref[two_nt + k];
+
+    ref_idx = (nt * intra_pred_ang) >> 5;
+    inv_ang_sum = 128;
+
+    /* SIMD Optimization can be done using look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+    for(k = -1; k > ref_idx; k--)
+    {
+        inv_ang_sum += inv_ang;
+        ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
+    }
+
+    for(row = 0; row < nt; row++)
+    {
+        pos = ((row + 1) * intra_pred_ang);
+        idx = pos >> 5;
+        fract = pos & (31);
+
+        // Do linear filtering
+        for(col = 0; col < nt; col++)
+        {
+            ref_main_idx = col + idx + 1;
+            pu1_dst[(row * dst_strd) + col] = (UWORD8)(((32 - fract)
+                            * ref_main[ref_main_idx]
+                            + fract * ref_main[ref_main_idx + 1] + 16) >> 5);
+
+        }
+
+    }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma mode 27 to mode 33
+*
+* @par Description:
+*    Intraprediction for mode 27 to 33  (positive angle, vertical mode ) with
+*    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
+*    block location pointed by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_27_to_33(UWORD8 *pu1_ref,
+                                         WORD32 src_strd,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 dst_strd,
+                                         WORD32 nt,
+                                         WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 two_nt, pos, fract;
+    WORD32 intra_pred_ang;
+    WORD32 idx, ref_main_idx;
+    UNUSED(src_strd);
+    two_nt = 2 * nt;
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    for(row = 0; row < nt; row++)
+    {
+        pos = ((row + 1) * intra_pred_ang);
+        idx = pos >> 5;
+        fract = pos & (31);
+
+        // Do linear filtering
+        for(col = 0; col < nt; col++)
+        {
+            ref_main_idx = two_nt + col + idx + 1;
+            pu1_dst[col + (row * dst_strd)] = (((32 - fract)
+                            * pu1_ref[ref_main_idx]
+                            + fract * pu1_ref[ref_main_idx + 1] + 16) >> 5);
+        }
+
+    }
+
+}
+
diff --git a/common/ihevc_iquant_itrans_recon.c b/common/ihevc_iquant_itrans_recon.c
new file mode 100644
index 0000000..249aa56
--- /dev/null
+++ b/common/ihevc_iquant_itrans_recon.c
@@ -0,0 +1,456 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_iquant_itrans_recon.c
+ *
+ * @brief
+ *  Contains function definitions for inverse  quantization, inverse
+ * transform and reconstruction
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_iquant_itrans_recon_4x4_ttype1()
+ *  - ihevc_iquant_itrans_recon_4x4()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions here are replicated from ihevc_itrans.c and modified to */
+/* include reconstruction */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization, inverse  transform
+ * type1(DST) and reconstruction for 4x4  input block
+ *
+ * @par Description:
+ *  Performs inverse quantization, inverse transform type 1, adds the
+ * prediction data and clips the output to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ *  Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] qp_div
+ *  Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ *  Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero Rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_itrans_recon_4x4_ttype1(WORD16 *pi2_src,
+                                          WORD16 *pi2_tmp,
+                                          UWORD8 *pu1_pred,
+                                          WORD16 *pi2_dequant_coeff,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 qp_div, /* qpscaled / 6 */
+                                          WORD32 qp_rem, /* qpscaled % 6 */
+                                          WORD32 src_strd,
+                                          WORD32 pred_strd,
+                                          WORD32 dst_strd,
+                                          WORD32 zero_cols,
+                                          WORD32 zero_rows)
+{
+    UNUSED(zero_rows);
+    /* Inverse Quant and Inverse Transform and Reconstruction */
+    {
+        WORD32 i, c[4];
+        WORD32 add;
+        WORD32 shift;
+        WORD16 *pi2_tmp_orig;
+        WORD32 shift_iq;
+        WORD32 trans_size;
+        /* Inverse Quantization constants */
+        {
+            WORD32 log2_trans_size, bit_depth;
+
+            log2_trans_size = 2;
+            bit_depth = 8 + 0;
+            shift_iq = bit_depth + log2_trans_size - 5;
+        }
+
+        trans_size = TRANS_SIZE_4;
+        pi2_tmp_orig = pi2_tmp;
+
+        /* Inverse Transform 1st stage */
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(i = 0; i < trans_size; i++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                WORD32 iq_tmp_1, iq_tmp_2, iq_tmp_3;
+                // Intermediate Variables
+                IQUANT_4x4(iq_tmp_1,
+                           pi2_src[0 * src_strd],
+                           pi2_dequant_coeff[0 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                IQUANT_4x4(iq_tmp_2,
+                           pi2_src[2 * src_strd],
+                           pi2_dequant_coeff[2 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                c[0] = iq_tmp_1 + iq_tmp_2;
+
+                IQUANT_4x4(iq_tmp_1,
+                           pi2_src[2 * src_strd],
+                           pi2_dequant_coeff[2 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                IQUANT_4x4(iq_tmp_2,
+                           pi2_src[3 * src_strd],
+                           pi2_dequant_coeff[3 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                c[1] = iq_tmp_1 + iq_tmp_2;
+
+                IQUANT_4x4(iq_tmp_1,
+                           pi2_src[0 * src_strd],
+                           pi2_dequant_coeff[0 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                IQUANT_4x4(iq_tmp_2,
+                           pi2_src[3 * src_strd],
+                           pi2_dequant_coeff[3 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                c[2] = iq_tmp_1 - iq_tmp_2;
+
+                IQUANT_4x4(iq_tmp_1,
+                           pi2_src[1 * src_strd],
+                           pi2_dequant_coeff[1 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                c[3] = 74 * iq_tmp_1;
+
+                pi2_tmp[0] =
+                                CLIP_S16((29 * c[0] + 55 * c[1] + c[3] + add) >> shift);
+                pi2_tmp[1] =
+                                CLIP_S16((55 * c[2] - 29 * c[1] + c[3] + add) >> shift);
+
+                IQUANT_4x4(iq_tmp_1,
+                           pi2_src[0 * src_strd],
+                           pi2_dequant_coeff[0 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                IQUANT_4x4(iq_tmp_2,
+                           pi2_src[2 * src_strd],
+                           pi2_dequant_coeff[2 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                IQUANT_4x4(iq_tmp_3,
+                           pi2_src[3 * src_strd],
+                           pi2_dequant_coeff[3 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+
+                pi2_tmp[2] =
+                                CLIP_S16((74 * (iq_tmp_1 - iq_tmp_2 + iq_tmp_3) + add) >> shift);
+                pi2_tmp[3] =
+                                CLIP_S16((55 * c[0] + 29 * c[2] - c[3] + add) >> shift);
+            }
+            pi2_src++;
+            pi2_dequant_coeff++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+
+        for(i = 0; i < trans_size; i++)
+        {
+            WORD32 itrans_out;
+
+            // Intermediate Variables
+            c[0] = pi2_tmp[0] + pi2_tmp[2 * trans_size];
+            c[1] = pi2_tmp[2 * trans_size] + pi2_tmp[3 * trans_size];
+            c[2] = pi2_tmp[0] - pi2_tmp[3 * trans_size];
+            c[3] = 74 * pi2_tmp[trans_size];
+
+            itrans_out =
+                            CLIP_S16((29 * c[0] + 55 * c[1] + c[3] + add) >> shift);
+            pu1_dst[0] = CLIP_U8((itrans_out + pu1_pred[0]));
+
+            itrans_out =
+                            CLIP_S16((55 * c[2] - 29 * c[1] + c[3] + add) >> shift);
+            pu1_dst[1] = CLIP_U8((itrans_out + pu1_pred[1]));
+
+            itrans_out =
+                            CLIP_S16((74 * (pi2_tmp[0] - pi2_tmp[2 * trans_size] + pi2_tmp[3 * trans_size]) + add) >> shift);
+            pu1_dst[2] = CLIP_U8((itrans_out + pu1_pred[2]));
+
+            itrans_out =
+                            CLIP_S16((55 * c[0] + 29 * c[2] - c[3] + add) >> shift);
+            pu1_dst[3] = CLIP_U8((itrans_out + pu1_pred[3]));
+            pi2_tmp++;
+            pu1_pred += pred_strd;
+            pu1_dst += dst_strd;
+        }
+    }
+}
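+
+/* The {29, 55, 74} weights above are the 4-point DST-VII basis that HEVC
+ * uses for 4x4 intra luma residuals (transform type 1); the plain 4x4
+ * path below instead uses the partial-butterfly DCT constants tabulated
+ * in g_ai2_ihevc_trans_4. */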
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization, inverse  transform and
+ * reconstruction for 4x4 input block
+ *
+ * @par Description:
+ *  Performs inverse quantization, inverse transform, adds the
+ * prediction data and clips the output to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ *  Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] qp_div
+ *  Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ *  Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero Rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_itrans_recon_4x4(WORD16 *pi2_src,
+                                   WORD16 *pi2_tmp,
+                                   UWORD8 *pu1_pred,
+                                   WORD16 *pi2_dequant_coeff,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 qp_div, /* qpscaled / 6 */
+                                   WORD32 qp_rem, /* qpscaled % 6 */
+                                   WORD32 src_strd,
+                                   WORD32 pred_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 zero_cols,
+                                   WORD32 zero_rows)
+{
+    UNUSED(zero_rows);
+    /* Inverse Transform */
+    {
+        WORD32 j;
+        WORD32 e[2], o[2];
+        WORD32 add;
+        WORD32 shift;
+        WORD16 *pi2_tmp_orig;
+        WORD32 shift_iq;
+        WORD32 trans_size;
+        /* Inverse Quantization constants */
+        {
+            WORD32 log2_trans_size, bit_depth;
+
+            log2_trans_size = 2;
+            bit_depth = 8 + 0;
+            shift_iq = bit_depth + log2_trans_size - 5;
+        }
+
+        trans_size = TRANS_SIZE_4;
+        pi2_tmp_orig = pi2_tmp;
+
+        /* Inverse Transform 1st stage */
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < trans_size; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                WORD32 iq_tmp_1, iq_tmp_2;
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                IQUANT_4x4(iq_tmp_1,
+                           pi2_src[1 * src_strd],
+                           pi2_dequant_coeff[1 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                IQUANT_4x4(iq_tmp_2,
+                           pi2_src[3 * src_strd],
+                           pi2_dequant_coeff[3 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+
+                o[0] = g_ai2_ihevc_trans_4[1][0] * iq_tmp_1
+                                + g_ai2_ihevc_trans_4[3][0] * iq_tmp_2;
+                o[1] = g_ai2_ihevc_trans_4[1][1] * iq_tmp_1
+                                + g_ai2_ihevc_trans_4[3][1] * iq_tmp_2;
+
+                IQUANT_4x4(iq_tmp_1,
+                           pi2_src[0 * src_strd],
+                           pi2_dequant_coeff[0 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+                IQUANT_4x4(iq_tmp_2,
+                           pi2_src[2 * src_strd],
+                           pi2_dequant_coeff[2 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+
+                e[0] = g_ai2_ihevc_trans_4[0][0] * iq_tmp_1
+                                + g_ai2_ihevc_trans_4[2][0] * iq_tmp_2;
+                e[1] = g_ai2_ihevc_trans_4[0][1] * iq_tmp_1
+                                + g_ai2_ihevc_trans_4[2][1] * iq_tmp_2;
+
+                pi2_tmp[0] =
+                                CLIP_S16(((e[0] + o[0] + add) >> shift));
+                pi2_tmp[1] =
+                                CLIP_S16(((e[1] + o[1] + add) >> shift));
+                pi2_tmp[2] =
+                                CLIP_S16(((e[1] - o[1] + add) >> shift));
+                pi2_tmp[3] =
+                                CLIP_S16(((e[0] - o[0] + add) >> shift));
+            }
+            pi2_src++;
+            pi2_dequant_coeff++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < trans_size; j++)
+        {
+            WORD32 itrans_out;
+
+            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+            o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_tmp[trans_size]
+                            + g_ai2_ihevc_trans_4[3][0]
+                                            * pi2_tmp[3 * trans_size];
+            o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_tmp[trans_size]
+                            + g_ai2_ihevc_trans_4[3][1]
+                                            * pi2_tmp[3 * trans_size];
+            e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_tmp[0]
+                            + g_ai2_ihevc_trans_4[2][0]
+                                            * pi2_tmp[2 * trans_size];
+            e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_tmp[0]
+                            + g_ai2_ihevc_trans_4[2][1]
+                                            * pi2_tmp[2 * trans_size];
+
+            itrans_out =
+                            CLIP_S16(((e[0] + o[0] + add) >> shift));
+            pu1_dst[0] = CLIP_U8((itrans_out + pu1_pred[0]));
+
+            itrans_out =
+                            CLIP_S16(((e[1] + o[1] + add) >> shift));
+            pu1_dst[1] = CLIP_U8((itrans_out + pu1_pred[1]));
+
+            itrans_out =
+                            CLIP_S16(((e[1] - o[1] + add) >> shift));
+            pu1_dst[2] = CLIP_U8((itrans_out + pu1_pred[2]));
+
+            itrans_out =
+                            CLIP_S16(((e[0] - o[0] + add) >> shift));
+            pu1_dst[3] = CLIP_U8((itrans_out + pu1_pred[3]));
+
+            pi2_tmp++;
+            pu1_pred += pred_strd;
+            pu1_dst += dst_strd;
+
+        }
+    }
+}
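+
+/* Usage sketch with hypothetical buffers: reconstruct one 4x4 luma block
+ * at QP 30 (qp_div = 30 / 6 = 5, qp_rem = 30 % 6 = 0) with no
+ * zero-column/zero-row shortcuts, coefficients packed contiguously
+ * (src_strd = 4):
+ *
+ *     WORD16 ai2_tmp[16];
+ *     ihevc_iquant_itrans_recon_4x4(pi2_coeffs, ai2_tmp, pu1_pred,
+ *                                   pi2_dequant_coeff, pu1_out,
+ *                                   5, 0, 4, pred_strd, dst_strd, 0, 0);
+ */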
diff --git a/common/ihevc_iquant_itrans_recon.h b/common/ihevc_iquant_itrans_recon.h
new file mode 100644
index 0000000..33055b4
--- /dev/null
+++ b/common/ihevc_iquant_itrans_recon.h
@@ -0,0 +1,197 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_iquant_itrans_recon.h
+*
+* @brief
+*  Functions declarations for inverse quantization,  inverse transform and
+* reconstruction
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVC_IQUANT_ITRANS_RECON_H_
+#define _IHEVC_IQUANT_ITRANS_RECON_H_
+
+typedef void ihevc_iquant_itrans_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+                                                     WORD16 *pi2_tmp,
+                                                     UWORD8 *pu1_pred,
+                                                     WORD16 *pi2_dequant_coeff,
+                                                     UWORD8 *pu1_dst,
+                                                     WORD32 qp_div, /* qpscaled / 6 */
+                                                     WORD32 qp_rem, /* qpscaled % 6 */
+                                                     WORD32 src_strd,
+                                                     WORD32 pred_strd,
+                                                     WORD32 dst_strd,
+                                                     WORD32 zero_cols,
+                                                     WORD32 zero_rows);
+
+typedef void ihevc_hbd_iquant_itrans_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+                                                         WORD16 *pi2_tmp,
+                                                         UWORD16 *pu2_pred,
+                                                         WORD16 *pi2_dequant_coeff,
+                                                         UWORD16 *pu2_dst,
+                                                         WORD32 qp_div, /* qpscaled / 6 */
+                                                         WORD32 qp_rem, /* qpscaled % 6 */
+                                                         WORD32 src_strd,
+                                                         WORD32 pred_strd,
+                                                         WORD32 dst_strd,
+                                                         WORD32 zero_cols,
+                                                         WORD32 zero_rows,
+                                                         UWORD8 bit_depth);
+
+typedef void ihevc_iquant_itrans_recon_4x4_ft(WORD16 *pi2_src,
+                                              WORD16 *pi2_tmp,
+                                              UWORD8 *pu1_pred,
+                                              WORD16 *pi2_dequant_coeff,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 qp_div, /* qpscaled / 6 */
+                                              WORD32 qp_rem, /* qpscaled % 6 */
+                                              WORD32 src_strd,
+                                              WORD32 pred_strd,
+                                              WORD32 dst_strd,
+                                              WORD32 zero_cols,
+                                              WORD32 zero_rows);
+
+typedef void ihevc_hbd_iquant_itrans_recon_4x4_ft(WORD16 *pi2_src,
+                                                  WORD16 *pi2_tmp,
+                                                  UWORD16 *pu2_pred,
+                                                  WORD16 *pi2_dequant_coeff,
+                                                  UWORD16 *pu2_dst,
+                                                  WORD32 qp_div, /* qpscaled / 6 */
+                                                  WORD32 qp_rem, /* qpscaled % 6 */
+                                                  WORD32 src_strd,
+                                                  WORD32 pred_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 zero_cols,
+                                                  WORD32 zero_rows,
+                                                  UWORD8 bit_depth);
+
+typedef void ihevc_iquant_itrans_recon_8x8_ft(WORD16 *pi2_src,
+                                              WORD16 *pi2_tmp,
+                                              UWORD8 *pu1_pred,
+                                              WORD16 *pi2_dequant_coeff,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 qp_div, /* qpscaled / 6 */
+                                              WORD32 qp_rem, /* qpscaled % 6 */
+                                              WORD32 src_strd,
+                                              WORD32 pred_strd,
+                                              WORD32 dst_strd,
+                                              WORD32 zero_cols,
+                                              WORD32 zero_rows);
+
+typedef void ihevc_hbd_iquant_itrans_recon_8x8_ft(WORD16 *pi2_src,
+                                                  WORD16 *pi2_tmp,
+                                                  UWORD16 *pu2_pred,
+                                                  WORD16 *pi2_dequant_coeff,
+                                                  UWORD16 *pu2_dst,
+                                                  WORD32 qp_div, /* qpscaled / 6 */
+                                                  WORD32 qp_rem, /* qpscaled % 6 */
+                                                  WORD32 src_strd,
+                                                  WORD32 pred_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 zero_cols,
+                                                  WORD32 zero_rows,
+                                                  UWORD8 bit_depth);
+
+typedef void ihevc_iquant_itrans_recon_16x16_ft(WORD16 *pi2_src,
+                                                WORD16 *pi2_tmp,
+                                                UWORD8 *pu1_pred,
+                                                WORD16 *pi2_dequant_coeff,
+                                                UWORD8 *pu1_dst,
+                                                WORD32 qp_div, /* qpscaled / 6 */
+                                                WORD32 qp_rem, /* qpscaled % 6 */
+                                                WORD32 src_strd,
+                                                WORD32 pred_strd,
+                                                WORD32 dst_strd,
+                                                WORD32 zero_cols,
+                                                WORD32 zero_rows);
+
+typedef void ihevc_hbd_iquant_itrans_recon_16x16_ft(WORD16 *pi2_src,
+                                                    WORD16 *pi2_tmp,
+                                                    UWORD16 *pu2_pred,
+                                                    WORD16 *pi2_dequant_coeff,
+                                                    UWORD16 *pu2_dst,
+                                                    WORD32 qp_div, /* qpscaled / 6 */
+                                                    WORD32 qp_rem, /* qpscaled % 6 */
+                                                    WORD32 src_strd,
+                                                    WORD32 pred_strd,
+                                                    WORD32 dst_strd,
+                                                    WORD32 zero_cols,
+                                                    WORD32 zero_rows,
+                                                    UWORD8 bit_depth);
+
+typedef void ihevc_iquant_itrans_recon_32x32_ft(WORD16 *pi2_src,
+                                                WORD16 *pi2_tmp,
+                                                UWORD8 *pu1_pred,
+                                                WORD16 *pi2_dequant_coeff,
+                                                UWORD8 *pu1_dst,
+                                                WORD32 qp_div, /* qpscaled / 6 */
+                                                WORD32 qp_rem, /* qpscaled % 6 */
+                                                WORD32 src_strd,
+                                                WORD32 pred_strd,
+                                                WORD32 dst_strd,
+                                                WORD32 zero_cols,
+                                                WORD32 zero_rows);
+
+typedef void ihevc_hbd_iquant_itrans_recon_32x32_ft(WORD16 *pi2_src,
+                                                    WORD16 *pi2_tmp,
+                                                    UWORD16 *pu2_pred,
+                                                    WORD16 *pi2_dequant_coeff,
+                                                    UWORD16 *pu2_dst,
+                                                    WORD32 qp_div, /* qpscaled / 6 */
+                                                    WORD32 qp_rem, /* qpscaled % 6 */
+                                                    WORD32 src_strd,
+                                                    WORD32 pred_strd,
+                                                    WORD32 dst_strd,
+                                                    WORD32 zero_cols,
+                                                    WORD32 zero_rows,
+                                                    UWORD8 bit_depth);
+
+ihevc_iquant_itrans_recon_4x4_ttype1_ft ihevc_iquant_itrans_recon_4x4_ttype1;
+ihevc_hbd_iquant_itrans_recon_4x4_ttype1_ft ihevc_hbd_iquant_itrans_recon_4x4_ttype1;
+ihevc_iquant_itrans_recon_4x4_ft ihevc_iquant_itrans_recon_4x4;
+ihevc_hbd_iquant_itrans_recon_4x4_ft ihevc_hbd_iquant_itrans_recon_4x4;
+ihevc_iquant_itrans_recon_8x8_ft ihevc_iquant_itrans_recon_8x8;
+ihevc_hbd_iquant_itrans_recon_8x8_ft ihevc_hbd_iquant_itrans_recon_8x8;
+ihevc_iquant_itrans_recon_16x16_ft ihevc_iquant_itrans_recon_16x16;
+ihevc_hbd_iquant_itrans_recon_16x16_ft ihevc_hbd_iquant_itrans_recon_16x16;
+ihevc_iquant_itrans_recon_32x32_ft ihevc_iquant_itrans_recon_32x32;
+ihevc_hbd_iquant_itrans_recon_32x32_ft ihevc_hbd_iquant_itrans_recon_32x32;
+
+ihevc_iquant_itrans_recon_4x4_ttype1_ft ihevc_iquant_itrans_recon_4x4_ttype1_sse42;
+ihevc_iquant_itrans_recon_4x4_ft ihevc_iquant_itrans_recon_4x4_sse42;
+ihevc_iquant_itrans_recon_8x8_ft ihevc_iquant_itrans_recon_8x8_sse42;
+ihevc_iquant_itrans_recon_16x16_ft ihevc_iquant_itrans_recon_16x16_sse42;
+ihevc_iquant_itrans_recon_32x32_ft ihevc_iquant_itrans_recon_32x32_sse42;
+
+ihevc_hbd_iquant_itrans_recon_4x4_ttype1_ft ihevc_hbd_iquant_itrans_recon_4x4_ttype1_sse42;
+ihevc_hbd_iquant_itrans_recon_4x4_ft ihevc_hbd_iquant_itrans_recon_4x4_sse42;
+ihevc_hbd_iquant_itrans_recon_8x8_ft ihevc_hbd_iquant_itrans_recon_8x8_sse42;
+ihevc_hbd_iquant_itrans_recon_16x16_ft ihevc_hbd_iquant_itrans_recon_16x16_sse42;
+ihevc_hbd_iquant_itrans_recon_32x32_ft ihevc_hbd_iquant_itrans_recon_32x32_sse42;
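+
+/* Note: each *_ft typedef above gives one shared signature for the generic C      */
+/* implementation and its SSE4.2 counterpart, so the decoder can (for example)     */
+/* hold a function pointer of that type and select a variant at initialization     */
+/* based on detected CPU features.                                                 */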
+#endif /*_IHEVC_IQUANT_ITRANS_RECON_H_*/
+
diff --git a/common/ihevc_iquant_recon.c b/common/ihevc_iquant_recon.c
new file mode 100644
index 0000000..de5ff53
--- /dev/null
+++ b/common/ihevc_iquant_recon.c
@@ -0,0 +1,612 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_iquant_recon.c
+ *
+ * @brief
+ *  Contains function definitions for inverse  quantization and
+ * reconstruction
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_iquant_recon_4x4_ttype1()
+ *  - ihevc_iquant_recon_4x4()
+ *  - ihevc_iquant_recon_8x8()
+ *  - ihevc_iquant_recon_16x16()
+ *  - ihevc_iquant_recon_32x32()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions here are replicated from ihevc_iquant_itrans_recon.c with the */
+/* inverse transform stage removed, so they perform only inverse quantization and  */
+/* reconstruction                                                                  */
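+
+/* For reference: the IQUANT / IQUANT_4x4 macros used below are defined in         */
+/* ihevc_trans_macros.h, not in this file. Following the HEVC scaling process,     */
+/* they are assumed to compute approximately                                       */
+/*                                                                                 */
+/*   out = (((src * dequant_coeff_scaled) << qp_div) + (1 << (shift - 1))) >> shift */
+/*                                                                                 */
+/* where dequant_coeff_scaled is the pre-multiplied second argument                */
+/* (pi2_dequant_coeff[] * g_ihevc_iquant_scales[qp_rem]) and shift = shift_iq;     */
+/* the exact rounding and clipping are defined by the macros themselves.           */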
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization and reconstruction for a 4x4
+ * input block of transform type 1 (DST)
+ *
+ * @par Description:
+ *  This function performs inverse quantization and  reconstruction for 4x4
+ * input block
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ *  Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] qp_div
+ *  Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ *  Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Bitmask of zero columns in pi2_src; a set bit i means column i is all zeros
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_recon_4x4_ttype1(WORD16 *pi2_src,
+                                   UWORD8 *pu1_pred,
+                                   WORD16 *pi2_dequant_coeff,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 qp_div, /* qpscaled / 6 */
+                                   WORD32 qp_rem, /* qpscaled % 6 */
+                                   WORD32 src_strd,
+                                   WORD32 pred_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 zero_cols)
+{
+
+    {
+        /* Inverse Quant and recon */
+        {
+            WORD32 i, j;
+            WORD32 shift_iq;
+            WORD32 trans_size;
+            /* Inverse Quantization constants */
+            {
+                WORD32 log2_trans_size, bit_depth;
+
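+                /* Per the HEVC dequant scaling process, the normalisation      */
+                /* shift is bit_depth + log2(transform size) - 5; the "+ 0"     */
+                /* marks the zero bit-depth extension of this 8-bit build       */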
+                log2_trans_size = 2;
+                bit_depth = 8 + 0;
+                shift_iq = bit_depth + log2_trans_size - 5;
+            }
+
+            trans_size = TRANS_SIZE_4;
+
+            for(i = 0; i < trans_size; i++)
+            {
+                /* Checking for Zero Cols */
+                if((zero_cols & 1) == 1)
+                {
+                    for(j = 0; j < trans_size; j++)
+                        pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+                }
+                else
+                {
+                    for(j = 0; j < trans_size; j++)
+                    {
+                        WORD32 iquant_out;
+                        IQUANT_4x4(iquant_out,
+                                   pi2_src[j * src_strd],
+                                   pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                                   shift_iq, qp_div);
+
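+                        /* With the inverse transform skipped, a single         */
+                        /* (x + 16) >> 5 rounding shift stands in for the       */
+                        /* scaling the two transform stages would have applied  */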
+                        iquant_out = (iquant_out + 16) >> 5;
+                        pu1_dst[j * dst_strd] =
+                                        CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+                    }
+                }
+                pi2_src++;
+                pi2_dequant_coeff++;
+                pu1_pred++;
+                pu1_dst++;
+
+                zero_cols = zero_cols >> 1;
+            }
+        }
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization and  reconstruction for 4x4
+ * input block
+ *
+ * @par Description:
+ *  This function performs inverse quantization and  reconstruction for 4x4
+ * input block
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ *  Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] qp_div
+ *  Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ *  Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_recon_4x4(WORD16 *pi2_src,
+                            UWORD8 *pu1_pred,
+                            WORD16 *pi2_dequant_coeff,
+                            UWORD8 *pu1_dst,
+                            WORD32 qp_div, /* qpscaled / 6 */
+                            WORD32 qp_rem, /* qpscaled % 6 */
+                            WORD32 src_strd,
+                            WORD32 pred_strd,
+                            WORD32 dst_strd,
+                            WORD32 zero_cols)
+{
+
+    {
+        /* Inverse Quant and recon */
+        {
+            WORD32 i, j;
+            WORD32 shift_iq;
+            WORD32 trans_size;
+            /* Inverse Quantization constants */
+            {
+                WORD32 log2_trans_size, bit_depth;
+
+                log2_trans_size = 2;
+                bit_depth = 8 + 0;
+                shift_iq = bit_depth + log2_trans_size - 5;
+            }
+
+            trans_size = TRANS_SIZE_4;
+
+            for(i = 0; i < trans_size; i++)
+            {
+                /* Checking for Zero Cols */
+                if((zero_cols & 1) == 1)
+                {
+                    for(j = 0; j < trans_size; j++)
+                        pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+                }
+                else
+                {
+                    for(j = 0; j < trans_size; j++)
+                    {
+                        WORD32 iquant_out;
+                        IQUANT_4x4(iquant_out,
+                                   pi2_src[j * src_strd],
+                                   pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                                   shift_iq, qp_div);
+                        iquant_out = (iquant_out + 16) >> 5;
+                        pu1_dst[j * dst_strd] =
+                                        CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+                    }
+                }
+                pi2_src++;
+                pi2_dequant_coeff++;
+                pu1_pred++;
+                pu1_dst++;
+
+                zero_cols = zero_cols >> 1;
+            }
+        }
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization and  reconstruction for 8x8
+ * input block
+ *
+ * @par Description:
+ *  This function performs inverse quantization and  reconstruction for 8x8
+ * input block
+ *
+ * @param[in] pi2_src
+ *  Input 8x8 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 8x8 block
+ *
+ * @param[in] pi2_dequant_coeff
+ *  Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ *  Output 8x8 block
+ *
+ * @param[in] qp_div
+ *  Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ *  Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_recon_8x8(WORD16 *pi2_src,
+                            UWORD8 *pu1_pred,
+                            WORD16 *pi2_dequant_coeff,
+                            UWORD8 *pu1_dst,
+                            WORD32 qp_div, /* qpscaled / 6 */
+                            WORD32 qp_rem, /* qpscaled % 6 */
+                            WORD32 src_strd,
+                            WORD32 pred_strd,
+                            WORD32 dst_strd,
+                            WORD32 zero_cols)
+{
+
+    {
+        /* Inverse Quant and recon */
+        {
+            WORD32 i, j;
+            WORD32 shift_iq;
+            WORD32 trans_size;
+            /* Inverse Quantization constants */
+            {
+                WORD32 log2_trans_size, bit_depth;
+
+                log2_trans_size = 3;
+                bit_depth = 8 + 0;
+                shift_iq = bit_depth + log2_trans_size - 5;
+            }
+
+            trans_size = TRANS_SIZE_8;
+
+            for(i = 0; i < trans_size; i++)
+            {
+                /* Checking for Zero Cols */
+                if((zero_cols & 1) == 1)
+                {
+                    for(j = 0; j < trans_size; j++)
+                        pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+                }
+                else
+                {
+                    for(j = 0; j < trans_size; j++)
+                    {
+                        WORD32 iquant_out;
+                        IQUANT(iquant_out,
+                               pi2_src[j * src_strd],
+                               pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                               shift_iq, qp_div);
+                        iquant_out = (iquant_out + 16) >> 5;
+                        pu1_dst[j * dst_strd] =
+                                        CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+                    }
+                }
+                pi2_src++;
+                pi2_dequant_coeff++;
+                pu1_pred++;
+                pu1_dst++;
+
+                zero_cols = zero_cols >> 1;
+            }
+        }
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization and  reconstruction for 16x16
+ * input block
+ *
+ * @par Description:
+ *  This function performs inverse quantization and  reconstruction for 16x16
+ * input block
+ *
+ * @param[in] pi2_src
+ *  Input 16x16 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 16x16 block
+ *
+ * @param[in] pi2_dequant_coeff
+ *  Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ *  Output 16x16 block
+ *
+ * @param[in] qp_div
+ *  Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ *  Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_recon_16x16(WORD16 *pi2_src,
+                              UWORD8 *pu1_pred,
+                              WORD16 *pi2_dequant_coeff,
+                              UWORD8 *pu1_dst,
+                              WORD32 qp_div, /* qpscaled / 6 */
+                              WORD32 qp_rem, /* qpscaled % 6 */
+                              WORD32 src_strd,
+                              WORD32 pred_strd,
+                              WORD32 dst_strd,
+                              WORD32 zero_cols)
+{
+
+    {
+        /* Inverse Quant and recon */
+        {
+            WORD32 i, j;
+            WORD32 shift_iq;
+            WORD32 trans_size;
+            /* Inverse Quantization constants */
+            {
+                WORD32 log2_trans_size, bit_depth;
+
+                log2_trans_size = 4;
+                bit_depth = 8 + 0;
+                shift_iq = bit_depth + log2_trans_size - 5;
+            }
+
+            trans_size = TRANS_SIZE_16;
+
+            for(i = 0; i < trans_size; i++)
+            {
+                /* Checking for Zero Cols */
+                if((zero_cols & 1) == 1)
+                {
+                    for(j = 0; j < trans_size; j++)
+                        pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+                }
+                else
+                {
+                    for(j = 0; j < trans_size; j++)
+                    {
+                        WORD32 iquant_out;
+                        IQUANT(iquant_out,
+                               pi2_src[j * src_strd],
+                               pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                               shift_iq, qp_div);
+                        iquant_out = (iquant_out + 16) >> 5;
+                        pu1_dst[j * dst_strd] =
+                                        CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+                    }
+                }
+                pi2_src++;
+                pi2_dequant_coeff++;
+                pu1_pred++;
+                pu1_dst++;
+
+                zero_cols = zero_cols >> 1;
+            }
+        }
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization and  reconstruction for 32x32
+ * input block
+ *
+ * @par Description:
+ *  This function performs inverse quantization and  reconstruction for 32x32
+ * input block
+ *
+ * @param[in] pi2_src
+ *  Input 32x32 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 32x32 block
+ *
+ * @param[in] pi2_dequant_coeff
+ *  Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ *  Output 32x32 block
+ *
+ * @param[in] qp_div
+ *  Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ *  Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_recon_32x32(WORD16 *pi2_src,
+                              UWORD8 *pu1_pred,
+                              WORD16 *pi2_dequant_coeff,
+                              UWORD8 *pu1_dst,
+                              WORD32 qp_div, /* qpscaled / 6 */
+                              WORD32 qp_rem, /* qpscaled % 6 */
+                              WORD32 src_strd,
+                              WORD32 pred_strd,
+                              WORD32 dst_strd,
+                              WORD32 zero_cols)
+{
+
+    {
+        /* Inverse Quant and recon */
+        {
+            WORD32 i, j;
+            WORD32 shift_iq;
+            WORD32 trans_size;
+            /* Inverse Quantization constants */
+            {
+                WORD32 log2_trans_size, bit_depth;
+
+                log2_trans_size = 5;
+                bit_depth = 8 + 0;
+                shift_iq = bit_depth + log2_trans_size - 5;
+            }
+
+            trans_size = TRANS_SIZE_32;
+
+            for(i = 0; i < trans_size; i++)
+            {
+                /* Checking for Zero Cols */
+                if((zero_cols & 1) == 1)
+                {
+                    for(j = 0; j < trans_size; j++)
+                        pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+                }
+                else
+                {
+                    for(j = 0; j < trans_size; j++)
+                    {
+                        WORD32 iquant_out;
+                        IQUANT(iquant_out,
+                               pi2_src[j * src_strd],
+                               pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+                               shift_iq, qp_div);
+                        iquant_out = (iquant_out + 16) >> 5;
+                        pu1_dst[j * dst_strd] =
+                                        CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+                    }
+                }
+                pi2_src++;
+                pi2_dequant_coeff++;
+                pu1_pred++;
+                pu1_dst++;
+
+                zero_cols = zero_cols >> 1;
+            }
+        }
+    }
+}
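+
+/* Usage sketch (illustration only, not part of the library): dequantize and       */
+/* reconstruct one 4x4 block against a flat prediction. The flat scaling list,     */
+/* QP split and strides below are hypothetical values chosen for the example.      */
+#if 0
+static void example_iquant_recon_4x4(void)
+{
+    WORD16 ai2_src[4 * 4] = { 64 };              /* DC-only coefficient block     */
+    WORD16 ai2_dequant_coeff[4 * 4];
+    UWORD8 au1_pred[4 * 4], au1_dst[4 * 4];
+    WORD32 i;
+
+    for(i = 0; i < 4 * 4; i++)
+        ai2_dequant_coeff[i] = 16;               /* flat scaling list             */
+    memset(au1_pred, 128, sizeof(au1_pred));     /* mid-grey prediction           */
+
+    /* QP 30 split as qp_div = 30 / 6 = 5, qp_rem = 30 % 6 = 0; all three         */
+    /* strides are 4 samples; zero_cols = 0 means every column is processed       */
+    ihevc_iquant_recon_4x4(ai2_src, au1_pred, ai2_dequant_coeff, au1_dst,
+                           5, 0, 4, 4, 4, 0);
+}
+#endif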
+
diff --git a/common/ihevc_iquant_recon.h b/common/ihevc_iquant_recon.h
new file mode 100644
index 0000000..c732b04
--- /dev/null
+++ b/common/ihevc_iquant_recon.h
@@ -0,0 +1,154 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_iquant_recon.h
+*
+* @brief
+*  Functions declarations for inverse quantization and  reconstruction
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_IQUANT_RECON_H_
+#define _IHEVC_IQUANT_RECON_H_
+
+typedef void ihevc_iquant_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+                                              UWORD8 *pu1_pred,
+                                              WORD16 *pi2_dequant_coeff,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 qp_div, /* qpscaled / 6 */
+                                              WORD32 qp_rem, /* qpscaled % 6 */
+                                              WORD32 src_strd,
+                                              WORD32 pred_strd,
+                                              WORD32 dst_strd,
+                                              WORD32 zero_cols);
+typedef void ihevc_hbd_iquant_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+                                                  UWORD16 *pu2_pred,
+                                                  WORD16 *pi2_dequant_coeff,
+                                                  UWORD16 *pu2_dst,
+                                                  WORD32 qp_div, /* qpscaled / 6 */
+                                                  WORD32 qp_rem, /* qpscaled % 6 */
+                                                  WORD32 src_strd,
+                                                  WORD32 pred_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 zero_cols,
+                                                  UWORD8 bit_depth);
+typedef void ihevc_iquant_recon_4x4_ft(WORD16 *pi2_src,
+                                       UWORD8 *pu1_pred,
+                                       WORD16 *pi2_dequant_coeff,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 qp_div, /* qpscaled / 6 */
+                                       WORD32 qp_rem, /* qpscaled % 6 */
+                                       WORD32 src_strd,
+                                       WORD32 pred_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 zero_cols);
+typedef void ihevc_hbd_iquant_recon_4x4_ft(WORD16 *pi2_src,
+                                           UWORD16 *pu2_pred,
+                                           WORD16 *pi2_dequant_coeff,
+                                           UWORD16 *pu2_dst,
+                                           WORD32 qp_div, /* qpscaled / 6 */
+                                           WORD32 qp_rem, /* qpscaled % 6 */
+                                           WORD32 src_strd,
+                                           WORD32 pred_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 zero_cols,
+                                           UWORD8 bit_depth);
+typedef void ihevc_iquant_recon_8x8_ft(WORD16 *pi2_src,
+                                       UWORD8 *pu1_pred,
+                                       WORD16 *pi2_dequant_coeff,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 qp_div, /* qpscaled / 6 */
+                                       WORD32 qp_rem, /* qpscaled % 6 */
+                                       WORD32 src_strd,
+                                       WORD32 pred_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 zero_cols);
+typedef void ihevc_hbd_iquant_recon_8x8_ft(WORD16 *pi2_src,
+                                           UWORD16 *pu2_pred,
+                                           WORD16 *pi2_dequant_coeff,
+                                           UWORD16 *pu2_dst,
+                                           WORD32 qp_div, /* qpscaled / 6 */
+                                           WORD32 qp_rem, /* qpscaled % 6 */
+                                           WORD32 src_strd,
+                                           WORD32 pred_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 zero_cols,
+                                           UWORD8 bit_depth);
+typedef void ihevc_iquant_recon_16x16_ft(WORD16 *pi2_src,
+                                         UWORD8 *pu1_pred,
+                                         WORD16 *pi2_dequant_coeff,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 qp_div, /* qpscaled / 6 */
+                                         WORD32 qp_rem, /* qpscaled % 6 */
+                                         WORD32 src_strd,
+                                         WORD32 pred_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 zero_cols);
+typedef void ihevc_hbd_iquant_recon_16x16_ft(WORD16 *pi2_src,
+                                             UWORD16 *pu2_pred,
+                                             WORD16 *pi2_dequant_coeff,
+                                             UWORD16 *pu2_dst,
+                                             WORD32 qp_div, /* qpscaled / 6 */
+                                             WORD32 qp_rem, /* qpscaled % 6 */
+                                             WORD32 src_strd,
+                                             WORD32 pred_strd,
+                                             WORD32 dst_strd,
+                                             WORD32 zero_cols,
+                                             UWORD8 bit_depth);
+typedef void ihevc_iquant_recon_32x32_ft(WORD16 *pi2_src,
+                                         UWORD8 *pu1_pred,
+                                         WORD16 *pi2_dequant_coeff,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 qp_div, /* qpscaled / 6 */
+                                         WORD32 qp_rem, /* qpscaled % 6 */
+                                         WORD32 src_strd,
+                                         WORD32 pred_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 zero_cols);
+typedef void ihevc_hbd_iquant_recon_32x32_ft(WORD16 *pi2_src,
+                                             UWORD16 *pu2_pred,
+                                             WORD16 *pi2_dequant_coeff,
+                                             UWORD16 *pu2_dst,
+                                             WORD32 qp_div, /* qpscaled / 6 */
+                                             WORD32 qp_rem, /* qpscaled % 6 */
+                                             WORD32 src_strd,
+                                             WORD32 pred_strd,
+                                             WORD32 dst_strd,
+                                             WORD32 zero_cols,
+                                             UWORD8 bit_depth);
+
+ihevc_iquant_recon_4x4_ttype1_ft ihevc_iquant_recon_4x4_ttype1;
+ihevc_hbd_iquant_recon_4x4_ttype1_ft ihevc_hbd_iquant_recon_4x4_ttype1;
+ihevc_iquant_recon_4x4_ft ihevc_iquant_recon_4x4;
+ihevc_hbd_iquant_recon_4x4_ft ihevc_hbd_iquant_recon_4x4;
+ihevc_iquant_recon_8x8_ft ihevc_iquant_recon_8x8;
+ihevc_hbd_iquant_recon_8x8_ft ihevc_hbd_iquant_recon_8x8;
+ihevc_iquant_recon_16x16_ft ihevc_iquant_recon_16x16;
+ihevc_hbd_iquant_recon_16x16_ft ihevc_hbd_iquant_recon_16x16;
+ihevc_iquant_recon_32x32_ft ihevc_iquant_recon_32x32;
+ihevc_hbd_iquant_recon_32x32_ft ihevc_hbd_iquant_recon_32x32;
+
+#endif /*_IHEVC_IQUANT_RECON_H_*/
diff --git a/common/ihevc_itrans.c b/common/ihevc_itrans.c
new file mode 100644
index 0000000..741c2ab
--- /dev/null
+++ b/common/ihevc_itrans.c
@@ -0,0 +1,974 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_itrans.c
+ *
+ * @brief
+ *  Contains function definitions for single stage  inverse transform
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_itrans_4x4_ttype1()
+ *  - ihevc_itrans_4x4()
+ *  - ihevc_itrans_8x8()
+ *  - ihevc_itrans_16x16()
+ *  - ihevc_itrans_32x32()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+#define NON_OPTIMIZED 1
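+
+/* NON_OPTIMIZED selects the straightforward matrix-multiply reference paths       */
+/* below; the #else branches carry strength-reduced variants that trade            */
+/* multiplications for additions and shifts                                        */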
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs Single stage  Inverse transform type 1 (DST) for
+ * 4x4 input block
+ *
+ * @par Description:
+ *  Performs the single-stage 4x4 inverse transform type 1 by utilizing the
+ * symmetry of the transformation matrix and reducing the number of
+ * multiplications wherever possible, while keeping the number of operations
+ * (addition, multiplication and shift) the same
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[out] pi2_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] i4_shift
+ *  Output shift
+ *
+ * @param[in] zero_cols
+ *  Bitmask of zero columns in pi2_src; a set bit i means column i is all zeros
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_itrans_4x4_ttype1(WORD16 *pi2_src,
+                             WORD16 *pi2_dst,
+                             WORD32 src_strd,
+                             WORD32 dst_strd,
+                             WORD32 i4_shift,
+                             WORD32 zero_cols)
+{
+    WORD32 i, c[4];
+    WORD32 add;
+
+    add = 1 << (i4_shift - 1);
+
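+    /***************************************************************************/
+    /* Inverse DST (transform type 1) matrix 4x4, per the HEVC spec; the       */
+    /* factored c[] terms below apply its transpose column by column           */
+    /*      0   1   2   3                                                      */
+    /* 0 { 29, 55, 74, 84},                                                    */
+    /* 1 { 74, 74,  0,-74},                                                    */
+    /* 2 { 84,-29,-74, 55},                                                    */
+    /* 3 { 55,-84, 74,-29}                                                     */
+    /***************************************************************************/
+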
+    for(i = 0; i < TRANS_SIZE_4; i++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
+        }
+        else
+        {
+            /* Intermediate variables */
+            c[0] = pi2_src[0] + pi2_src[2 * src_strd];
+            c[1] = pi2_src[2 * src_strd] + pi2_src[3 * src_strd];
+            c[2] = pi2_src[0] - pi2_src[3 * src_strd];
+            c[3] = 74 * pi2_src[src_strd];
+
+            pi2_dst[0] =
+                            CLIP_S16((29 * c[0] + 55 * c[1] + c[3] + add) >> i4_shift);
+            pi2_dst[1] =
+                            CLIP_S16((55 * c[2] - 29 * c[1] + c[3] + add) >> i4_shift);
+            pi2_dst[2] =
+                            CLIP_S16((74 * (pi2_src[0] - pi2_src[2 * src_strd] + pi2_src[3 * src_strd]) + add) >> i4_shift);
+            pi2_dst[3] =
+                            CLIP_S16((55 * c[0] + 29 * c[2] - c[3] + add) >> i4_shift);
+        }
+        pi2_src++;
+        pi2_dst += dst_strd;
+        zero_cols = zero_cols >> 1;
+    }
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs Single stage  Inverse transform for 4x4 input
+ * block
+ *
+ * @par Description:
+ *  Performs the single-stage 4x4 inverse transform by utilizing the symmetry
+ * of the transformation matrix and reducing the number of multiplications
+ * wherever possible, while keeping the number of operations (addition,
+ * multiplication and shift) the same
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[out] pi2_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] i4_shift
+ *  Output shift
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#if NON_OPTIMIZED
+void ihevc_itrans_4x4(WORD16 *pi2_src,
+                      WORD16 *pi2_dst,
+                      WORD32 src_strd,
+                      WORD32 dst_strd,
+                      WORD32 i4_shift,
+                      WORD32 zero_cols)
+{
+    WORD32 j;
+    WORD32 e[2], o[2];
+    WORD32 add;
+
+    add = 1 << (i4_shift - 1);
+
+    for(j = 0; j < TRANS_SIZE_4; j++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
+        }
+        else
+        {
+
+            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+            o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_src[src_strd]
+                            + g_ai2_ihevc_trans_4[3][0] * pi2_src[3 * src_strd];
+            o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_src[src_strd]
+                            + g_ai2_ihevc_trans_4[3][1] * pi2_src[3 * src_strd];
+            e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_src[0]
+                            + g_ai2_ihevc_trans_4[2][0] * pi2_src[2 * src_strd];
+            e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_src[0]
+                            + g_ai2_ihevc_trans_4[2][1] * pi2_src[2 * src_strd];
+
+            pi2_dst[0] =
+                            CLIP_S16(((e[0] + o[0] + add) >> i4_shift));
+            pi2_dst[1] =
+                            CLIP_S16(((e[1] + o[1] + add) >> i4_shift));
+            pi2_dst[2] =
+                            CLIP_S16(((e[1] - o[1] + add) >> i4_shift));
+            pi2_dst[3] =
+                            CLIP_S16(((e[0] - o[0] + add) >> i4_shift));
+
+        }
+        pi2_src++;
+        pi2_dst += dst_strd;
+        zero_cols = zero_cols >> 1;
+    }
+}
+#else
+void ihevc_itrans_4x4(WORD16 *pi2_src,
+                      WORD16 *pi2_dst,
+                      WORD32 src_strd,
+                      WORD32 dst_strd,
+                      WORD32 i4_shift,
+                      WORD32 zero_cols)
+{
+    WORD32 j;
+    WORD32 e[2], o[2];
+    WORD32 add;
+
+    add = 1 << (i4_shift - 1);
+
+    /***************************************************************************/
+    /* Transform Matrix 4x4                                                    */
+    /*      0   1   2   3                                                      */
+    /* 0 { 64, 64, 64, 64},                                                    */
+    /* 1 { 83, 36,-36,-83},                                                    */
+    /* 2 { 64,-64,-64, 64},                                                    */
+    /* 3 { 36,-83, 83,-36}                                                     */
+    /***************************************************************************/
+
+    for(j = 0; j < TRANS_SIZE_4; j++)
+    {
+        WORD32 temp;
+
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
+        }
+        else
+        {
+            /* Common operation in o[0] and o[1] */
+            temp = (pi2_src[src_strd] + pi2_src[3 * src_strd]) * 36;
+
+            o[0] = temp + 47 * pi2_src[src_strd];
+            o[1] = temp - 119 * pi2_src[3 * src_strd];
+            e[0] = (pi2_src[0] + pi2_src[2 * src_strd]) << 6;
+            e[1] = (pi2_src[0] - pi2_src[2 * src_strd]) << 6;
+
+            pi2_dst[0] =
+                            CLIP_S16(((e[0] + o[0] + add) >> i4_shift));
+            pi2_dst[1] =
+                            CLIP_S16(((e[1] + o[1] + add) >> i4_shift));
+            pi2_dst[2] =
+                            CLIP_S16(((e[1] - o[1] + add) >> i4_shift));
+            pi2_dst[3] =
+                            CLIP_S16(((e[0] - o[0] + add) >> i4_shift));
+        }
+        pi2_src++;
+        pi2_dst += dst_strd;
+        zero_cols = zero_cols >> 1;
+    }
+}
+#endif
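+
+/* Usage sketch (illustration only, not part of the library): run the first        */
+/* (column) pass of the 4x4 inverse DCT on a DC-only block. The shift of 7 for     */
+/* the first stage at 8-bit depth and the zero_cols mask are example values.       */
+#if 0
+static void example_itrans_4x4(void)
+{
+    WORD16 ai2_coeff[4 * 4] = { 64 };   /* only the DC coefficient is non-zero    */
+    WORD16 ai2_tmp[4 * 4];
+
+    /* zero_cols = 0xE flags columns 1..3 as all-zero, so only column 0 is        */
+    /* actually transformed and the rest are memset to zero                       */
+    ihevc_itrans_4x4(ai2_coeff, ai2_tmp, 4, 4, 7, 0xE);
+}
+#endif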
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs Single stage  Inverse transform for 8x8 input
+ * block
+ *
+ * @par Description:
+ *  Performs the single-stage 8x8 inverse transform by utilizing the symmetry
+ * of the transformation matrix and reducing the number of multiplications
+ * wherever possible, while keeping the number of operations (addition,
+ * multiplication and shift) the same
+ *
+ * @param[in] pi2_src
+ *  Input 8x8 coefficients
+ *
+ * @param[out] pi2_dst
+ *  Output 8x8 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] i4_shift
+ *  Output shift
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#if NON_OPTIMIZED
+void ihevc_itrans_8x8(WORD16 *pi2_src,
+                      WORD16 *pi2_dst,
+                      WORD32 src_strd,
+                      WORD32 dst_strd,
+                      WORD32 i4_shift,
+                      WORD32 zero_cols)
+{
+    WORD32 j, k;
+    WORD32 e[4], o[4];
+    WORD32 ee[2], eo[2];
+    WORD32 add;
+
+    add = 1 << (i4_shift - 1);
+
+    for(j = 0; j < TRANS_SIZE_8; j++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            memset(pi2_dst, 0, TRANS_SIZE_8 * sizeof(WORD16));
+        }
+        else
+        {
+            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
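+            /* Even basis rows (0,2,4,6) are symmetric and odd rows (1,3,5,7)    */
+            /* antisymmetric about the centre, so each output pair costs one     */
+            /* butterfly: dst[k] = e[k] + o[k] and dst[7 - k] = e[k] - o[k]      */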
+            for(k = 0; k < 4; k++)
+            {
+                o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_src[src_strd]
+                                + g_ai2_ihevc_trans_8[3][k]
+                                                * pi2_src[3 * src_strd]
+                                + g_ai2_ihevc_trans_8[5][k]
+                                                * pi2_src[5 * src_strd]
+                                + g_ai2_ihevc_trans_8[7][k]
+                                                * pi2_src[7 * src_strd];
+            }
+
+            eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd]
+                            + g_ai2_ihevc_trans_8[6][0] * pi2_src[6 * src_strd];
+            eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd]
+                            + g_ai2_ihevc_trans_8[6][1] * pi2_src[6 * src_strd];
+            ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_src[0]
+                            + g_ai2_ihevc_trans_8[4][0] * pi2_src[4 * src_strd];
+            ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_src[0]
+                            + g_ai2_ihevc_trans_8[4][1] * pi2_src[4 * src_strd];
+
+            /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+            e[0] = ee[0] + eo[0];
+            e[3] = ee[0] - eo[0];
+            e[1] = ee[1] + eo[1];
+            e[2] = ee[1] - eo[1];
+            for(k = 0; k < 4; k++)
+            {
+                pi2_dst[k] =
+                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
+                pi2_dst[k + 4] =
+                                CLIP_S16(((e[3 - k] - o[3 - k] + add) >> i4_shift));
+            }
+        }
+        pi2_src++;
+        pi2_dst += dst_strd;
+        zero_cols = zero_cols >> 1;
+    }
+}
+
+#else
+void ihevc_itrans_8x8(WORD16 *pi2_src,
+                      WORD16 *pi2_dst,
+                      WORD32 src_strd,
+                      WORD32 dst_strd,
+                      WORD32 i4_shift,
+                      WORD32 zero_cols)
+{
+    /* Transform Matrix 8x8                          */
+    /*              0    1    2   3   4   5   6   7  */
+    /*     0 -      64   64   64  64  64  64  64  64 */
+    /*     1 -      89   75   50  18 -18 -50 -75 -89 */
+    /*     2 -      83   36  -36 -83 -83 -36  36  83 */
+    /*     3 -      75  -18  -89 -50  50  89  18 -75 */
+    /*     4 -      64  -64  -64  64  64 -64 -64  64 */
+    /*     5 -      50  -89   18  75 -75 -18  89 -50 */
+    /*     6 -      36  -83   83 -36 -36  83 -83  36 */
+    /*     7 -      18  -50   75 -89  89 -75  50 -18 */
+
+    /* The 0th and 4th rows need no multiplications */
+    /* The 2nd and 6th rows need only two coefficient multiplications */
+    /* The 1st, 3rd, 5th and 7th rows share the odd-part (o[]) mirror symmetry */
+    WORD32 j, k;
+    WORD32 temp1, temp2;
+    WORD32 e[4], o[4];
+    WORD32 ee[2], eo[2];
+    WORD32 add;
+
+    add = 1 << (i4_shift - 1);
+
+    for(j = 0; j < TRANS_SIZE_8; j++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            memset(pi2_dst, 0, TRANS_SIZE_8 * sizeof(WORD16));
+        }
+        else
+        {
+
+            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+            /*
+             o[0] = 89 *pi2_src[8] +  75 *pi2_src[3*8] +  50 *pi2_src[5*8] +  18 *pi2_src[7*8];
+             o[1] = 75 *pi2_src[8] + -18 *pi2_src[3*8] + -89 *pi2_src[5*8] + -50 *pi2_src[7*8];
+             o[2] = 50 *pi2_src[8] + -89 *pi2_src[3*8] +  18 *pi2_src[5*8] +  75 *pi2_src[7*8];
+             o[3] = 18 *pi2_src[8] + -50 *pi2_src[3*8] +  75 *pi2_src[5*8] + -89 *pi2_src[7*8];
+             */
+
+            /* Optimization: 4 mul + 2 add  ---> 3 mul + 3 add */
+            /*
+             temp1 = (pi2_src[8  ] + pi2_src[3*8]) * 75;
+             temp2 = (pi2_src[5*8] + pi2_src[7*8]) * 50;
+
+             o[0] = temp1 + 14 * pi2_src[8  ] + temp2 - 32 * pi2_src[7*8];
+             o[1] = temp1 - 93 * pi2_src[3*8] - temp2 - 39 * pi2_src[5*8];
+             */
+
+            temp1 = (pi2_src[src_strd] + pi2_src[3 * src_strd]) * 75;
+            temp2 = (pi2_src[5 * src_strd] + pi2_src[7 * src_strd]) * 50;
+
+            o[0] = temp1 + 14 * pi2_src[src_strd] + temp2
+                            - (pi2_src[7 * src_strd] << 5);
+            o[1] = temp1 - 93 * pi2_src[3 * src_strd] - temp2
+                            - 39 * pi2_src[5 * src_strd];
+
+            /* Optimization: 4 mul + 2 add  ---> 3 mul + 3 add */
+            /*
+             temp1 = (pi2_src[8  ] - pi2_src[3*8]) * 50;
+             temp2 = (pi2_src[5*8] + pi2_src[7*8]) * 75;
+
+             o[2] = temp1 - 39 * pi2_src[3*8] + temp2 -  57 * pi2_src[5*8];
+             o[3] = temp1 - 32 * pi2_src[8  ] + temp2 - 164 * pi2_src[7*8];
+             */
+
+            temp1 = (pi2_src[src_strd] - pi2_src[3 * src_strd]) * 50;
+            temp2 = (pi2_src[5 * src_strd] + pi2_src[7 * src_strd]) * 75;
+
+            o[2] = temp1 - 39 * pi2_src[3 * src_strd] + temp2
+                            - 57 * pi2_src[5 * src_strd];
+            o[3] = temp1 - (pi2_src[src_strd] << 5) + temp2
+                            - 164 * pi2_src[7 * src_strd];
+
+            /*
+             eo[0] = 83 *pi2_src[ 2*8 ] +  36 *pi2_src[ 6*8 ];
+             eo[1] = 36 *pi2_src[ 2*8 ] + -83 *pi2_src[ 6*8 ];
+             ee[0] = 64 *pi2_src[ 0   ] +  64 *pi2_src[ 4*8 ];
+             ee[1] = 64 *pi2_src[ 0   ] + -64 *pi2_src[ 4*8 ];
+             */
+
+            /* Optimization: 4 mul + 2 add  ---> 3 mul + 3 add */
+            temp1 = (pi2_src[2 * src_strd] + pi2_src[6 * src_strd]) * 36;
+            eo[0] = temp1 + 47 * pi2_src[2 * src_strd];
+            eo[1] = temp1 - 119 * pi2_src[6 * src_strd];
+
+            /* Optimization: 4 mul + 2 add  ---> 2 i4_shift + 2 add */
+            ee[0] = (pi2_src[0] + pi2_src[4 * src_strd]) << 6;
+            ee[1] = (pi2_src[0] - pi2_src[4 * src_strd]) << 6;
+
+            e[0] = ee[0] + eo[0];
+            e[3] = ee[0] - eo[0];
+            e[1] = ee[1] + eo[1];
+            e[2] = ee[1] - eo[1];
+
+            for(k = 0; k < 4; k++)
+            {
+                pi2_dst[k] =
+                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
+                pi2_dst[k + 4] =
+                                CLIP_S16(((e[3 - k] - o[3 - k] + add) >> i4_shift));
+            }
+        }
+        pi2_src++;
+        pi2_dst += dst_strd;
+        zero_cols = zero_cols >> 1;
+    }
+
+}
+#endif
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs Single stage  Inverse transform for 16x16 input
+ * block
+ *
+ * @par Description:
+ *  Performs the single-stage 16x16 inverse transform by utilizing the
+ * symmetry of the transformation matrix and reducing the number of
+ * multiplications wherever possible, while keeping the number of operations
+ * (addition, multiplication and shift) the same
+ *
+ * @param[in] pi2_src
+ *  Input 16x16 coefficients
+ *
+ * @param[out] pi2_dst
+ *  Output 16x16 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] i4_shift
+ *  Output shift
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#if NON_OPTIMIZED
+void ihevc_itrans_16x16(WORD16 *pi2_src,
+                        WORD16 *pi2_dst,
+                        WORD32 src_strd,
+                        WORD32 dst_strd,
+                        WORD32 i4_shift,
+                        WORD32 zero_cols)
+{
+    WORD32 j, k;
+    WORD32 e[8], o[8];
+    WORD32 ee[4], eo[4];
+    WORD32 eee[2], eeo[2];
+    WORD32 add;
+
+    add = 1 << (i4_shift - 1);
+
+    for(j = 0; j < TRANS_SIZE_16; j++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            memset(pi2_dst, 0, TRANS_SIZE_16 * sizeof(WORD16));
+        }
+        else
+        {
+            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
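+            /* The even/odd split is applied recursively: eee/eeo combine into   */
+            /* ee, ee/eo combine into e, and e/o produce the 16 outputs          */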
+            for(k = 0; k < 8; k++)
+            {
+                o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+                                + g_ai2_ihevc_trans_16[3][k]
+                                                * pi2_src[3 * src_strd]
+                                + g_ai2_ihevc_trans_16[5][k]
+                                                * pi2_src[5 * src_strd]
+                                + g_ai2_ihevc_trans_16[7][k]
+                                                * pi2_src[7 * src_strd]
+                                + g_ai2_ihevc_trans_16[9][k]
+                                                * pi2_src[9 * src_strd]
+                                + g_ai2_ihevc_trans_16[11][k]
+                                                * pi2_src[11 * src_strd]
+                                + g_ai2_ihevc_trans_16[13][k]
+                                                * pi2_src[13 * src_strd]
+                                + g_ai2_ihevc_trans_16[15][k]
+                                                * pi2_src[15 * src_strd];
+            }
+            for(k = 0; k < 4; k++)
+            {
+                eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
+                                + g_ai2_ihevc_trans_16[6][k]
+                                                * pi2_src[6 * src_strd]
+                                + g_ai2_ihevc_trans_16[10][k]
+                                                * pi2_src[10 * src_strd]
+                                + g_ai2_ihevc_trans_16[14][k]
+                                                * pi2_src[14 * src_strd];
+            }
+            eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
+                            + g_ai2_ihevc_trans_16[12][0] * pi2_src[12 * src_strd];
+            eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
+                            + g_ai2_ihevc_trans_16[8][0] * pi2_src[8 * src_strd];
+            eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
+                            + g_ai2_ihevc_trans_16[12][1] * pi2_src[12 * src_strd];
+            eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
+                            + g_ai2_ihevc_trans_16[8][1] * pi2_src[8 * src_strd];
+
+            /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+            for(k = 0; k < 2; k++)
+            {
+                ee[k] = eee[k] + eeo[k];
+                ee[k + 2] = eee[1 - k] - eeo[1 - k];
+            }
+            for(k = 0; k < 4; k++)
+            {
+                e[k] = ee[k] + eo[k];
+                e[k + 4] = ee[3 - k] - eo[3 - k];
+            }
+            for(k = 0; k < 8; k++)
+            {
+                pi2_dst[k] =
+                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
+                pi2_dst[k + 8] =
+                                CLIP_S16(((e[7 - k] - o[7 - k] + add) >> i4_shift));
+            }
+        }
+        pi2_src++;
+        pi2_dst += dst_strd;
+        zero_cols = zero_cols >> 1;
+    }
+}
+#else
+void ihevc_itrans_16x16(WORD16 *pi2_src,
+                        WORD16 *pi2_dst,
+                        WORD32 src_strd,
+                        WORD32 dst_strd,
+                        WORD32 i4_shift,
+                        WORD32 zero_cols)
+{
+    WORD32 j, k;
+    WORD32 e[8], o[8];
+    WORD32 ee[4], eo[4];
+    WORD32 eee[2], eeo[2];
+    WORD32 add;
+    WORD32 temp1, temp2;
+
+    add = 1 << (i4_shift - 1);
+    /***************************************************************************/
+    /* Transform Matrix 16x16                                                  */
+    /*       0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15     */
+    /* 0  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},   */
+    /* 1  { 90, 87, 80, 70, 57, 43, 25,  9, -9,-25,-43,-57,-70,-80,-87,-90},   */
+    /* 2  { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89},   */
+    /* 3  { 87, 57,  9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87},   */
+    /* 4  { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83},   */
+    /* 5  { 80,  9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80},   */
+    /* 6  { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75},   */
+    /* 7  { 70,-43,-87,  9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70},   */
+    /* 8  { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64},   */
+    /* 9  { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87,  9,-90, 25, 80,-57},   */
+    /* 10 { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50},   */
+    /* 11 { 43,-90, 57, 25,-87, 70,  9,-80, 80, -9,-70, 87,-25,-57, 90,-43},   */
+    /* 12 { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36},   */
+    /* 13 { 25,-70, 90,-80, 43,  9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25},   */
+    /* 14 { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18},   */
+    /* 15 {  9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9}    */
+    /***************************************************************************/
+
+    for(j = 0; j < TRANS_SIZE_16; j++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            memset(pi2_dst, 0, TRANS_SIZE_16 * sizeof(WORD16));
+        }
+        else
+        {
+            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+            {
+                /*
+                 o[k] = g_ai2_ihevc_trans_16[ 1][k]*pi2_src[ src_strd   ] + g_ai2_ihevc_trans_16[ 3][k]*pi2_src[ 3*src_strd   ] + g_ai2_ihevc_trans_16[ 5][k]*pi2_src[ 5*src_strd   ] + g_ai2_ihevc_trans_16[ 7][k]*pi2_src[ 7*src_strd   ] +
+                 g_ai2_ihevc_trans_16[ 9][k]*pi2_src[ 9*src_strd   ] + g_ai2_ihevc_trans_16[11][k]*pi2_src[11*src_strd   ] + g_ai2_ihevc_trans_16[13][k]*pi2_src[13*src_strd   ] + g_ai2_ihevc_trans_16[15][k]*pi2_src[15*src_strd   ];
+                 */
+
+                o[0] = 90 * pi2_src[src_strd] + 87 * pi2_src[3 * src_strd]
+                                + 80 * pi2_src[5 * src_strd]
+                                + 70 * pi2_src[7 * src_strd]
+                                + 57 * pi2_src[9 * src_strd]
+                                + 43 * pi2_src[11 * src_strd]
+                                + 25 * pi2_src[13 * src_strd]
+                                + 9 * pi2_src[15 * src_strd];
+
+                o[1] = 87 * pi2_src[src_strd] + 57 * pi2_src[3 * src_strd]
+                                + 9 * pi2_src[5 * src_strd]
+                                + -43 * pi2_src[7 * src_strd]
+                                + -80 * pi2_src[9 * src_strd]
+                                + -90 * pi2_src[11 * src_strd]
+                                + -70 * pi2_src[13 * src_strd]
+                                + -25 * pi2_src[15 * src_strd];
+
+                o[2] = 80 * pi2_src[src_strd] + 9 * pi2_src[3 * src_strd]
+                                + -70 * pi2_src[5 * src_strd]
+                                + -87 * pi2_src[7 * src_strd]
+                                + -25 * pi2_src[9 * src_strd]
+                                + 57 * pi2_src[11 * src_strd]
+                                + 90 * pi2_src[13 * src_strd]
+                                + 43 * pi2_src[15 * src_strd];
+
+                o[3] = 70 * pi2_src[src_strd] + -43 * pi2_src[3 * src_strd]
+                                + -87 * pi2_src[5 * src_strd]
+                                + 9 * pi2_src[7 * src_strd]
+                                + 90 * pi2_src[9 * src_strd]
+                                + 25 * pi2_src[11 * src_strd]
+                                + -80 * pi2_src[13 * src_strd]
+                                + -57 * pi2_src[15 * src_strd];
+
+                o[4] = 57 * pi2_src[src_strd] + -80 * pi2_src[3 * src_strd]
+                                + -25 * pi2_src[5 * src_strd]
+                                + 90 * pi2_src[7 * src_strd]
+                                + -9 * pi2_src[9 * src_strd]
+                                + -87 * pi2_src[11 * src_strd]
+                                + 43 * pi2_src[13 * src_strd]
+                                + 70 * pi2_src[15 * src_strd];
+
+                o[5] = 43 * pi2_src[src_strd] + -90 * pi2_src[3 * src_strd]
+                                + 57 * pi2_src[5 * src_strd]
+                                + 25 * pi2_src[7 * src_strd]
+                                + -87 * pi2_src[9 * src_strd]
+                                + 70 * pi2_src[11 * src_strd]
+                                + 9 * pi2_src[13 * src_strd]
+                                + -80 * pi2_src[15 * src_strd];
+
+                o[6] = 25 * pi2_src[src_strd] + -70 * pi2_src[3 * src_strd]
+                                + 90 * pi2_src[5 * src_strd]
+                                + -80 * pi2_src[7 * src_strd]
+                                + 43 * pi2_src[9 * src_strd]
+                                + 9 * pi2_src[11 * src_strd]
+                                + -57 * pi2_src[13 * src_strd]
+                                + 87 * pi2_src[15 * src_strd];
+
+                o[7] = 9 * pi2_src[src_strd] + -25 * pi2_src[3 * src_strd]
+                                + 43 * pi2_src[5 * src_strd]
+                                + -57 * pi2_src[7 * src_strd]
+                                + 70 * pi2_src[9 * src_strd]
+                                + -80 * pi2_src[11 * src_strd]
+                                + 87 * pi2_src[13 * src_strd]
+                                + -90 * pi2_src[15 * src_strd];
+            }
+            {
+                temp1 = (pi2_src[2 * src_strd] + pi2_src[6 * src_strd]) * 75;
+                temp2 = (pi2_src[10 * src_strd] + pi2_src[14 * src_strd]) * 50;
+                eo[0] = temp1 + 14 * pi2_src[2 * src_strd] + temp2
+                                - (pi2_src[14 * src_strd] << 5);
+                eo[1] = temp1 - 93 * pi2_src[6 * src_strd] - temp2
+                                - 39 * pi2_src[10 * src_strd];
+
+                temp1 = (pi2_src[2 * src_strd] - pi2_src[6 * src_strd]) * 50;
+                temp2 = (pi2_src[10 * src_strd] + pi2_src[14 * src_strd]) * 75;
+                eo[2] = temp1 - 39 * pi2_src[6 * src_strd] + temp2
+                                - 57 * pi2_src[10 * src_strd];
+                eo[3] = temp1 - (pi2_src[2 * src_strd] << 5) + temp2
+                                - 164 * pi2_src[14 * src_strd];
+            }
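+            /* The eo[] block above is a strength-reduced form of the
+             * row 2/6/10/14 dot products, sharing partial products and
+             * replacing *32 by a shift; e.g.
+             * eo[0] = 89*s2 + 75*s6 + 50*s10 + 18*s14
+             *       = 75*(s2 + s6) + 14*s2 + 50*(s10 + s14) - 32*s14,
+             * where sN = pi2_src[N * src_strd]. */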
+
+            temp1 = (pi2_src[4 * src_strd] + pi2_src[12 * src_strd]) * 36;
+            eeo[0] = temp1 + 47 * pi2_src[4 * src_strd];
+            eeo[1] = temp1 - 119 * pi2_src[12 * src_strd];
+
+            eee[0] = (pi2_src[0] + pi2_src[8 * src_strd]) << 6;
+            eee[1] = (pi2_src[0] - pi2_src[8 * src_strd]) << 6;
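+            /* eee[] is the 2-point even-even-even stage (matrix rows 0
+             * and 8); their factor of 64 is applied as << 6. */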
+
+            /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+            for(k = 0; k < 2; k++)
+            {
+                ee[k] = eee[k] + eeo[k];
+                ee[k + 2] = eee[1 - k] - eeo[1 - k];
+            }
+            for(k = 0; k < 4; k++)
+            {
+                e[k] = ee[k] + eo[k];
+                e[k + 4] = ee[3 - k] - eo[3 - k];
+            }
+            for(k = 0; k < 8; k++)
+            {
+                pi2_dst[k] =
+                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
+                pi2_dst[k + 8] =
+                                CLIP_S16(((e[7 - k] - o[7 - k] + add) >> i4_shift));
+            }
+        }
+        pi2_src++;
+        pi2_dst += dst_strd;
+        zero_cols = zero_cols >> 1;
+    }
+}
+#endif
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs a single-stage inverse transform for a 32x32
+ * input block
+ *
+ * @par Description:
+ *  Performs a single-stage 32x32 inverse transform by utilizing the
+ * symmetry of the transformation matrix and reducing the number of
+ * multiplications wherever possible, while keeping the total number of
+ * operations (additions, multiplications and shifts) the same
+ *
+ * @param[in] pi2_src
+ *  Input 32x32 coefficients
+ *
+ * @param[out] pi2_dst
+ *  Output 32x32 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] i4_shift
+ *  Output shift
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_itrans_32x32(WORD16 *pi2_src,
+                        WORD16 *pi2_dst,
+                        WORD32 src_strd,
+                        WORD32 dst_strd,
+                        WORD32 i4_shift,
+                        WORD32 zero_cols)
+{
+    WORD32 j, k;
+    WORD32 e[16], o[16];
+    WORD32 ee[8], eo[8];
+    WORD32 eee[4], eeo[4];
+    WORD32 eeee[2], eeeo[2];
+    WORD32 add;
+
+    add = 1 << (i4_shift - 1);
+
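+    /* zero_cols holds one bit per input column; a set bit means that
+     * column of pi2_src is entirely zero, so the corresponding output
+     * row is simply cleared below instead of being computed. */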
+    for(j = 0; j < TRANS_SIZE_32; j++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            memset(pi2_dst, 0, TRANS_SIZE_32 * sizeof(WORD16));
+        }
+        else
+        {
+            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+            for(k = 0; k < 16; k++)
+            {
+                o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
+                                + g_ai2_ihevc_trans_32[3][k]
+                                                * pi2_src[3 * src_strd]
+                                + g_ai2_ihevc_trans_32[5][k]
+                                                * pi2_src[5 * src_strd]
+                                + g_ai2_ihevc_trans_32[7][k]
+                                                * pi2_src[7 * src_strd]
+                                + g_ai2_ihevc_trans_32[9][k]
+                                                * pi2_src[9 * src_strd]
+                                + g_ai2_ihevc_trans_32[11][k]
+                                                * pi2_src[11 * src_strd]
+                                + g_ai2_ihevc_trans_32[13][k]
+                                                * pi2_src[13 * src_strd]
+                                + g_ai2_ihevc_trans_32[15][k]
+                                                * pi2_src[15 * src_strd]
+                                + g_ai2_ihevc_trans_32[17][k]
+                                                * pi2_src[17 * src_strd]
+                                + g_ai2_ihevc_trans_32[19][k]
+                                                * pi2_src[19 * src_strd]
+                                + g_ai2_ihevc_trans_32[21][k]
+                                                * pi2_src[21 * src_strd]
+                                + g_ai2_ihevc_trans_32[23][k]
+                                                * pi2_src[23 * src_strd]
+                                + g_ai2_ihevc_trans_32[25][k]
+                                                * pi2_src[25 * src_strd]
+                                + g_ai2_ihevc_trans_32[27][k]
+                                                * pi2_src[27 * src_strd]
+                                + g_ai2_ihevc_trans_32[29][k]
+                                                * pi2_src[29 * src_strd]
+                                + g_ai2_ihevc_trans_32[31][k]
+                                                * pi2_src[31 * src_strd];
+            }
+            for(k = 0; k < 8; k++)
+            {
+                eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
+                                + g_ai2_ihevc_trans_32[6][k]
+                                                * pi2_src[6 * src_strd]
+                                + g_ai2_ihevc_trans_32[10][k]
+                                                * pi2_src[10 * src_strd]
+                                + g_ai2_ihevc_trans_32[14][k]
+                                                * pi2_src[14 * src_strd]
+                                + g_ai2_ihevc_trans_32[18][k]
+                                                * pi2_src[18 * src_strd]
+                                + g_ai2_ihevc_trans_32[22][k]
+                                                * pi2_src[22 * src_strd]
+                                + g_ai2_ihevc_trans_32[26][k]
+                                                * pi2_src[26 * src_strd]
+                                + g_ai2_ihevc_trans_32[30][k]
+                                                * pi2_src[30 * src_strd];
+            }
+            for(k = 0; k < 4; k++)
+            {
+                eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]
+                                + g_ai2_ihevc_trans_32[12][k]
+                                                * pi2_src[12 * src_strd]
+                                + g_ai2_ihevc_trans_32[20][k]
+                                                * pi2_src[20 * src_strd]
+                                + g_ai2_ihevc_trans_32[28][k]
+                                                * pi2_src[28 * src_strd];
+            }
+            eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd]
+                            + g_ai2_ihevc_trans_32[24][0]
+                                            * pi2_src[24 * src_strd];
+            eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd]
+                            + g_ai2_ihevc_trans_32[24][1]
+                                            * pi2_src[24 * src_strd];
+            eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]
+                            + g_ai2_ihevc_trans_32[16][0]
+                                            * pi2_src[16 * src_strd];
+            eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]
+                            + g_ai2_ihevc_trans_32[16][1]
+                                            * pi2_src[16 * src_strd];
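+            /* eeee[]/eeeo[] are the final 2-point stage (matrix rows
+             * 0, 16 and 8, 24); the code below then recombines the
+             * stages bottom-up: eeee/eeeo -> eee -> ee -> e -> the 32
+             * output samples. */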
+
+            /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+            eee[0] = eeee[0] + eeeo[0];
+            eee[3] = eeee[0] - eeeo[0];
+            eee[1] = eeee[1] + eeeo[1];
+            eee[2] = eeee[1] - eeeo[1];
+            for(k = 0; k < 4; k++)
+            {
+                ee[k] = eee[k] + eeo[k];
+                ee[k + 4] = eee[3 - k] - eeo[3 - k];
+            }
+            for(k = 0; k < 8; k++)
+            {
+                e[k] = ee[k] + eo[k];
+                e[k + 8] = ee[7 - k] - eo[7 - k];
+            }
+            for(k = 0; k < 16; k++)
+            {
+                pi2_dst[k] =
+                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
+                pi2_dst[k + 16] =
+                                CLIP_S16(((e[15 - k] - o[15 - k] + add) >> i4_shift));
+            }
+        }
+        pi2_src++;
+        pi2_dst += dst_strd;
+        zero_cols = zero_cols >> 1;
+    }
+}
+
diff --git a/common/ihevc_itrans.h b/common/ihevc_itrans.h
new file mode 100644
index 0000000..38a38a5
--- /dev/null
+++ b/common/ihevc_itrans.h
@@ -0,0 +1,109 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_itrans.h
+*
+* @brief
+*  Functions declarations for inverse transform
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_ITRANS_H_
+#define _IHEVC_ITRANS_H_
+
+typedef void ihevc_itrans_4x4_ttype1_ft(WORD16 *pi2_src,
+                                        WORD16 *pi2_dst,
+                                        WORD32 i4_src_strd,
+                                        WORD32 i4_dst_strd,
+                                        WORD32 i4_shift,
+                                        WORD32 i4_zero_cols);
+typedef void ihevc_itrans_4x4_ft(WORD16 *pi2_src,
+                                 WORD16 *pi2_dst,
+                                 WORD32 i4_src_strd,
+                                 WORD32 i4_dst_strd,
+                                 WORD32 i4_shift,
+                                 WORD32 i4_zero_cols);
+typedef void ihevc_itrans_8x8_ft(WORD16 *pi2_src,
+                                 WORD16 *pi2_dst,
+                                 WORD32 i4_src_strd,
+                                 WORD32 i4_dst_strd,
+                                 WORD32 i4_shift,
+                                 WORD32 i4_zero_cols);
+typedef void ihevc_itrans_16x16_ft(WORD16 *pi2_src,
+                                   WORD16 *pi2_dst,
+                                   WORD32 i4_src_strd,
+                                   WORD32 i4_dst_strd,
+                                   WORD32 i4_shift,
+                                   WORD32 i4_zero_cols);
+typedef void ihevc_itrans_32x32_ft(WORD16 *pi2_src,
+                                   WORD16 *pi2_dst,
+                                   WORD32 i4_src_strd,
+                                   WORD32 i4_dst_strd,
+                                   WORD32 i4_shift,
+                                   WORD32 i4_zero_cols);
+
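+/* Each *_ft typedef above fixes the signature shared by the generic C
+ * implementation and the architecture-specific variants declared below,
+ * so any of them can be held through a single function pointer.
+ * Illustrative sketch (hypothetical caller code, not part of this file):
+ *
+ *     ihevc_itrans_16x16_ft *pf_itrans_16x16 = &ihevc_itrans_16x16;
+ *     pf_itrans_16x16(pi2_src, pi2_dst, src_strd, dst_strd,
+ *                     i4_shift, zero_cols);
+ */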
+/* C function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32;
+
+/* A9 Q function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1_a9q;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4_a9q;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8_a9q;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16_a9q;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32_a9q;
+
+/* NEONINTR function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1_neonintr;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4_neonintr;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8_neonintr;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16_neonintr;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1_ssse3;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4_ssse3;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8_ssse3;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16_ssse3;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32_ssse3;
+
+/* SSE4.2 function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1_sse42;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4_sse42;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8_sse42;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16_sse42;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32_sse42;
+
+/* armv8 function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1_av8;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4_av8;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8_av8;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16_av8;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32_av8;
+#endif /*_IHEVC_ITRANS_H_*/
diff --git a/common/ihevc_itrans_recon.c b/common/ihevc_itrans_recon.c
new file mode 100644
index 0000000..0af96e8
--- /dev/null
+++ b/common/ihevc_itrans_recon.c
@@ -0,0 +1,333 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_itrans_recon.c
+ *
+ * @brief
+ *  Contains function definitions for inverse transform  and reconstruction
+ *
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_itrans_recon_4x4_ttype1()
+ *  - ihevc_itrans_recon_4x4()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions here are replicated from ihevc_itrans.c and modified to */
+/* include reconstruction */
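+/* "Reconstruction" means the 2nd stage output is not written back as
+ * coefficients but is immediately added to the prediction and clipped
+ * to 8 bit: pu1_dst[x] = CLIP_U8(itrans_out + pu1_pred[x]). */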
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs Inverse transform type 1 (DST)  and reconstruction
+ * for 4x4 input block
+ *
+ * @par Description:
+ *  Performs the inverse transform, adds the prediction data and clips
+ * the output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse transform
+ *  1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src (unused in this function)
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_4x4_ttype1(WORD16 *pi2_src,
+                                   WORD16 *pi2_tmp,
+                                   UWORD8 *pu1_pred,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 src_strd,
+                                   WORD32 pred_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 zero_cols,
+                                   WORD32 zero_rows)
+{
+    WORD32 i, c[4];
+    WORD32 add;
+    WORD32 shift;
+    WORD16 *pi2_tmp_orig;
+    WORD32 trans_size;
+    UNUSED(zero_rows);
+    trans_size = TRANS_SIZE_4;
+
+    pi2_tmp_orig = pi2_tmp;
+
+    /* Inverse Transform 1st stage */
+    shift = IT_SHIFT_STAGE_1;
+    add = 1 << (shift - 1);
+
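+    /* After substituting c[0..3], each output is a plain inverse DST
+     * (transform type 1) dot product; e.g.
+     * pi2_tmp[0] = CLIP_S16((29*s0 + 74*s1 + 84*s2 + 55*s3 + add) >> shift),
+     * where sN = pi2_src[N * src_strd]. */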
+    for(i = 0; i < trans_size; i++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+        }
+        else
+        {
+            // Intermediate Variables
+            c[0] = pi2_src[0] + pi2_src[2 * src_strd];
+            c[1] = pi2_src[2 * src_strd] + pi2_src[3 * src_strd];
+            c[2] = pi2_src[0] - pi2_src[3 * src_strd];
+            c[3] = 74 * pi2_src[src_strd];
+
+            pi2_tmp[0] =
+                            CLIP_S16((29 * c[0] + 55 * c[1] + c[3] + add) >> shift);
+            pi2_tmp[1] =
+                            CLIP_S16((55 * c[2] - 29 * c[1] + c[3] + add) >> shift);
+            pi2_tmp[2] =
+                            CLIP_S16((74 * (pi2_src[0] - pi2_src[2 * src_strd] + pi2_src[3 * src_strd]) + add) >> shift);
+            pi2_tmp[3] =
+                            CLIP_S16((55 * c[0] + 29 * c[2] - c[3] + add) >> shift);
+        }
+        pi2_src++;
+        pi2_tmp += trans_size;
+        zero_cols = zero_cols >> 1;
+    }
+
+    pi2_tmp = pi2_tmp_orig;
+
+    /* Inverse Transform 2nd stage */
+    shift = IT_SHIFT_STAGE_2;
+    add = 1 << (shift - 1);
+
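+    /* The 2nd stage transforms the rows of pi2_tmp. Per the HEVC spec
+     * the two stage shifts are 7 and 20 - bit depth (12 for 8-bit),
+     * which IT_SHIFT_STAGE_1/IT_SHIFT_STAGE_2 are assumed to hold. */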
+    for(i = 0; i < trans_size; i++)
+    {
+        WORD32 itrans_out;
+        // Intermediate Variables
+        c[0] = pi2_tmp[0] + pi2_tmp[2 * trans_size];
+        c[1] = pi2_tmp[2 * trans_size] + pi2_tmp[3 * trans_size];
+        c[2] = pi2_tmp[0] - pi2_tmp[3 * trans_size];
+        c[3] = 74 * pi2_tmp[trans_size];
+
+        itrans_out =
+                        CLIP_S16((29 * c[0] + 55 * c[1] + c[3] + add) >> shift);
+        pu1_dst[0] = CLIP_U8((itrans_out + pu1_pred[0]));
+        itrans_out =
+                        CLIP_S16((55 * c[2] - 29 * c[1] + c[3] + add) >> shift);
+        pu1_dst[1] = CLIP_U8((itrans_out + pu1_pred[1]));
+        itrans_out =
+                        CLIP_S16((74 * (pi2_tmp[0] - pi2_tmp[2 * trans_size] + pi2_tmp[3 * trans_size]) + add) >> shift);
+        pu1_dst[2] = CLIP_U8((itrans_out + pu1_pred[2]));
+        itrans_out =
+                        CLIP_S16((55 * c[0] + 29 * c[2] - c[3] + add) >> shift);
+        pu1_dst[3] = CLIP_U8((itrans_out + pu1_pred[3]));
+        pi2_tmp++;
+        pu1_pred += pred_strd;
+        pu1_dst += dst_strd;
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs Inverse transform  and reconstruction for 4x4
+ * input block
+ *
+ * @par Description:
+ *  Performs the inverse transform, adds the prediction data and clips
+ * the output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse transform
+ *  1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src (unused in this function)
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_4x4(WORD16 *pi2_src,
+                            WORD16 *pi2_tmp,
+                            UWORD8 *pu1_pred,
+                            UWORD8 *pu1_dst,
+                            WORD32 src_strd,
+                            WORD32 pred_strd,
+                            WORD32 dst_strd,
+                            WORD32 zero_cols,
+                            WORD32 zero_rows)
+
+{
+    WORD32 j;
+    WORD32 e[2], o[2];
+    WORD32 add;
+    WORD32 shift;
+    WORD16 *pi2_tmp_orig;
+    WORD32 trans_size;
+    UNUSED(zero_rows);
+    trans_size = TRANS_SIZE_4;
+
+    pi2_tmp_orig = pi2_tmp;
+
+    /* Inverse Transform 1st stage */
+    shift = IT_SHIFT_STAGE_1;
+    add = 1 << (shift - 1);
+
+    for(j = 0; j < trans_size; j++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+        }
+        else
+        {
+
+            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+            o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_src[src_strd]
+                            + g_ai2_ihevc_trans_4[3][0] * pi2_src[3 * src_strd];
+            o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_src[src_strd]
+                            + g_ai2_ihevc_trans_4[3][1] * pi2_src[3 * src_strd];
+            e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_src[0]
+                            + g_ai2_ihevc_trans_4[2][0] * pi2_src[2 * src_strd];
+            e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_src[0]
+                            + g_ai2_ihevc_trans_4[2][1] * pi2_src[2 * src_strd];
+
+            pi2_tmp[0] =
+                            CLIP_S16(((e[0] + o[0] + add) >> shift));
+            pi2_tmp[1] =
+                            CLIP_S16(((e[1] + o[1] + add) >> shift));
+            pi2_tmp[2] =
+                            CLIP_S16(((e[1] - o[1] + add) >> shift));
+            pi2_tmp[3] =
+                            CLIP_S16(((e[0] - o[0] + add) >> shift));
+
+        }
+        pi2_src++;
+        pi2_tmp += trans_size;
+        zero_cols = zero_cols >> 1;
+    }
+
+    pi2_tmp = pi2_tmp_orig;
+
+    /* Inverse Transform 2nd stage */
+    shift = IT_SHIFT_STAGE_2;
+    add = 1 << (shift - 1);
+
+    for(j = 0; j < trans_size; j++)
+    {
+        WORD32 itrans_out;
+        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+        o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_tmp[trans_size]
+                        + g_ai2_ihevc_trans_4[3][0] * pi2_tmp[3 * trans_size];
+        o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_tmp[trans_size]
+                        + g_ai2_ihevc_trans_4[3][1] * pi2_tmp[3 * trans_size];
+        e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_tmp[0]
+                        + g_ai2_ihevc_trans_4[2][0] * pi2_tmp[2 * trans_size];
+        e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_tmp[0]
+                        + g_ai2_ihevc_trans_4[2][1] * pi2_tmp[2 * trans_size];
+
+        itrans_out =
+                        CLIP_S16(((e[0] + o[0] + add) >> shift));
+        pu1_dst[0] = CLIP_U8((itrans_out + pu1_pred[0]));
+        itrans_out =
+                        CLIP_S16(((e[1] + o[1] + add) >> shift));
+        pu1_dst[1] = CLIP_U8((itrans_out + pu1_pred[1]));
+        itrans_out =
+                        CLIP_S16(((e[1] - o[1] + add) >> shift));
+        pu1_dst[2] = CLIP_U8((itrans_out + pu1_pred[2]));
+        itrans_out =
+                        CLIP_S16(((e[0] - o[0] + add) >> shift));
+        pu1_dst[3] = CLIP_U8((itrans_out + pu1_pred[3]));
+
+        pi2_tmp++;
+        pu1_pred += pred_strd;
+        pu1_dst += dst_strd;
+
+    }
+}
+
diff --git a/common/ihevc_itrans_recon.h b/common/ihevc_itrans_recon.h
new file mode 100644
index 0000000..56da261
--- /dev/null
+++ b/common/ihevc_itrans_recon.h
@@ -0,0 +1,193 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_itrans_recon.h
+*
+* @brief
+*  Functions declarations for inverse transform and  reconstruction
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_ITRANS_RECON_H_
+#define _IHEVC_ITRANS_RECON_H_
+
+typedef void ihevc_itrans_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+                                              WORD16 *pi2_tmp,
+                                              UWORD8 *pu1_pred,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 src_strd,
+                                              WORD32 pred_strd,
+                                              WORD32 dst_strd,
+                                              WORD32 zero_cols,
+                                              WORD32 zero_rows);
+typedef void ihevc_hbd_itrans_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+                                                  WORD16 *pi2_tmp,
+                                                  UWORD16 *pu2_pred,
+                                                  UWORD16 *pu2_dst,
+                                                  WORD32 src_strd,
+                                                  WORD32 pred_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 zero_cols,
+                                                  WORD32 zero_rows,
+                                                  UWORD8 bit_depth);
+typedef void ihevc_itrans_recon_4x4_ft(WORD16 *pi2_src,
+                                       WORD16 *pi2_tmp,
+                                       UWORD8 *pu1_pred,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 src_strd,
+                                       WORD32 pred_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 zero_cols,
+                                       WORD32 zero_rows);
+typedef void ihevc_hbd_itrans_recon_4x4_ft(WORD16 *pi2_src,
+                                           WORD16 *pi2_tmp,
+                                           UWORD16 *pu2_pred,
+                                           UWORD16 *pu2_dst,
+                                           WORD32 src_strd,
+                                           WORD32 pred_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 zero_cols,
+                                           WORD32 zero_rows,
+                                           UWORD8 bit_depth);
+typedef void ihevc_itrans_recon_8x8_ft(WORD16 *pi2_src,
+                                       WORD16 *pi2_tmp,
+                                       UWORD8 *pu1_pred,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 src_strd,
+                                       WORD32 pred_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 zero_cols,
+                                       WORD32 zero_rows);
+typedef void ihevc_hbd_itrans_recon_8x8_ft(WORD16 *pi2_src,
+                                           WORD16 *pi2_tmp,
+                                           UWORD16 *pu2_pred,
+                                           UWORD16 *pu2_dst,
+                                           WORD32 src_strd,
+                                           WORD32 pred_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 zero_cols,
+                                           WORD32 zero_rows,
+                                           UWORD8 bit_depth);
+typedef void ihevc_itrans_recon_16x16_ft(WORD16 *pi2_src,
+                                         WORD16 *pi2_tmp,
+                                         UWORD8 *pu1_pred,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 pred_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 zero_cols,
+                                         WORD32 zero_rows);
+typedef void ihevc_hbd_itrans_recon_16x16_ft(WORD16 *pi2_src,
+                                             WORD16 *pi2_tmp,
+                                             UWORD16 *pu2_pred,
+                                             UWORD16 *pu2_dst,
+                                             WORD32 src_strd,
+                                             WORD32 pred_strd,
+                                             WORD32 dst_strd,
+                                             WORD32 zero_cols,
+                                             WORD32 zero_rows,
+                                             UWORD8 bit_depth);
+typedef void ihevc_itrans_recon_32x32_ft(WORD16 *pi2_src,
+                                         WORD16 *pi2_tmp,
+                                         UWORD8 *pu1_pred,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 pred_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 zero_cols,
+                                         WORD32 zero_rows);
+typedef void ihevc_hbd_itrans_recon_32x32_ft(WORD16 *pi2_src,
+                                             WORD16 *pi2_tmp,
+                                             UWORD16 *pu2_pred,
+                                             UWORD16 *pu2_dst,
+                                             WORD32 src_strd,
+                                             WORD32 pred_strd,
+                                             WORD32 dst_strd,
+                                             WORD32 zero_cols,
+                                             WORD32 zero_rows,
+                                             UWORD8 bit_depth);
+
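+/* The ihevc_hbd_* typedefs mirror the 8-bit ones, but take UWORD16
+ * prediction/destination buffers and an explicit bit_depth argument for
+ * high-bit-depth (above 8-bit) decoding. */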
+/* C function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32;
+
+ihevc_hbd_itrans_recon_4x4_ttype1_ft ihevc_hbd_itrans_recon_4x4_ttype1;
+ihevc_hbd_itrans_recon_4x4_ft ihevc_hbd_itrans_recon_4x4;
+ihevc_hbd_itrans_recon_8x8_ft ihevc_hbd_itrans_recon_8x8;
+ihevc_hbd_itrans_recon_16x16_ft ihevc_hbd_itrans_recon_16x16;
+ihevc_hbd_itrans_recon_32x32_ft ihevc_hbd_itrans_recon_32x32;
+
+/* A9 Q function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_a9q;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_a9q;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_a9q;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16_a9q;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_a9q;
+
+/* A9 A function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_a9a;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_a9a;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_a9a;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16_a9a;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_a9a;
+
+/* NEONINTR function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_neonintr;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_neonintr;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_neonintr;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16_neonintr;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_ssse3;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_ssse3;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_ssse3;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16_ssse3;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_ssse3;
+
+/* SSE42 function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_sse42;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_sse42;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_sse42;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_sse42;
+
+ihevc_hbd_itrans_recon_4x4_ttype1_ft ihevc_hbd_itrans_recon_4x4_ttype1_sse42;
+ihevc_hbd_itrans_recon_4x4_ft ihevc_hbd_itrans_recon_4x4_sse42;
+ihevc_hbd_itrans_recon_8x8_ft ihevc_hbd_itrans_recon_8x8_sse42;
+ihevc_hbd_itrans_recon_16x16_ft ihevc_hbd_itrans_recon_16x16_sse42;
+ihevc_hbd_itrans_recon_32x32_ft ihevc_hbd_itrans_recon_32x32_sse42;
+
+
+/* armv8 function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_av8;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_av8;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_av8;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16_av8;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_av8;
+#endif /*_IHEVC_ITRANS_RECON_H_*/
diff --git a/common/ihevc_itrans_recon_16x16.c b/common/ihevc_itrans_recon_16x16.c
new file mode 100644
index 0000000..56e28a3
--- /dev/null
+++ b/common/ihevc_itrans_recon_16x16.c
@@ -0,0 +1,889 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_itrans_recon_16x16.c
+ *
+ * @brief
+ *  Contains function definitions for inverse transform  and reconstruction 16x16
+ *
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_itrans_recon_16x16()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs Inverse transform  and reconstruction for 16x16
+ * input block
+ *
+ * @par Description:
+ *  Performs the inverse transform, adds the prediction data and clips
+ * the output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 16x16 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 16x16 buffer for storing inverse transform
+ *  1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 16x16 block
+ *
+ * @param[out] pu1_dst
+ *  Output 16x16 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_16x16(WORD16 *pi2_src,
+                              WORD16 *pi2_tmp,
+                              UWORD8 *pu1_pred,
+                              UWORD8 *pu1_dst,
+                              WORD32 src_strd,
+                              WORD32 pred_strd,
+                              WORD32 dst_strd,
+                              WORD32 zero_cols,
+                              WORD32 zero_rows)
+{
+    WORD32 j, k;
+    WORD32 e[8], o[8];
+    WORD32 ee[4], eo[4];
+    WORD32 eee[2], eeo[2];
+    WORD32 add;
+    WORD32 shift;
+    WORD16 *pi2_tmp_orig;
+    WORD32 trans_size;
+    WORD32 zero_rows_2nd_stage = zero_cols;
+    WORD32 row_limit_2nd_stage;
+
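+    /* zero_cols / zero_rows carry one bit per column / row of the 16x16
+     * input; a set bit marks an all-zero column / row. The masks below
+     * detect the common case where only the first 4 or 8 columns can be
+     * non-zero, so the 1st stage loop can stop early. */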
+    if((zero_cols & 0xFFF0) == 0xFFF0)
+        row_limit_2nd_stage = 4;
+    else if((zero_cols & 0xFF00) == 0xFF00)
+        row_limit_2nd_stage = 8;
+    else
+        row_limit_2nd_stage = TRANS_SIZE_16;
+
+    trans_size = TRANS_SIZE_16;
+    pi2_tmp_orig = pi2_tmp;
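+    /* The 1st stage reads input columns and writes them out as rows of
+     * pi2_tmp (an implicit transpose), so the zero-column mask of the
+     * input is exactly the zero-row mask seen by the 2nd stage; hence
+     * zero_rows_2nd_stage = zero_cols above. */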
+    if((zero_rows & 0xFFF0) == 0xFFF0)  /* Only the first 4 rows of input are non-zero */
+    {
+        /* Inverse Transform 1st stage */
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_src[3 * src_strd];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
+                }
+                eeo[0] = 0;
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
+                eeo[1] = 0;
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
+
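+                /* With input rows 4..15 zero, only matrix rows 1 and 3
+                 * feed o[], row 2 feeds eo[] and row 0 feeds eee[]; the
+                 * eeo[] contributions (rows 4 and 12) vanish. */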
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 8] =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                }
+            }
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+
+        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* Only the first 4 rows of 1st stage output are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+                }
+                eeo[0] = 0;
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = 0;
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* Only the first 8 rows of 1st stage output are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* All rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size]
+                                    + g_ai2_ihevc_trans_16[9][k]
+                                                    * pi2_tmp[9 * trans_size]
+                                    + g_ai2_ihevc_trans_16[11][k]
+                                                    * pi2_tmp[11 * trans_size]
+                                    + g_ai2_ihevc_trans_16[13][k]
+                                                    * pi2_tmp[13 * trans_size]
+                                    + g_ai2_ihevc_trans_16[15][k]
+                                                    * pi2_tmp[15 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size]
+                                    + g_ai2_ihevc_trans_16[10][k]
+                                                    * pi2_tmp[10 * trans_size]
+                                    + g_ai2_ihevc_trans_16[14][k]
+                                                    * pi2_tmp[14 * trans_size];
+                }
+                eeo[0] =
+                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                                                + g_ai2_ihevc_trans_16[12][0]
+                                                                * pi2_tmp[12
+                                                                                * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                eeo[1] =
+                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                                                + g_ai2_ihevc_trans_16[12][1]
+                                                                * pi2_tmp[12
+                                                                                * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+    }
+    else if((zero_rows & 0xFF00) == 0xFF00)  /* Only the first 8 rows of input are non-zero */
+    {
+        /* Inverse Transform 1st stage */
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_src[3 * src_strd]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_src[5 * src_strd]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_src[7 * src_strd];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_src[6 * src_strd];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
+
+                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 8] =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                }
+            }
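+            /* Advance to the next input column; the 1st-stage output is
+               stored transposed, one output row per input column */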
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
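+        /* Rewind to the start of the transposed 1st-stage output; the 2nd
+           stage reads it column-wise, one column per output row */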
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+
+        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+                }
+                eeo[0] = 0;
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = 0;
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* All rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size]
+                                    + g_ai2_ihevc_trans_16[9][k]
+                                                    * pi2_tmp[9 * trans_size]
+                                    + g_ai2_ihevc_trans_16[11][k]
+                                                    * pi2_tmp[11 * trans_size]
+                                    + g_ai2_ihevc_trans_16[13][k]
+                                                    * pi2_tmp[13 * trans_size]
+                                    + g_ai2_ihevc_trans_16[15][k]
+                                                    * pi2_tmp[15 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size]
+                                    + g_ai2_ihevc_trans_16[10][k]
+                                                    * pi2_tmp[10 * trans_size]
+                                    + g_ai2_ihevc_trans_16[14][k]
+                                                    * pi2_tmp[14 * trans_size];
+                }
+                eeo[0] =
+                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                                                + g_ai2_ihevc_trans_16[12][0]
+                                                                * pi2_tmp[12
+                                                                                * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                eeo[1] =
+                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                                                + g_ai2_ihevc_trans_16[12][1]
+                                                                * pi2_tmp[12
+                                                                                * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+    }
+    else  /* All rows of input are non-zero */
+    {
+        /* Inverse Transform 1st stage */
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_src[3 * src_strd]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_src[5 * src_strd]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_src[7 * src_strd]
+                                    + g_ai2_ihevc_trans_16[9][k]
+                                                    * pi2_src[9 * src_strd]
+                                    + g_ai2_ihevc_trans_16[11][k]
+                                                    * pi2_src[11 * src_strd]
+                                    + g_ai2_ihevc_trans_16[13][k]
+                                                    * pi2_src[13 * src_strd]
+                                    + g_ai2_ihevc_trans_16[15][k]
+                                                    * pi2_src[15 * src_strd];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_src[6 * src_strd]
+                                    + g_ai2_ihevc_trans_16[10][k]
+                                                    * pi2_src[10 * src_strd]
+                                    + g_ai2_ihevc_trans_16[14][k]
+                                                    * pi2_src[14 * src_strd];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
+                                + g_ai2_ihevc_trans_16[12][0]
+                                                * pi2_src[12 * src_strd];
+                eee[0] =
+                                g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
+                                                + g_ai2_ihevc_trans_16[8][0]
+                                                                * pi2_src[8
+                                                                                * src_strd];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
+                                + g_ai2_ihevc_trans_16[12][1]
+                                                * pi2_src[12 * src_strd];
+                eee[1] =
+                                g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
+                                                + g_ai2_ihevc_trans_16[8][1]
+                                                                * pi2_src[8
+                                                                                * src_strd];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 8] =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                }
+            }
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+
+        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+                }
+                eeo[0] = 0;
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = 0;
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size];
+                }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* All rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 8; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_16[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_16[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_16[7][k]
+                                                    * pi2_tmp[7 * trans_size]
+                                    + g_ai2_ihevc_trans_16[9][k]
+                                                    * pi2_tmp[9 * trans_size]
+                                    + g_ai2_ihevc_trans_16[11][k]
+                                                    * pi2_tmp[11 * trans_size]
+                                    + g_ai2_ihevc_trans_16[13][k]
+                                                    * pi2_tmp[13 * trans_size]
+                                    + g_ai2_ihevc_trans_16[15][k]
+                                                    * pi2_tmp[15 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_16[6][k]
+                                                    * pi2_tmp[6 * trans_size]
+                                    + g_ai2_ihevc_trans_16[10][k]
+                                                    * pi2_tmp[10 * trans_size]
+                                    + g_ai2_ihevc_trans_16[14][k]
+                                                    * pi2_tmp[14 * trans_size];
+                }
+                eeo[0] =
+                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                                                + g_ai2_ihevc_trans_16[12][0]
+                                                                * pi2_tmp[12
+                                                                                * trans_size];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                eeo[1] =
+                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                                                + g_ai2_ihevc_trans_16[12][1]
+                                                                * pi2_tmp[12
+                                                                                * trans_size];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                for(k = 0; k < 2; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 4] = ee[3 - k] - eo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_16x16****************************************/
+        /************************************************************************************************/
+    }
+
+}
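+
+/* Editorial note (illustrative, not part of the original source; variable
+ * names are hypothetical): a call to this function follows the pattern
+ *
+ *     WORD16 tmp[16 * 16];
+ *     ihevc_itrans_recon_16x16(coeffs, tmp, pred, dst,
+ *                              src_strd, pred_strd, dst_strd,
+ *                              zero_cols, zero_rows);
+ *
+ * mirroring the parameter order of ihevc_itrans_recon_32x32 below.
+ */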
+
diff --git a/common/ihevc_itrans_recon_32x32.c b/common/ihevc_itrans_recon_32x32.c
new file mode 100644
index 0000000..b8a71ab
--- /dev/null
+++ b/common/ihevc_itrans_recon_32x32.c
@@ -0,0 +1,1127 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_itrans_recon_32x32.c
+ *
+ * @brief
+ *  Contains function definitions for 32x32 inverse transform and reconstruction
+ *
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_itrans_recon_32x32()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform and reconstruction for a 32x32
+ *  input block
+ *
+ * @par Description:
+ *  Performs the inverse transform, adds the prediction data and clips the
+ *  output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 32x32 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 32x32 buffer for storing the inverse transform 1st stage
+ *  output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 32x32 block
+ *
+ * @param[out] pu1_dst
+ *  Output 32x32 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
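+
+/* Editorial note (illustrative): both stages below use the standard
+ * partial-butterfly decomposition of the inverse DCT. For a 1-D inverse
+ * transform of size N, each output pair is reassembled as
+ *
+ *     y[k]         = CLIP_S16((e[k] + o[k] + add) >> shift)
+ *     y[N - 1 - k] = CLIP_S16((e[k] - o[k] + add) >> shift),  0 <= k < N/2
+ *
+ * where o[] collects the contributions of the odd-index coefficient rows
+ * and e[] is split recursively the same way (e -> ee -> eee -> eeee),
+ * halving the number of distinct multiplications at each level.
+ */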
+
+void ihevc_itrans_recon_32x32(WORD16 *pi2_src,
+                              WORD16 *pi2_tmp,
+                              UWORD8 *pu1_pred,
+                              UWORD8 *pu1_dst,
+                              WORD32 src_strd,
+                              WORD32 pred_strd,
+                              WORD32 dst_strd,
+                              WORD32 zero_cols,
+                              WORD32 zero_rows)
+{
+    WORD32 j, k;
+    WORD32 e[16], o[16];
+    WORD32 ee[8], eo[8];
+    WORD32 eee[4], eeo[4];
+    WORD32 eeee[2], eeeo[2];
+    WORD32 add;
+    WORD32 shift;
+    WORD16 *pi2_tmp_orig;
+    WORD32 trans_size;
+    WORD32 zero_rows_2nd_stage = zero_cols;
+    WORD32 row_limit_2nd_stage;
+
+    trans_size = TRANS_SIZE_32;
+    pi2_tmp_orig = pi2_tmp;
+
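+    /* Worked example (editorial): zero_cols = 0xFFFFFFF0 has bits 4..31 set,
+     * i.e. only the first 4 input columns are non-zero, so only the first 4
+     * rows of the transposed 1st-stage output need to be computed; the 2nd
+     * stage skips the remaining rows via zero_rows_2nd_stage. */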
+    if((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0)
+        row_limit_2nd_stage = 4;
+    else if((zero_cols & 0xFFFFFF00) == 0xFFFFFF00)
+        row_limit_2nd_stage = 8;
+    else
+        row_limit_2nd_stage = TRANS_SIZE_32;
+
+    if((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0)  /* First 4 rows of input are non-zero */
+    {
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_32x32****************************************/
+        /************************************************************************************************/
+        /* Inverse Transform 1st stage */
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
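+                /* This input column is entirely zero, so the corresponding 1st-stage output row is zero */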
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_src[3 * src_strd];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd];
+                }
+                /* Coefficient rows 4, 12, 20 and 28 are zero in this branch,
+                   so the eeo terms vanish */
+                eeo[0] = 0;
+                eeo[1] = 0;
+                eeo[2] = 0;
+                eeo[3] = 0;
+                eeeo[0] = 0;
+                eeeo[1] = 0;
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 16] =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                }
+            }
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+        if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_tmp[3 * trans_size];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
+                }
+                /* 1st-stage output rows 4, 12, 20 and 28 are zero in this
+                   branch, so the eeo terms vanish */
+                eeo[0] = 0;
+                eeo[1] = 0;
+                eeo[2] = 0;
+                eeo[3] = 0;
+                eeeo[0] = 0;
+                eeeo[1] = 0;
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_32[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_32[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_32[6][k]
+                                                    * pi2_tmp[6 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
+                }
+                eeeo[0] = 0;
+                eeeo[1] = 0;
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* All rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_32[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_32[7][k]
+                                                    * pi2_tmp[7 * trans_size]
+                                    + g_ai2_ihevc_trans_32[9][k]
+                                                    * pi2_tmp[9 * trans_size]
+                                    + g_ai2_ihevc_trans_32[11][k]
+                                                    * pi2_tmp[11 * trans_size]
+                                    + g_ai2_ihevc_trans_32[13][k]
+                                                    * pi2_tmp[13 * trans_size]
+                                    + g_ai2_ihevc_trans_32[15][k]
+                                                    * pi2_tmp[15 * trans_size]
+                                    + g_ai2_ihevc_trans_32[17][k]
+                                                    * pi2_tmp[17 * trans_size]
+                                    + g_ai2_ihevc_trans_32[19][k]
+                                                    * pi2_tmp[19 * trans_size]
+                                    + g_ai2_ihevc_trans_32[21][k]
+                                                    * pi2_tmp[21 * trans_size]
+                                    + g_ai2_ihevc_trans_32[23][k]
+                                                    * pi2_tmp[23 * trans_size]
+                                    + g_ai2_ihevc_trans_32[25][k]
+                                                    * pi2_tmp[25 * trans_size]
+                                    + g_ai2_ihevc_trans_32[27][k]
+                                                    * pi2_tmp[27 * trans_size]
+                                    + g_ai2_ihevc_trans_32[29][k]
+                                                    * pi2_tmp[29 * trans_size]
+                                    + g_ai2_ihevc_trans_32[31][k]
+                                                    * pi2_tmp[31 * trans_size];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_32[6][k]
+                                                    * pi2_tmp[6 * trans_size]
+                                    + g_ai2_ihevc_trans_32[10][k]
+                                                    * pi2_tmp[10 * trans_size]
+                                    + g_ai2_ihevc_trans_32[14][k]
+                                                    * pi2_tmp[14 * trans_size]
+                                    + g_ai2_ihevc_trans_32[18][k]
+                                                    * pi2_tmp[18 * trans_size]
+                                    + g_ai2_ihevc_trans_32[22][k]
+                                                    * pi2_tmp[22 * trans_size]
+                                    + g_ai2_ihevc_trans_32[26][k]
+                                                    * pi2_tmp[26 * trans_size]
+                                    + g_ai2_ihevc_trans_32[30][k]
+                                                    * pi2_tmp[30 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
+                                    + g_ai2_ihevc_trans_32[12][k]
+                                                    * pi2_tmp[12 * trans_size]
+                                    + g_ai2_ihevc_trans_32[20][k]
+                                                    * pi2_tmp[20 * trans_size]
+                                    + g_ai2_ihevc_trans_32[28][k]
+                                                    * pi2_tmp[28 * trans_size];
+                }
+                eeeo[0] =
+                                g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
+                                                + g_ai2_ihevc_trans_32[24][0]
+                                                                * pi2_tmp[24
+                                                                                * trans_size];
+                eeeo[1] =
+                                g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
+                                                + g_ai2_ihevc_trans_32[24][1]
+                                                                * pi2_tmp[24
+                                                                                * trans_size];
+                eeee[0] =
+                                g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
+                                                + g_ai2_ihevc_trans_32[16][0]
+                                                                * pi2_tmp[16
+                                                                                * trans_size];
+                eeee[1] =
+                                g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
+                                                + g_ai2_ihevc_trans_32[16][1]
+                                                                * pi2_tmp[16
+                                                                                * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_32x32****************************************/
+        /************************************************************************************************/
+    }
+    else if((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of input are non-zero */
+    {
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_32x32****************************************/
+        /************************************************************************************************/
+        /* Inverse Transform 1st stage */
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_src[3 * src_strd]
+                                    + g_ai2_ihevc_trans_32[5][k]
+                                                    * pi2_src[5 * src_strd]
+                                    + g_ai2_ihevc_trans_32[7][k]
+                                                    * pi2_src[7 * src_strd];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
+                                    + g_ai2_ihevc_trans_32[6][k]
+                                                    * pi2_src[6 * src_strd];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd];
+                }
+                eeeo[0] = 0;
+                eeeo[1] = 0;
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 16] =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                }
+            }
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+        if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_tmp[3 * trans_size];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
+                }
+                /* 1st-stage output rows 4, 12, 20 and 28 are zero in this
+                   branch, so the eeo terms vanish */
+                eeo[0] = 0;
+                eeo[1] = 0;
+                eeo[2] = 0;
+                eeo[3] = 0;
+                eeeo[0] = 0;
+                eeeo[1] = 0;
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_32[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_32[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_32[6][k]
+                                                    * pi2_tmp[6 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
+                }
+                eeeo[0] = 0;
+                eeeo[1] = 0;
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* All rows of output of 1st stage are non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_32[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_32[7][k]
+                                                    * pi2_tmp[7 * trans_size]
+                                    + g_ai2_ihevc_trans_32[9][k]
+                                                    * pi2_tmp[9 * trans_size]
+                                    + g_ai2_ihevc_trans_32[11][k]
+                                                    * pi2_tmp[11 * trans_size]
+                                    + g_ai2_ihevc_trans_32[13][k]
+                                                    * pi2_tmp[13 * trans_size]
+                                    + g_ai2_ihevc_trans_32[15][k]
+                                                    * pi2_tmp[15 * trans_size]
+                                    + g_ai2_ihevc_trans_32[17][k]
+                                                    * pi2_tmp[17 * trans_size]
+                                    + g_ai2_ihevc_trans_32[19][k]
+                                                    * pi2_tmp[19 * trans_size]
+                                    + g_ai2_ihevc_trans_32[21][k]
+                                                    * pi2_tmp[21 * trans_size]
+                                    + g_ai2_ihevc_trans_32[23][k]
+                                                    * pi2_tmp[23 * trans_size]
+                                    + g_ai2_ihevc_trans_32[25][k]
+                                                    * pi2_tmp[25 * trans_size]
+                                    + g_ai2_ihevc_trans_32[27][k]
+                                                    * pi2_tmp[27 * trans_size]
+                                    + g_ai2_ihevc_trans_32[29][k]
+                                                    * pi2_tmp[29 * trans_size]
+                                    + g_ai2_ihevc_trans_32[31][k]
+                                                    * pi2_tmp[31 * trans_size];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_32[6][k]
+                                                    * pi2_tmp[6 * trans_size]
+                                    + g_ai2_ihevc_trans_32[10][k]
+                                                    * pi2_tmp[10 * trans_size]
+                                    + g_ai2_ihevc_trans_32[14][k]
+                                                    * pi2_tmp[14 * trans_size]
+                                    + g_ai2_ihevc_trans_32[18][k]
+                                                    * pi2_tmp[18 * trans_size]
+                                    + g_ai2_ihevc_trans_32[22][k]
+                                                    * pi2_tmp[22 * trans_size]
+                                    + g_ai2_ihevc_trans_32[26][k]
+                                                    * pi2_tmp[26 * trans_size]
+                                    + g_ai2_ihevc_trans_32[30][k]
+                                                    * pi2_tmp[30 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
+                                    + g_ai2_ihevc_trans_32[12][k]
+                                                    * pi2_tmp[12 * trans_size]
+                                    + g_ai2_ihevc_trans_32[20][k]
+                                                    * pi2_tmp[20 * trans_size]
+                                    + g_ai2_ihevc_trans_32[28][k]
+                                                    * pi2_tmp[28 * trans_size];
+                }
+                eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
+                                + g_ai2_ihevc_trans_32[24][0] * pi2_tmp[24 * trans_size];
+                eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
+                                + g_ai2_ihevc_trans_32[24][1] * pi2_tmp[24 * trans_size];
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_32[16][0] * pi2_tmp[16 * trans_size];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_32[16][1] * pi2_tmp[16 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_32x32****************************************/
+        /************************************************************************************************/
+    }
+    else /* Any row of input can be non-zero */
+    {
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_32x32****************************************/
+        /************************************************************************************************/
+        /* Inverse Transform 1st stage */
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_src[3 * src_strd]
+                                    + g_ai2_ihevc_trans_32[5][k]
+                                                    * pi2_src[5 * src_strd]
+                                    + g_ai2_ihevc_trans_32[7][k]
+                                                    * pi2_src[7 * src_strd]
+                                    + g_ai2_ihevc_trans_32[9][k]
+                                                    * pi2_src[9 * src_strd]
+                                    + g_ai2_ihevc_trans_32[11][k]
+                                                    * pi2_src[11 * src_strd]
+                                    + g_ai2_ihevc_trans_32[13][k]
+                                                    * pi2_src[13 * src_strd]
+                                    + g_ai2_ihevc_trans_32[15][k]
+                                                    * pi2_src[15 * src_strd]
+                                    + g_ai2_ihevc_trans_32[17][k]
+                                                    * pi2_src[17 * src_strd]
+                                    + g_ai2_ihevc_trans_32[19][k]
+                                                    * pi2_src[19 * src_strd]
+                                    + g_ai2_ihevc_trans_32[21][k]
+                                                    * pi2_src[21 * src_strd]
+                                    + g_ai2_ihevc_trans_32[23][k]
+                                                    * pi2_src[23 * src_strd]
+                                    + g_ai2_ihevc_trans_32[25][k]
+                                                    * pi2_src[25 * src_strd]
+                                    + g_ai2_ihevc_trans_32[27][k]
+                                                    * pi2_src[27 * src_strd]
+                                    + g_ai2_ihevc_trans_32[29][k]
+                                                    * pi2_src[29 * src_strd]
+                                    + g_ai2_ihevc_trans_32[31][k]
+                                                    * pi2_src[31 * src_strd];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
+                                    + g_ai2_ihevc_trans_32[6][k]
+                                                    * pi2_src[6 * src_strd]
+                                    + g_ai2_ihevc_trans_32[10][k]
+                                                    * pi2_src[10 * src_strd]
+                                    + g_ai2_ihevc_trans_32[14][k]
+                                                    * pi2_src[14 * src_strd]
+                                    + g_ai2_ihevc_trans_32[18][k]
+                                                    * pi2_src[18 * src_strd]
+                                    + g_ai2_ihevc_trans_32[22][k]
+                                                    * pi2_src[22 * src_strd]
+                                    + g_ai2_ihevc_trans_32[26][k]
+                                                    * pi2_src[26 * src_strd]
+                                    + g_ai2_ihevc_trans_32[30][k]
+                                                    * pi2_src[30 * src_strd];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]
+                                    + g_ai2_ihevc_trans_32[12][k]
+                                                    * pi2_src[12 * src_strd]
+                                    + g_ai2_ihevc_trans_32[20][k]
+                                                    * pi2_src[20 * src_strd]
+                                    + g_ai2_ihevc_trans_32[28][k]
+                                                    * pi2_src[28 * src_strd];
+                }
+                eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd]
+                                + g_ai2_ihevc_trans_32[24][0]
+                                                * pi2_src[24 * src_strd];
+                eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd]
+                                + g_ai2_ihevc_trans_32[24][1]
+                                                * pi2_src[24 * src_strd];
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]
+                                + g_ai2_ihevc_trans_32[16][0]
+                                                * pi2_src[16 * src_strd];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]
+                                + g_ai2_ihevc_trans_32[16][1]
+                                                * pi2_src[16 * src_strd];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 16] =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                }
+            }
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+        if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* Only the first 4 rows of the 1st stage output can be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_tmp[3 * trans_size];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
+                }
+                eeo[0] = 0;
+                eeo[1] = 0;
+                eeo[2] = 0;
+                eeo[3] = 0;
+                eeeo[0] = 0;
+                eeeo[1] = 0;
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* Only the first 8 rows of the 1st stage output can be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_32[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_32[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_32[6][k]
+                                                    * pi2_tmp[6 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
+                }
+                eeeo[0] = 0;
+                eeeo[1] = 0;
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* Any row of the 1st stage output can be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 16; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_32[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_32[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_32[7][k]
+                                                    * pi2_tmp[7 * trans_size]
+                                    + g_ai2_ihevc_trans_32[9][k]
+                                                    * pi2_tmp[9 * trans_size]
+                                    + g_ai2_ihevc_trans_32[11][k]
+                                                    * pi2_tmp[11 * trans_size]
+                                    + g_ai2_ihevc_trans_32[13][k]
+                                                    * pi2_tmp[13 * trans_size]
+                                    + g_ai2_ihevc_trans_32[15][k]
+                                                    * pi2_tmp[15 * trans_size]
+                                    + g_ai2_ihevc_trans_32[17][k]
+                                                    * pi2_tmp[17 * trans_size]
+                                    + g_ai2_ihevc_trans_32[19][k]
+                                                    * pi2_tmp[19 * trans_size]
+                                    + g_ai2_ihevc_trans_32[21][k]
+                                                    * pi2_tmp[21 * trans_size]
+                                    + g_ai2_ihevc_trans_32[23][k]
+                                                    * pi2_tmp[23 * trans_size]
+                                    + g_ai2_ihevc_trans_32[25][k]
+                                                    * pi2_tmp[25 * trans_size]
+                                    + g_ai2_ihevc_trans_32[27][k]
+                                                    * pi2_tmp[27 * trans_size]
+                                    + g_ai2_ihevc_trans_32[29][k]
+                                                    * pi2_tmp[29 * trans_size]
+                                    + g_ai2_ihevc_trans_32[31][k]
+                                                    * pi2_tmp[31 * trans_size];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+                                    + g_ai2_ihevc_trans_32[6][k]
+                                                    * pi2_tmp[6 * trans_size]
+                                    + g_ai2_ihevc_trans_32[10][k]
+                                                    * pi2_tmp[10 * trans_size]
+                                    + g_ai2_ihevc_trans_32[14][k]
+                                                    * pi2_tmp[14 * trans_size]
+                                    + g_ai2_ihevc_trans_32[18][k]
+                                                    * pi2_tmp[18 * trans_size]
+                                    + g_ai2_ihevc_trans_32[22][k]
+                                                    * pi2_tmp[22 * trans_size]
+                                    + g_ai2_ihevc_trans_32[26][k]
+                                                    * pi2_tmp[26 * trans_size]
+                                    + g_ai2_ihevc_trans_32[30][k]
+                                                    * pi2_tmp[30 * trans_size];
+                }
+                for(k = 0; k < 4; k++)
+                {
+                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
+                                    + g_ai2_ihevc_trans_32[12][k]
+                                                    * pi2_tmp[12 * trans_size]
+                                    + g_ai2_ihevc_trans_32[20][k]
+                                                    * pi2_tmp[20 * trans_size]
+                                    + g_ai2_ihevc_trans_32[28][k]
+                                                    * pi2_tmp[28 * trans_size];
+                }
+                eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
+                                + g_ai2_ihevc_trans_32[24][0] * pi2_tmp[24 * trans_size];
+                eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
+                                + g_ai2_ihevc_trans_32[24][1] * pi2_tmp[24 * trans_size];
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_32[16][0] * pi2_tmp[16 * trans_size];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_32[16][1] * pi2_tmp[16 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                eee[0] = eeee[0] + eeeo[0];
+                eee[3] = eeee[0] - eeeo[0];
+                eee[1] = eeee[1] + eeeo[1];
+                eee[2] = eeee[1] - eeeo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    ee[k] = eee[k] + eeo[k];
+                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
+                }
+                for(k = 0; k < 8; k++)
+                {
+                    e[k] = ee[k] + eo[k];
+                    e[k + 8] = ee[7 - k] - eo[7 - k];
+                }
+                for(k = 0; k < 16; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_32x32****************************************/
+        /************************************************************************************************/
+    }
+}
+
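+/* A note on the even/odd decomposition used above (illustrative arithmetic,
+ * not additional API): a direct 32-point 1-D inverse transform costs
+ * 32 * 32 = 1024 multiplications per column. Splitting into o (16 outputs x
+ * 16 taps), eo (8 x 8), eeo (4 x 4), eeeo (2 x 2) and eeee (2 x 2) needs only
+ * 256 + 64 + 16 + 4 + 4 = 344 multiplications, with the remaining outputs
+ * recovered by the add/subtract butterflies on e[] and o[]. */
+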
diff --git a/common/ihevc_itrans_recon_8x8.c b/common/ihevc_itrans_recon_8x8.c
new file mode 100644
index 0000000..5e2de86
--- /dev/null
+++ b/common/ihevc_itrans_recon_8x8.c
@@ -0,0 +1,414 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_itrans_recon_8x8.c
+ *
+ * @brief
+ *  Contains function definitions for 8x8 inverse transform and reconstruction
+ *
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_itrans_recon_8x8()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform and reconstruction for an 8x8
+ * input block
+ *
+ * @par Description:
+ *  Performs inverse transform and adds the prediction  data and clips output
+ * to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 8x8 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 8x8 buffer for storing inverse transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ *  Output 8x8 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] shift
+ *  Output shift
+ *
+ * @param[in] zero_cols
+ *  Bitmask of zero columns in pi2_src (a set bit k means column k is all zero)
+ *
+ * @param[in] zero_rows
+ *  Bitmask of zero rows in pi2_src (a set bit k means row k is all zero)
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_8x8(WORD16 *pi2_src,
+                            WORD16 *pi2_tmp,
+                            UWORD8 *pu1_pred,
+                            UWORD8 *pu1_dst,
+                            WORD32 src_strd,
+                            WORD32 pred_strd,
+                            WORD32 dst_strd,
+                            WORD32 zero_cols,
+                            WORD32 zero_rows)
+{
+    WORD32 j, k;
+    WORD32 e[4], o[4];
+    WORD32 ee[2], eo[2];
+    WORD32 add;
+    WORD32 shift;
+    WORD16 *pi2_tmp_orig;
+    WORD32 trans_size;
+    WORD32 zero_rows_2nd_stage = zero_cols;
+    WORD32 row_limit_2nd_stage;
+
+    trans_size = TRANS_SIZE_8;
+
+    pi2_tmp_orig = pi2_tmp;
+
+    if((zero_cols & 0xF0) == 0xF0)
+        row_limit_2nd_stage = 4;
+    else
+        row_limit_2nd_stage = TRANS_SIZE_8;
+
+
+    if((zero_rows & 0xF0) == 0xF0) /* Only the first 4 rows of input can be non-zero */
+    {
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_8x8******************************************/
+        /************************************************************************************************/
+
+        /* Inverse Transform 1st stage */
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 4; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_8[3][k]
+                                                    * pi2_src[3 * src_strd];
+                }
+                eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd];
+                eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd];
+                ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_src[0];
+                ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_src[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                e[0] = ee[0] + eo[0];
+                e[3] = ee[0] - eo[0];
+                e[1] = ee[1] + eo[1];
+                e[2] = ee[1] - eo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 4] =
+                                    CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+                }
+            }
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+        if((zero_rows_2nd_stage & 0xF0) == 0xF0) /* Only the first 4 rows of the 1st stage output can be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 4; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_8[3][k] * pi2_tmp[3 * trans_size];
+                }
+                eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size];
+                eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size];
+                ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0];
+                ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                e[0] = ee[0] + eo[0];
+                e[3] = ee[0] - eo[0];
+                e[1] = ee[1] + eo[1];
+                e[2] = ee[1] - eo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* Any row of the 1st stage output can be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 4; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_8[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_8[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_8[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+
+                eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size]
+                                + g_ai2_ihevc_trans_8[6][0] * pi2_tmp[6 * trans_size];
+                eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size]
+                                + g_ai2_ihevc_trans_8[6][1] * pi2_tmp[6 * trans_size];
+                ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_8[4][0] * pi2_tmp[4 * trans_size];
+                ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_8[4][1] * pi2_tmp[4 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                e[0] = ee[0] + eo[0];
+                e[3] = ee[0] - eo[0];
+                e[1] = ee[1] + eo[1];
+                e[2] = ee[1] - eo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_8x8******************************************/
+        /************************************************************************************************/
+    }
+    else /* Any row of input can be non-zero */
+    {
+        /************************************************************************************************/
+        /**********************************START - IT_RECON_8x8******************************************/
+        /************************************************************************************************/
+
+        /* Inverse Transform 1st stage */
+        shift = IT_SHIFT_STAGE_1;
+        add = 1 << (shift - 1);
+
+        for(j = 0; j < row_limit_2nd_stage; j++)
+        {
+            /* Checking for Zero Cols */
+            if((zero_cols & 1) == 1)
+            {
+                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+            }
+            else
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 4; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_src[src_strd]
+                                    + g_ai2_ihevc_trans_8[3][k]
+                                                    * pi2_src[3 * src_strd]
+                                    + g_ai2_ihevc_trans_8[5][k]
+                                                    * pi2_src[5 * src_strd]
+                                    + g_ai2_ihevc_trans_8[7][k]
+                                                    * pi2_src[7 * src_strd];
+                }
+
+                eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd]
+                                + g_ai2_ihevc_trans_8[6][0] * pi2_src[6 * src_strd];
+                eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd]
+                                + g_ai2_ihevc_trans_8[6][1] * pi2_src[6 * src_strd];
+                ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_src[0]
+                                + g_ai2_ihevc_trans_8[4][0] * pi2_src[4 * src_strd];
+                ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_src[0]
+                                + g_ai2_ihevc_trans_8[4][1] * pi2_src[4 * src_strd];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                e[0] = ee[0] + eo[0];
+                e[3] = ee[0] - eo[0];
+                e[1] = ee[1] + eo[1];
+                e[2] = ee[1] - eo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    pi2_tmp[k] =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pi2_tmp[k + 4] =
+                                    CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+                }
+            }
+            pi2_src++;
+            pi2_tmp += trans_size;
+            zero_cols = zero_cols >> 1;
+        }
+
+        pi2_tmp = pi2_tmp_orig;
+
+        /* Inverse Transform 2nd stage */
+        shift = IT_SHIFT_STAGE_2;
+        add = 1 << (shift - 1);
+        if((zero_rows_2nd_stage & 0xF0) == 0xF0) /* Only the first 4 rows of the 1st stage output can be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 4; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_8[3][k] * pi2_tmp[3 * trans_size];
+                }
+                eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size];
+                eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size];
+                ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0];
+                ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                e[0] = ee[0] + eo[0];
+                e[3] = ee[0] - eo[0];
+                e[1] = ee[1] + eo[1];
+                e[2] = ee[1] - eo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        else /* Any row of the 1st stage output can be non-zero */
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+                for(k = 0; k < 4; k++)
+                {
+                    o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+                                    + g_ai2_ihevc_trans_8[3][k]
+                                                    * pi2_tmp[3 * trans_size]
+                                    + g_ai2_ihevc_trans_8[5][k]
+                                                    * pi2_tmp[5 * trans_size]
+                                    + g_ai2_ihevc_trans_8[7][k]
+                                                    * pi2_tmp[7 * trans_size];
+                }
+
+                eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size]
+                                + g_ai2_ihevc_trans_8[6][0] * pi2_tmp[6 * trans_size];
+                eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size]
+                                + g_ai2_ihevc_trans_8[6][1] * pi2_tmp[6 * trans_size];
+                ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_8[4][0] * pi2_tmp[4 * trans_size];
+                ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0]
+                                + g_ai2_ihevc_trans_8[4][1] * pi2_tmp[4 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+                e[0] = ee[0] + eo[0];
+                e[3] = ee[0] - eo[0];
+                e[1] = ee[1] + eo[1];
+                e[2] = ee[1] - eo[1];
+                for(k = 0; k < 4; k++)
+                {
+                    WORD32 itrans_out;
+                    itrans_out =
+                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
+                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+                    itrans_out =
+                                    CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
+                }
+                pi2_tmp++;
+                pu1_pred += pred_strd;
+                pu1_dst += dst_strd;
+            }
+        }
+        /************************************************************************************************/
+        /************************************END - IT_RECON_8x8******************************************/
+        /************************************************************************************************/
+    }
+}
+
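+/* Illustrative call (a minimal sketch; buffer shapes and strides are
+ * assumptions for the example, not a prescribed usage). With zero_cols =
+ * zero_rows = 0xF0, columns and rows 4..7 of the coefficient block are
+ * flagged all-zero, so both stages compute only four 1-D transforms:
+ *
+ *     WORD16 coeffs[8 * 8] = { 0 }, tmp[8 * 8];
+ *     UWORD8 pred[8 * 8] = { 0 }, dst[8 * 8];
+ *     ihevc_itrans_recon_8x8(coeffs, tmp, pred, dst, 8, 8, 8, 0xF0, 0xF0);
+ */
+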
diff --git a/common/ihevc_macros.h b/common/ihevc_macros.h
new file mode 100644
index 0000000..3852c85
--- /dev/null
+++ b/common/ihevc_macros.h
@@ -0,0 +1,89 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_macros.h
+*
+* @brief
+*  Macro definitions used in the codec
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_MACROS_H_
+#define _IHEVC_MACROS_H_
+
+#define RETURN_IF(cond, retval) if(cond) {return (retval);}
+#define UNUSED(x) ((void)(x))
+
+#define CLIP3(x, min, max) (((x) > (max)) ? (max) : (((x) < (min)) ? (min) : (x)))
+
+#define MAX(x,y)    ((((WORD32)(x)) > ((WORD32)(y))) ? ((WORD32)(x)) : ((WORD32)(y)))
+#define MIN(x,y)    ((((WORD32)(x)) < ((WORD32)(y))) ? ((WORD32)(x)) : ((WORD32)(y)))
+#define SIGN(x)     ((x) >= 0 ? ((x) > 0 ? 1 : 0) : -1)
+#define ABS(x)      ((((WORD32)(x)) > 0) ? (x) : -(x))
+
+#define ALIGN128(x) ((((x) + 127) >> 7) << 7)
+#define ALIGN64(x)  ((((x) + 63) >> 6) << 6)
+#define ALIGN32(x)  ((((x) + 31) >> 5) << 5)
+#define ALIGN16(x)  ((((x) + 15) >> 4) << 4)
+#define ALIGN8(x)   ((((x) + 7) >> 3) << 3)
+
+#define ALIGN_POW2(ptr,align) ((((WORD32)(ptr)) + ((align) - 1)) & (~((align) - 1)))
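+
+/* e.g. ALIGN8(13) = ((13 + 7) >> 3) << 3 = 16; already-aligned values pass
+ * through unchanged (ALIGN8(16) = 16). ALIGN_POW2 generalizes this to any
+ * power-of-two alignment. */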
+
+/** Sets x bits to '1' starting from MSB */
+#define MSB_ONES(x) ((UWORD32)0xFFFFFFFF << (32 - (x)))
+
+/** Generates a pattern of x number of '01' in binary starting from MSB */
+#define DUP_MSB_01(x) ((UWORD32)0x55555555 << (32 - ((x) * 2)))
+
+/** Generates a pattern of x number of '10' in binary starting from MSB */
+#define DUP_MSB_10(x) ((UWORD32)0xAAAAAAAA << (32 - ((x) * 2)))
+
+/** Generates a pattern of x number of '11' in binary starting from MSB */
+#define DUP_MSB_11(x) ((UWORD32)0xFFFFFFFF << (32 - ((x) * 2)))
+
+/** Sets x bits to '1' starting from LSB */
+#define LSB_ONES(x) ((UWORD32)0xFFFFFFFF >> (32 - (x)))
+
+/** Generates a pattern of x number of '01' in binary starting from LSB */
+#define DUP_LSB_01(x) ((UWORD32)0x55555555 >> (32 - ((x) * 2)))
+
+/** Generates a pattern of x number of '10' in binary starting from LSB */
+#define DUP_LSB_10(x) ((UWORD32)0xAAAAAAAA >> (32 - ((x) * 2)))
+
+/** Generates a pattern of x number of '11' in binary starting from LSB */
+#define DUP_LSB_11(x) ((UWORD32)0xFFFFFFFF >> (32 - ((x) * 2)))
+
+/** Sets the bit in given position to 1 */
+#define BITSET(x, pos) ((x) | (1 << (pos)))
+
+/** Swap two variables */
+#define SWAP(X,Y)                   \
+{                                   \
+    (X) = (X) ^ (Y);                \
+    (Y) = (X) ^ (Y);                \
+    (X) = (X) ^ (Y);                \
+}
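+
+/* Note: the XOR swap above assumes X and Y are distinct objects; if both
+ * arguments alias the same lvalue, the first XOR zeroes it and the value is
+ * lost. A worked trace for X = 5 (101b), Y = 3 (011b):
+ *   X ^= Y -> 110b; Y ^= X -> 101b (old X); X ^= Y -> 011b (old Y).
+ */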
+#endif /* _IHEVC_MACROS_H_ */
diff --git a/common/ihevc_mem_fns.c b/common/ihevc_mem_fns.c
new file mode 100644
index 0000000..4a2227d
--- /dev/null
+++ b/common/ihevc_mem_fns.c
@@ -0,0 +1,166 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_mem_fns.c
+ *
+ * @brief
+ *  Functions used for memory operations
+ *
+ * @author
+ *  Ittiam
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_mem_fns.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *   memcpy of 8, 16 or 32 bytes
+ *
+ * @par Description:
+ *   Copies 8-bit data from source to destination for 8, 16 or 32 bytes
+ *
+ * @param[in] pu1_dst
+ *  UWORD8 pointer to the destination
+ *
+ * @param[in] pu1_src
+ *  UWORD8 pointer to the source
+ *
+ * @param[in] num_bytes
+ *  number of bytes to copy
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_memcpy(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
+{
+    memcpy(pu1_dst, pu1_src, num_bytes);
+}
+
+
+void ihevc_memcpy_mul_8(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
+{
+    memcpy(pu1_dst, pu1_src, num_bytes);
+}
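+
+/* The _mul_8 variants carry a contract that num_bytes is a multiple of 8.
+ * These C reference versions are identical to the plain ones; the benefit is
+ * in the SIMD implementations, which can use the guarantee to skip scalar
+ * tail handling (a rationale inferred from the naming, not stated here). */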
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *   memset of 8, 16 or 32 bytes
+ *
+ * @par Description:
+ *   Sets 8-bit data for 8, 16 or 32 bytes
+ *
+ * @param[in] pu1_dst
+ *  UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ *  UWORD8 value used for memset
+ *
+ * @param[in] num_bytes
+ *  number of bytes to set
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_memset(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes)
+{
+    memset(pu1_dst, value, num_bytes);
+}
+
+
+void ihevc_memset_mul_8(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes)
+{
+    memset(pu1_dst, value, num_bytes);
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *   memset of 16-bit data for 8, 16 or 32 words
+ *
+ * @par Description:
+ *   Sets 16-bit data for 8, 16 or 32 words
+ *
+ * @param[in] pu2_dst
+ *  UWORD16 pointer to the destination
+ *
+ * @param[in] value
+ *  UWORD16 value used for memset
+ *
+ * @param[in] num_words
+ *  number of words to set
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_memset_16bit(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words)
+{
+    UWORD32 i;
+    for(i = 0; i < num_words; i++)
+    {
+        *pu2_dst++ = value;
+    }
+}
+
+
+
+void ihevc_memset_16bit_mul_8(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words)
+{
+    UWORD32 i;
+    for(i = 0; i < num_words; i++)
+    {
+        *pu2_dst++ = value;
+    }
+}
+
diff --git a/common/ihevc_mem_fns.h b/common/ihevc_mem_fns.h
new file mode 100644
index 0000000..1b37e99
--- /dev/null
+++ b/common/ihevc_mem_fns.h
@@ -0,0 +1,132 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_mem_fns.h
+*
+* @brief
+*  Function declarations for memory functions
+*
+* @author
+*  Naveen SR
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _MEM_FNS_H_
+#define _MEM_FNS_H_
+
+typedef void ihevc_memcpy_ft(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes);
+
+typedef void ihevc_memcpy_mul_8_ft(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes);
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *   memset of 8, 16 or 32 bytes
+ *
+ * @par Description:
+ *   Sets 8-bit data for 8, 16 or 32 bytes
+ *
+ * @param[in] pu1_dst
+ *  UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ *  UWORD8 value used for memset
+ *
+ * @param[in] num_bytes
+ *  number of bytes to set
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+typedef void ihevc_memset_ft(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes);
+
+typedef void ihevc_memset_mul_8_ft(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes);
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *   memset of 16-bit data for 8, 16 or 32 words
+ *
+ * @par Description:
+ *   Sets 16-bit data for 8, 16 or 32 words
+ *
+ * @param[in] pu2_dst
+ *  UWORD16 pointer to the destination
+ *
+ * @param[in] value
+ *  UWORD16 value used for memset
+ *
+ * @param[in] num_words
+ *  number of words to set
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+typedef void ihevc_memset_16bit_ft(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words);
+
+typedef void ihevc_memset_16bit_mul_8_ft(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words);
+
+/* C function declarations */
+ihevc_memcpy_ft ihevc_memcpy;
+ihevc_memcpy_mul_8_ft ihevc_memcpy_mul_8;
+ihevc_memset_ft ihevc_memset;
+ihevc_memset_mul_8_ft ihevc_memset_mul_8;
+ihevc_memset_16bit_ft ihevc_memset_16bit;
+ihevc_memset_16bit_mul_8_ft ihevc_memset_16bit_mul_8;
+
+/* A9 Q function declarations */
+ihevc_memcpy_ft ihevc_memcpy_a9q;
+ihevc_memcpy_mul_8_ft ihevc_memcpy_mul_8_a9q;
+ihevc_memset_ft ihevc_memset_a9q;
+ihevc_memset_mul_8_ft ihevc_memset_mul_8_a9q;
+ihevc_memset_16bit_ft ihevc_memset_16bit_a9q;
+ihevc_memset_16bit_mul_8_ft ihevc_memset_16bit_mul_8_a9q;
+
+/* A9 A function declarations */
+ihevc_memcpy_ft ihevc_memcpy_a9a;
+ihevc_memcpy_mul_8_ft ihevc_memcpy_mul_8_a9a;
+ihevc_memset_ft ihevc_memset_a9a;
+ihevc_memset_mul_8_ft ihevc_memset_mul_8_a9a;
+ihevc_memset_16bit_ft ihevc_memset_16bit_a9a;
+ihevc_memset_16bit_mul_8_ft ihevc_memset_16bit_mul_8_a9a;
+
+/* SSSE3 function declarations */
+ihevc_memcpy_mul_8_ft ihevc_memcpy_mul_8_ssse3;
+ihevc_memset_mul_8_ft ihevc_memset_mul_8_ssse3;
+ihevc_memset_16bit_mul_8_ft ihevc_memset_16bit_mul_8_ssse3;
+
+/* armv8 function declarations */
+ihevc_memcpy_ft ihevc_memcpy_av8;
+ihevc_memcpy_mul_8_ft ihevc_memcpy_mul_8_av8;
+ihevc_memset_ft ihevc_memset_av8;
+ihevc_memset_mul_8_ft ihevc_memset_mul_8_av8;
+ihevc_memset_16bit_ft ihevc_memset_16bit_av8;
+ihevc_memset_16bit_mul_8_ft ihevc_memset_16bit_mul_8_av8;
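+
+/* These typedefs let the decoder bind one implementation per platform through
+ * function pointers at init time. A minimal sketch (illustrative only: the
+ * real selection logic lives in the function-selector module, and
+ * ARCH_ARM_A9Q is a hypothetical configuration macro):
+ *
+ *     ihevc_memset_16bit_ft *pf_memset_16bit = &ihevc_memset_16bit;
+ *     #ifdef ARCH_ARM_A9Q
+ *         pf_memset_16bit = &ihevc_memset_16bit_a9q;
+ *     #endif
+ *     pf_memset_16bit(pu2_dst, 0, 8);
+ */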
+#endif  //_MEM_FNS_H_
diff --git a/common/ihevc_padding.c b/common/ihevc_padding.c
new file mode 100644
index 0000000..dce8464
--- /dev/null
+++ b/common/ihevc_padding.c
@@ -0,0 +1,577 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_padding.c
+*
+* @brief
+*  Contains function definitions for padding
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*   - ihevc_pad_horz_luma()
+*   - ihevc_pad_horz_chroma()
+*   - ihevc_pad_vert()
+*   - ihevc_pad_left_luma()
+*   - ihevc_pad_left_chroma()
+*   - ihevc_pad_right_luma()
+*   - ihevc_pad_right_chroma()
+*   - ihevc_pad_top()
+*   - ihevc_pad_bottom()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_mem_fns.h"
+/**
+*******************************************************************************
+*
+* @brief
+*       Padding at the top and bottom of a 2d array (vertical padding)
+*
+* @par Description:
+*       The top row is replicated pad_size times above the array and the
+*       bottom row pad_size times below it
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_vert(UWORD8 *pu1_src,
+                    WORD32 src_strd,
+                    WORD32 ht,
+                    WORD32 wd,
+                    WORD32 pad_size)
+{
+    WORD32 row;
+
+    for(row = 1; row <= pad_size; row++)
+    {
+        memcpy(pu1_src - row * src_strd, pu1_src, wd);
+        memcpy(pu1_src + (ht + row - 1) * src_strd,
+               pu1_src + (ht - 1) * src_strd, wd);
+    }
+}
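+
+/* e.g. with ht = 4 and pad_size = 2, rows -1 and -2 become copies of row 0
+ * and rows 4 and 5 become copies of row 3, extending the picture border for
+ * references that reach outside the frame. */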
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Padding (chroma block) at the left and right of a 2d array
+*
+* @par Description:
+*   The leftmost and rightmost interleaved UV pairs of each row are
+*   replicated pad_size / 2 times on either side
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_horz_chroma(UWORD8 *pu1_src,
+                           WORD32 src_strd,
+                           WORD32 ht,
+                           WORD32 wd,
+                           WORD32 pad_size)
+{
+    WORD32 row;
+    UWORD16 *pu2_src = (UWORD16 *)pu1_src;
+
+    src_strd >>= 1;
+    wd >>= 1;
+    pad_size >>= 1;
+
+    for(row = 0; row < ht; row++)
+    {
+        UWORD16 u2_uv_val;
+
+        u2_uv_val = pu2_src[0];
+        ihevc_memset_16bit(&pu2_src[-pad_size], u2_uv_val, pad_size);
+
+        u2_uv_val = pu2_src[wd - 1];
+        ihevc_memset_16bit(&pu2_src[wd], u2_uv_val, pad_size);
+
+        pu2_src += src_strd;
+    }
+}
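+
+/* With semi-planar chroma the U and V samples are interleaved, so each padded
+ * pixel is a (U, V) pair; treating the row as UWORD16 replicates the pair
+ * atomically. E.g. a row starting U0 V0 U1 V1 ... padded left by 4 bytes
+ * becomes U0 V0 U0 V0 | U0 V0 U1 V1 ... (assuming the decoder's 420SP
+ * layout). */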
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Padding (luma block) at the left and right of a 2d array
+*
+* @par Description:
+*   The leftmost and rightmost samples of each row are replicated pad_size
+*   times on either side
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_horz_luma(UWORD8 *pu1_src,
+                         WORD32 src_strd,
+                         WORD32 ht,
+                         WORD32 wd,
+                         WORD32 pad_size)
+{
+    WORD32 row;
+
+    for(row = 0; row < ht; row++)
+    {
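+        /* replicate the first and last pixel of the row into the left and
+         * right pads */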
+        memset(pu1_src - pad_size, *pu1_src, pad_size);
+        memset(pu1_src + wd, *(pu1_src + wd - 1), pad_size);
+
+        pu1_src += src_strd;
+    }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Padding at the top of a 2d array
+*
+* @par Description:
+*       The top row of a 2d array is replicated pad_size times at the top
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_top(UWORD8 *pu1_src,
+                   WORD32 src_strd,
+                   WORD32 wd,
+                   WORD32 pad_size)
+{
+    WORD32 row;
+
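+    /* copy the top row into each of the pad_size rows above it */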
+    for(row = 1; row <= pad_size; row++)
+    {
+        memcpy(pu1_src - row * src_strd, pu1_src, wd);
+    }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Padding at the bottom of a 2d array
+*
+* @par Description:
+*   The bottom row of a 2d array is replicated pad_size times at the bottom
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source (one row past the last row of the array)
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_bottom(UWORD8 *pu1_src,
+                      WORD32 src_strd,
+                      WORD32 wd,
+                      WORD32 pad_size)
+{
+    WORD32 row;
+
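+    /* pu1_src is expected to point one row past the last row of the array,
+     * so pu1_src - src_strd addresses the bottom row being replicated */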
+    for(row = 1; row <= pad_size; row++)
+    {
+        memcpy(pu1_src + (row - 1) * src_strd,
+               pu1_src - 1 * src_strd, wd);
+    }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Padding (luma block) at the left of a 2d array
+*
+* @par Description:
+*   The left column of a 2d array is replicated pad_size times at the left
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_left_luma(UWORD8 *pu1_src,
+                         WORD32 src_strd,
+                         WORD32 ht,
+                         WORD32 pad_size)
+{
+    WORD32 row;
+
+    for(row = 0; row < ht; row++)
+    {
+        memset(pu1_src - pad_size, *pu1_src, pad_size);
+
+        pu1_src += src_strd;
+    }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Padding (chroma block) at the left of a 2d array
+*
+* @par Description:
+*   The left column of a 2d chroma array is replicated pad_size times at
+*   the left, one interleaved Cb/Cr pair at a time
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_left_chroma(UWORD8 *pu1_src,
+                           WORD32 src_strd,
+                           WORD32 ht,
+                           WORD32 pad_size)
+{
+    WORD32 row;
+    WORD32 col;
+    UWORD16 *pu2_src = (UWORD16 *)pu1_src;
+
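+    /* operate on interleaved Cb/Cr pairs as 16-bit units */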
+    src_strd >>= 1;
+    pad_size >>= 1;
+
+    for(row = 0; row < ht; row++)
+    {
+        UWORD16 u2_uv_val;
+
+        u2_uv_val = pu2_src[0];
+        for(col = -pad_size; col < 0; col++)
+            pu2_src[col] = u2_uv_val;
+
+        pu2_src += src_strd;
+    }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (luma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d array is replicated pad_size times at the right
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source (just past the last column of the array)
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_right_luma(UWORD8 *pu1_src,
+                          WORD32 src_strd,
+                          WORD32 ht,
+                          WORD32 pad_size)
+{
+    WORD32 row;
+
+    for(row = 0; row < ht; row++)
+    {
+        memset(pu1_src, *(pu1_src - 1), pad_size);
+
+        pu1_src += src_strd;
+    }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (chroma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d chroma array is replicated pad_size times at the
+* right, one interleaved Cb/Cr pair at a time
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source (just past the last column of the array)
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_right_chroma(UWORD8 *pu1_src,
+                            WORD32 src_strd,
+                            WORD32 ht,
+                            WORD32 pad_size)
+{
+    WORD32 row;
+    WORD32 col;
+    UWORD16 *pu2_src = (UWORD16 *)pu1_src;
+
+    src_strd >>= 1;
+    pad_size >>= 1;
+
+    for(row = 0; row < ht; row++)
+    {
+        UWORD16 u2_uv_val;
+
+        u2_uv_val = pu2_src[-1];
+        for(col = 0; col < pad_size; col++)
+            pu2_src[col] = u2_uv_val;
+
+        pu2_src += src_strd;
+    }
+}
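+
+/* Illustrative call sequence (pu1_frm, wd, ht, strd and PAD are hypothetical
+ * names, not part of this file): to pad a luma plane by PAD pixels on every
+ * side, pad horizontally first so that the vertical padding that follows
+ * also fills the four corners:
+ *
+ *     ihevc_pad_left_luma(pu1_frm, strd, ht, PAD);
+ *     ihevc_pad_right_luma(pu1_frm + wd, strd, ht, PAD);
+ *     ihevc_pad_top(pu1_frm - PAD, strd, wd + 2 * PAD, PAD);
+ *     ihevc_pad_bottom(pu1_frm + ht * strd - PAD, strd, wd + 2 * PAD, PAD);
+ */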
+
diff --git a/common/ihevc_padding.h b/common/ihevc_padding.h
new file mode 100644
index 0000000..349ac12
--- /dev/null
+++ b/common/ihevc_padding.h
@@ -0,0 +1,209 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_padding.h
+*
+* @brief
+*  Declarations for the functions defined in ihevc_padding.c
+*
+* @author
+*  Srinivas T
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_PADDING_H_
+#define _IHEVC_PADDING_H_
+
+/*****************************************************************************/
+/* Function Declarations                                                     */
+/*****************************************************************************/
+
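+/* Each padding routine is declared through a function typedef so that the C
+ * reference implementation and the architecture-specific variants declared
+ * below (a9q, a9a, neonintr, ssse3, av8) share one signature and can be
+ * selected interchangeably */
+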
+typedef void ihevc_pad_horz_luma_ft(
+                UWORD8 *pu1_src,
+                WORD32 src_strd,
+                WORD32 ht,
+                WORD32 wd,
+                WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_horz_luma_ft(
+                UWORD16 *pu2_src,
+                WORD32 src_strd,
+                WORD32 ht,
+                WORD32 wd,
+                WORD32 pad_size);
+
+typedef void ihevc_pad_horz_chroma_ft(
+                UWORD8 *pu1_src,
+                WORD32 src_strd,
+                WORD32 ht,
+                WORD32 wd,
+                WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_horz_chroma_ft(
+                UWORD16 *pu2_src,
+                WORD32 src_strd,
+                WORD32 ht,
+                WORD32 wd,
+                WORD32 pad_size);
+
+typedef void ihevc_pad_vert_ft(
+                UWORD8 *pu1_src,
+                WORD32 src_strd,
+                WORD32 ht,
+                WORD32 wd,
+                WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_vert_ft(
+                UWORD16 *pu2_src,
+                WORD32 src_strd,
+                WORD32 ht,
+                WORD32 wd,
+                WORD32 pad_size);
+
+typedef void ihevc_pad_top_ft(UWORD8 *pu1_src,
+                              WORD32 src_strd,
+                              WORD32 wd,
+                              WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_top_ft(UWORD16 *pu2_src,
+                                  WORD32 src_strd,
+                                  WORD32 wd,
+                                  WORD32 pad_size);
+
+typedef void ihevc_pad_bottom_ft(UWORD8 *pu1_src,
+                                 WORD32 src_strd,
+                                 WORD32 wd,
+                                 WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_bottom_ft(UWORD16 *pu2_src,
+                                     WORD32 src_strd,
+                                     WORD32 wd,
+                                     WORD32 pad_size);
+
+typedef void ihevc_pad_left_luma_ft(UWORD8 *pu1_src,
+                                    WORD32 src_strd,
+                                    WORD32 ht,
+                                    WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_left_luma_ft(UWORD16 *pu2_src,
+                                        WORD32 src_strd,
+                                        WORD32 ht,
+                                        WORD32 pad_size);
+
+typedef void ihevc_pad_left_chroma_ft(UWORD8 *pu1_src,
+                                      WORD32 src_strd,
+                                      WORD32 ht,
+                                      WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_left_chroma_ft(UWORD16 *pu2_src,
+                                          WORD32 src_strd,
+                                          WORD32 ht,
+                                          WORD32 pad_size);
+
+typedef void ihevc_pad_right_luma_ft(UWORD8 *pu1_src,
+                                     WORD32 src_strd,
+                                     WORD32 ht,
+                                     WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_right_luma_ft(UWORD16 *pu2_src,
+                                         WORD32 src_strd,
+                                         WORD32 ht,
+                                         WORD32 pad_size);
+
+typedef void ihevc_pad_right_chroma_ft(UWORD8 *pu1_src,
+                                       WORD32 src_strd,
+                                       WORD32 ht,
+                                       WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_right_chroma_ft(UWORD16 *pu2_src,
+                                           WORD32 src_strd,
+                                           WORD32 ht,
+                                           WORD32 pad_size);
+
+/* C function declarations */
+ihevc_pad_horz_luma_ft ihevc_pad_horz_luma;
+ihevc_pad_horz_chroma_ft ihevc_pad_horz_chroma;
+ihevc_pad_vert_ft ihevc_pad_vert;
+ihevc_pad_top_ft ihevc_pad_top;
+ihevc_pad_bottom_ft ihevc_pad_bottom;
+ihevc_pad_left_luma_ft ihevc_pad_left_luma;
+ihevc_pad_left_chroma_ft ihevc_pad_left_chroma;
+ihevc_pad_right_luma_ft ihevc_pad_right_luma;
+ihevc_pad_right_chroma_ft ihevc_pad_right_chroma;
+
+ihevc_hbd_pad_horz_luma_ft ihevc_hbd_pad_horz_luma;
+ihevc_hbd_pad_horz_chroma_ft ihevc_hbd_pad_horz_chroma;
+ihevc_hbd_pad_vert_ft ihevc_hbd_pad_vert;
+ihevc_hbd_pad_top_ft ihevc_hbd_pad_top;
+ihevc_hbd_pad_bottom_ft ihevc_hbd_pad_bottom;
+ihevc_hbd_pad_left_luma_ft ihevc_hbd_pad_left_luma;
+ihevc_hbd_pad_left_chroma_ft ihevc_hbd_pad_left_chroma;
+ihevc_hbd_pad_right_luma_ft ihevc_hbd_pad_right_luma;
+ihevc_hbd_pad_right_chroma_ft ihevc_hbd_pad_right_chroma;
+
+/* A9 Q function declarations */
+ihevc_pad_horz_luma_ft ihevc_pad_horz_luma_a9q;
+ihevc_pad_horz_chroma_ft ihevc_pad_horz_chroma_a9q;
+ihevc_pad_vert_ft ihevc_pad_vert_a9q;
+ihevc_pad_top_ft ihevc_pad_top_a9q;
+ihevc_pad_bottom_ft ihevc_pad_bottom_a9q;
+ihevc_pad_left_luma_ft ihevc_pad_left_luma_a9q;
+ihevc_pad_left_chroma_ft ihevc_pad_left_chroma_a9q;
+ihevc_pad_right_luma_ft ihevc_pad_right_luma_a9q;
+ihevc_pad_right_chroma_ft ihevc_pad_right_chroma_a9q;
+
+/* A9 A function declarations */
+ihevc_pad_horz_luma_ft ihevc_pad_horz_luma_a9a;
+ihevc_pad_horz_chroma_ft ihevc_pad_horz_chroma_a9a;
+ihevc_pad_vert_ft ihevc_pad_vert_a9a;
+ihevc_pad_top_ft ihevc_pad_top_a9a;
+ihevc_pad_bottom_ft ihevc_pad_bottom_a9a;
+ihevc_pad_left_luma_ft ihevc_pad_left_luma_a9a;
+ihevc_pad_left_chroma_ft ihevc_pad_left_chroma_a9a;
+ihevc_pad_right_luma_ft ihevc_pad_right_luma_a9a;
+ihevc_pad_right_chroma_ft ihevc_pad_right_chroma_a9a;
+
+/* NEONINTR function declarations */
+ihevc_pad_horz_luma_ft ihevc_pad_horz_luma_neonintr;
+ihevc_pad_horz_chroma_ft ihevc_pad_horz_chroma_neonintr;
+ihevc_pad_vert_ft ihevc_pad_vert_neonintr;
+ihevc_pad_top_ft ihevc_pad_top_neonintr;
+ihevc_pad_bottom_ft ihevc_pad_bottom_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_pad_left_luma_ft ihevc_pad_left_luma_ssse3;
+ihevc_pad_left_chroma_ft ihevc_pad_left_chroma_ssse3;
+ihevc_pad_right_luma_ft ihevc_pad_right_luma_ssse3;
+ihevc_pad_right_chroma_ft ihevc_pad_right_chroma_ssse3;
+
+/* armv8 function declarations */
+ihevc_pad_horz_luma_ft ihevc_pad_horz_luma_av8;
+ihevc_pad_horz_chroma_ft ihevc_pad_horz_chroma_av8;
+ihevc_pad_vert_ft ihevc_pad_vert_av8;
+ihevc_pad_top_ft ihevc_pad_top_av8;
+ihevc_pad_bottom_ft ihevc_pad_bottom_av8;
+ihevc_pad_left_luma_ft ihevc_pad_left_luma_av8;
+ihevc_pad_left_chroma_ft ihevc_pad_left_chroma_av8;
+ihevc_pad_right_luma_ft ihevc_pad_right_luma_av8;
+ihevc_pad_right_chroma_ft ihevc_pad_right_chroma_av8;
+
+#endif /*_IHEVC_PADDING_H_*/
diff --git a/common/ihevc_quant_tables.c b/common/ihevc_quant_tables.c
new file mode 100644
index 0000000..10ccc0b
--- /dev/null
+++ b/common/ihevc_quant_tables.c
@@ -0,0 +1,471 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_quant_tables.c
+*
+* @brief
+*  Contains tables used in forward and inverse quantization
+*
+* @author
+*  100189
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_quant_tables.h"
+#include "ihevc_defs.h"
+
+
+
+/**  Default flat scaling matrix for 32x32 transform.
+ * Since all the values are the same, this 32x32 matrix is used for all
+ * transform sizes.
+ */
+const WORD16 gi2_flat_scale_mat_32x32[] =
+{
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+
+};
+
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 8x8 intra matrix
+*
+*/
+const WORD16 gi2_intra_default_scale_mat_8x8[] =
+{
+    16, 16, 16, 16, 17, 18, 21, 24,
+    16, 16, 16, 16, 17, 19, 22, 25,
+    16, 16, 17, 18, 20, 22, 25, 29,
+    16, 16, 18, 21, 24, 27, 31, 36,
+    17, 17, 20, 24, 30, 35, 41, 47,
+    18, 19, 22, 27, 35, 44, 54, 65,
+    21, 22, 25, 31, 41, 54, 70, 88,
+    24, 25, 29, 36, 47, 65, 88, 115
+};
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 8x8 inter matrix
+*
+*/
+const WORD16 gi2_inter_default_scale_mat_8x8[] =
+{
+    16, 16, 16, 16, 17, 18, 20, 24,
+    16, 16, 16, 17, 18, 20, 24, 25,
+    16, 16, 17, 18, 20, 24, 25, 28,
+    16, 17, 18, 20, 24, 25, 28, 33,
+    17, 18, 20, 24, 25, 28, 33, 41,
+    18, 20, 24, 25, 28, 33, 41, 54,
+    20, 24, 25, 28, 33, 41, 54, 71,
+    24, 25, 28, 33, 41, 54, 71, 91
+};
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 16x16 intra matrix
+*
+*/
+const WORD16 gi2_intra_default_scale_mat_16x16[] =
+{
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 21, 21, 24,  24,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 21, 21, 24,  24,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 19, 19, 22, 22, 25,  25,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 19, 19, 22, 22, 25,  25,
+    16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 22, 22, 25, 25, 29,  29,
+    16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 22, 22, 25, 25, 29,  29,
+    16, 16, 16, 16, 18, 18, 21, 21, 24, 24, 27, 27, 31, 31, 36,  36,
+    16, 16, 16, 16, 18, 18, 21, 21, 24, 24, 27, 27, 31, 31, 36,  36,
+    17, 17, 17, 17, 20, 20, 24, 24, 30, 30, 35, 35, 41, 41, 47,  47,
+    17, 17, 17, 17, 20, 20, 24, 24, 30, 30, 35, 35, 41, 41, 47,  47,
+    18, 18, 19, 19, 22, 22, 27, 27, 35, 35, 44, 44, 54, 54, 65,  65,
+    18, 18, 19, 19, 22, 22, 27, 27, 35, 35, 44, 44, 54, 54, 65,  65,
+    21, 21, 22, 22, 25, 25, 31, 31, 41, 41, 54, 54, 70, 70, 88,  88,
+    21, 21, 22, 22, 25, 25, 31, 31, 41, 41, 54, 54, 70, 70, 88,  88,
+    24, 24, 25, 25, 29, 29, 36, 36, 47, 47, 65, 65, 88, 88, 115, 115,
+    24, 24, 25, 25, 29, 29, 36, 36, 47, 47, 65, 65, 88, 88, 115, 115
+};
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 16x16 inter matrix
+*
+*/
+const WORD16 gi2_inter_default_scale_mat_16x16[] =
+{
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24,
+    16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25,
+    16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25,
+    16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28,
+    16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28,
+    16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33,
+    16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33,
+    17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41,
+    17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41,
+    18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54,
+    18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54,
+    20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54, 71, 71,
+    20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54, 71, 71,
+    24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54, 71, 71, 91, 91,
+    24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54, 71, 71, 91, 91
+};
+
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 32x32 intra matrix
+*
+*/
+const WORD16 gi2_intra_default_scale_mat_32x32[] =
+{
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 21, 21, 21, 21, 24,  24,  24,  24,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 21, 21, 21, 21, 24,  24,  24,  24,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 21, 21, 21, 21, 24,  24,  24,  24,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 21, 21, 21, 21, 24,  24,  24,  24,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 19, 19, 19, 19, 22, 22, 22, 22, 25,  25,  25,  25,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 19, 19, 19, 19, 22, 22, 22, 22, 25,  25,  25,  25,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 19, 19, 19, 19, 22, 22, 22, 22, 25,  25,  25,  25,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 19, 19, 19, 19, 22, 22, 22, 22, 25,  25,  25,  25,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22, 22, 25, 25, 25, 25, 29,  29,  29,  29,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22, 22, 25, 25, 25, 25, 29,  29,  29,  29,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22, 22, 25, 25, 25, 25, 29,  29,  29,  29,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22, 22, 25, 25, 25, 25, 29,  29,  29,  29,
+    16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24, 27, 27, 27, 27, 31, 31, 31, 31, 36,  36,  36,  36,
+    16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24, 27, 27, 27, 27, 31, 31, 31, 31, 36,  36,  36,  36,
+    16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24, 27, 27, 27, 27, 31, 31, 31, 31, 36,  36,  36,  36,
+    16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24, 27, 27, 27, 27, 31, 31, 31, 31, 36,  36,  36,  36,
+    17, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 24, 24, 24, 24, 30, 30, 30, 30, 35, 35, 35, 35, 41, 41, 41, 41, 47,  47,  47,  47,
+    17, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 24, 24, 24, 24, 30, 30, 30, 30, 35, 35, 35, 35, 41, 41, 41, 41, 47,  47,  47,  47,
+    17, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 24, 24, 24, 24, 30, 30, 30, 30, 35, 35, 35, 35, 41, 41, 41, 41, 47,  47,  47,  47,
+    17, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 24, 24, 24, 24, 30, 30, 30, 30, 35, 35, 35, 35, 41, 41, 41, 41, 47,  47,  47,  47,
+    18, 18, 18, 18, 19, 19, 19, 19, 22, 22, 22, 22, 27, 27, 27, 27, 35, 35, 35, 35, 44, 44, 44, 44, 54, 54, 54, 54, 65,  65,  65,  65,
+    18, 18, 18, 18, 19, 19, 19, 19, 22, 22, 22, 22, 27, 27, 27, 27, 35, 35, 35, 35, 44, 44, 44, 44, 54, 54, 54, 54, 65,  65,  65,  65,
+    18, 18, 18, 18, 19, 19, 19, 19, 22, 22, 22, 22, 27, 27, 27, 27, 35, 35, 35, 35, 44, 44, 44, 44, 54, 54, 54, 54, 65,  65,  65,  65,
+    18, 18, 18, 18, 19, 19, 19, 19, 22, 22, 22, 22, 27, 27, 27, 27, 35, 35, 35, 35, 44, 44, 44, 44, 54, 54, 54, 54, 65,  65,  65,  65,
+    21, 21, 21, 21, 22, 22, 22, 22, 25, 25, 25, 25, 31, 31, 31, 31, 41, 41, 41, 41, 54, 54, 54, 54, 70, 70, 70, 70, 88,  88,  88,  88,
+    21, 21, 21, 21, 22, 22, 22, 22, 25, 25, 25, 25, 31, 31, 31, 31, 41, 41, 41, 41, 54, 54, 54, 54, 70, 70, 70, 70, 88,  88,  88,  88,
+    21, 21, 21, 21, 22, 22, 22, 22, 25, 25, 25, 25, 31, 31, 31, 31, 41, 41, 41, 41, 54, 54, 54, 54, 70, 70, 70, 70, 88,  88,  88,  88,
+    21, 21, 21, 21, 22, 22, 22, 22, 25, 25, 25, 25, 31, 31, 31, 31, 41, 41, 41, 41, 54, 54, 54, 54, 70, 70, 70, 70, 88,  88,  88,  88,
+    24, 24, 24, 24, 25, 25, 25, 25, 29, 29, 29, 29, 36, 36, 36, 36, 47, 47, 47, 47, 65, 65, 65, 65, 88, 88, 88, 88, 115, 115, 115, 115,
+    24, 24, 24, 24, 25, 25, 25, 25, 29, 29, 29, 29, 36, 36, 36, 36, 47, 47, 47, 47, 65, 65, 65, 65, 88, 88, 88, 88, 115, 115, 115, 115,
+    24, 24, 24, 24, 25, 25, 25, 25, 29, 29, 29, 29, 36, 36, 36, 36, 47, 47, 47, 47, 65, 65, 65, 65, 88, 88, 88, 88, 115, 115, 115, 115,
+    24, 24, 24, 24, 25, 25, 25, 25, 29, 29, 29, 29, 36, 36, 36, 36, 47, 47, 47, 47, 65, 65, 65, 65, 88, 88, 88, 88, 115, 115, 115, 115
+};
+
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 32x32 inter matrix
+*
+*/
+const WORD16 gi2_inter_default_scale_mat_32x32[] =
+{
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28,
+    16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28,
+    16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33,
+    16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33,
+    16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33,
+    16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33,
+    17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41,
+    17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41,
+    17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41,
+    17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41,
+    18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54,
+    18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54,
+    18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54,
+    18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54,
+    20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71,
+    20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71,
+    20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71,
+    20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71,
+    24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71, 91, 91, 91, 91,
+    24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71, 91, 91, 91, 91,
+    24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71, 91, 91, 91, 91,
+    24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71, 91, 91, 91, 91
+};
+
+
+
+/**  Default flat rescaling matrix for 32x32 transform,
+  * used for quantization.
+  * value[i] = ceil(((1 << 15) - 1) / gi2_flat_scale_mat_32x32[i])
+  * Since all the values are the same, this 32x32 matrix is used for all
+  * transform sizes.
+  */
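+/* e.g. every entry of gi2_flat_scale_mat_32x32 is 16, so every entry below
+ * is ceil(32767 / 16) = 2048 */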
+
+const WORD16 gi2_flat_rescale_mat_32x32[] =
+{
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048
+
+};
+
+
+/**
+* Default rescaling matrix for the 8x8 intra transform as defined by the
+* standard, used for quantization
+* value[i] = ceil(((1 << 15) - 1) / gi2_intra_default_scale_mat_8x8[i])
+*/
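+/* e.g. the bottom-right entry of gi2_intra_default_scale_mat_8x8 is 115,
+ * which gives ceil(32767 / 115) = 285 below */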
+
+const WORD16 gi2_intra_default_rescale_mat_8x8[] =
+{
+    2048, 2048, 2048, 2048, 1928, 1821, 1561, 1366,
+    2048, 2048, 2048, 2048, 1928, 1725, 1490, 1311,
+    2048, 2048, 1928, 1821, 1639, 1490, 1311, 1130,
+    2048, 2048, 1821, 1561, 1366, 1214, 1057, 911,
+    1928, 1928, 1639, 1366, 1093, 937,  800,  698,
+    1821, 1725, 1490, 1214, 937,  745,  607,  505,
+    1561, 1490, 1311, 1057, 800,  607,  469,  373,
+    1366, 1311, 1130, 911,  698,  505,  373,  285
+};
+
+/**
+*
+* @brief default rescaling matrix as specified by standard
+* 8x8 inter matrix
+* value[i] = ceil(((1 << 15) - 1) / gi2_inter_default_scale_mat_8x8[i])
+*
+*/
+const WORD16 gi2_inter_default_rescale_mat_8x8[] =
+{
+    2048, 2048, 2048, 2048, 1928, 1821, 1639, 1366,
+    2048, 2048, 2048, 1928, 1821, 1639, 1366, 1311,
+    2048, 2048, 1928, 1821, 1639, 1366, 1311, 1171,
+    2048, 1928, 1821, 1639, 1366, 1311, 1171, 993,
+    1928, 1821, 1639, 1366, 1311, 1171, 993,  800,
+    1821, 1639, 1366, 1311, 1171, 993,  800,  607,
+    1639, 1366, 1311, 1171, 993,  800,  607,  462,
+    1366, 1311, 1171, 993,  800,  607,  462,  361
+};
+
+/**
+*
+* @brief default rescaling matrix as specified by standard
+* 16x16 intra matrix
+*
+*/
+const WORD16 gi2_intra_default_rescale_mat_16x16[] =
+{
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1561, 1561, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1561, 1561, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1725, 1725, 1490, 1490, 1311, 1311,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1725, 1725, 1490, 1490, 1311, 1311,
+    2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1490, 1490, 1311, 1311, 1130, 1130,
+    2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1490, 1490, 1311, 1311, 1130, 1130,
+    2048, 2048, 2048, 2048, 1821, 1821, 1561, 1561, 1366, 1366, 1214, 1214, 1057, 1057, 911,  911,
+    2048, 2048, 2048, 2048, 1821, 1821, 1561, 1561, 1366, 1366, 1214, 1214, 1057, 1057, 911,  911,
+    1928, 1928, 1928, 1928, 1639, 1639, 1366, 1366, 1093, 1093, 937,  937,  800,  800,  698,  698,
+    1928, 1928, 1928, 1928, 1639, 1639, 1366, 1366, 1093, 1093, 937,  937,  800,  800,  698,  698,
+    1821, 1821, 1725, 1725, 1490, 1490, 1214, 1214, 937,  937,  745,  745,  607,  607,  505,  505,
+    1821, 1821, 1725, 1725, 1490, 1490, 1214, 1214, 937,  937,  745,  745,  607,  607,  505,  505,
+    1561, 1561, 1490, 1490, 1311, 1311, 1057, 1057, 800,  800,  607,  607,  469,  469,  373,  373,
+    1561, 1561, 1490, 1490, 1311, 1311, 1057, 1057, 800,  800,  607,  607,  469,  469,  373,  373,
+    1366, 1366, 1311, 1311, 1130, 1130, 911,  911,  698,  698,  505,  505,  373,  373,  285,  285,
+    1366, 1366, 1311, 1311, 1130, 1130, 911,  911,  698,  698,  505,  505,  373,  373,  285,  285
+};
+
+/**
+*
+* @brief default rescaling matrix as specified by standard
+* 16x16 inter matrix
+*
+*/
+const WORD16 gi2_inter_default_rescale_mat_16x16[] =
+{
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311,
+    2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311,
+    2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,
+    2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,
+    2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,  993,  993,
+    2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,  993,  993,
+    1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,  993,  993,  800,  800,
+    1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,  993,  993,  800,  800,
+    1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,  993,  993,  800,  800,  607,  607,
+    1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,  993,  993,  800,  800,  607,  607,
+    1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,  993,  993,  800,  800,  607,  607,  462,  462,
+    1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,  993,  993,  800,  800,  607,  607,  462,  462,
+    1366, 1366, 1311, 1311, 1171, 1171,  993,  993,  800,  800,  607,  607,  462,  462,  361,  361,
+    1366, 1366, 1311, 1311, 1171, 1171,  993,  993,  800,  800,  607,  607,  462,  462,  361,  361
+};
+
+/**
+*
+* @brief default rescaling matrix as specified by standard
+* 32x32 intra matrix
+*
+*/
+const WORD16 gi2_intra_default_rescale_mat_32x32[] =
+{
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366, 1214, 1214, 1214, 1214, 1057, 1057, 1057, 1057,  911,  911,  911,  911,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366, 1214, 1214, 1214, 1214, 1057, 1057, 1057, 1057,  911,  911,  911,  911,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366, 1214, 1214, 1214, 1214, 1057, 1057, 1057, 1057,  911,  911,  911,  911,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366, 1214, 1214, 1214, 1214, 1057, 1057, 1057, 1057,  911,  911,  911,  911,
+    1928, 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1093, 1093, 1093, 1093,  937,  937,  937,  937,  800,  800,  800,  800,  698,  698,  698,  698,
+    1928, 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1093, 1093, 1093, 1093,  937,  937,  937,  937,  800,  800,  800,  800,  698,  698,  698,  698,
+    1928, 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1093, 1093, 1093, 1093,  937,  937,  937,  937,  800,  800,  800,  800,  698,  698,  698,  698,
+    1928, 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1093, 1093, 1093, 1093,  937,  937,  937,  937,  800,  800,  800,  800,  698,  698,  698,  698,
+    1821, 1821, 1821, 1821, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1214, 1214, 1214, 1214,  937,  937,  937,  937,  745,  745,  745,  745,  607,  607,  607,  607,  505,  505,  505,  505,
+    1821, 1821, 1821, 1821, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1214, 1214, 1214, 1214,  937,  937,  937,  937,  745,  745,  745,  745,  607,  607,  607,  607,  505,  505,  505,  505,
+    1821, 1821, 1821, 1821, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1214, 1214, 1214, 1214,  937,  937,  937,  937,  745,  745,  745,  745,  607,  607,  607,  607,  505,  505,  505,  505,
+    1821, 1821, 1821, 1821, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1214, 1214, 1214, 1214,  937,  937,  937,  937,  745,  745,  745,  745,  607,  607,  607,  607,  505,  505,  505,  505,
+    1561, 1561, 1561, 1561, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1057, 1057, 1057, 1057,  800,  800,  800,  800,  607,  607,  607,  607,  469,  469,  469,  469,  373,  373,  373,  373,
+    1561, 1561, 1561, 1561, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1057, 1057, 1057, 1057,  800,  800,  800,  800,  607,  607,  607,  607,  469,  469,  469,  469,  373,  373,  373,  373,
+    1561, 1561, 1561, 1561, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1057, 1057, 1057, 1057,  800,  800,  800,  800,  607,  607,  607,  607,  469,  469,  469,  469,  373,  373,  373,  373,
+    1561, 1561, 1561, 1561, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1057, 1057, 1057, 1057,  800,  800,  800,  800,  607,  607,  607,  607,  469,  469,  469,  469,  373,  373,  373,  373,
+    1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,  911,  911,  911,  911,  698,  698,  698,  698,  505,  505,  505,  505,  373,  373,  373,  373,  285,  285,  285,  285,
+    1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,  911,  911,  911,  911,  698,  698,  698,  698,  505,  505,  505,  505,  373,  373,  373,  373,  285,  285,  285,  285,
+    1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,  911,  911,  911,  911,  698,  698,  698,  698,  505,  505,  505,  505,  373,  373,  373,  373,  285,  285,  285,  285,
+    1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,  911,  911,  911,  911,  698,  698,  698,  698,  505,  505,  505,  505,  373,  373,  373,  373,  285,  285,  285,  285
+};
+
+/**
+*
+* @brief default rescaling matrix as specified by standard
+* 32x32 inter matrix
+*
+*/
+const WORD16 gi2_inter_default_rescale_mat_32x32[] =
+{
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,
+    2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,
+    2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,
+    2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,
+    2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,
+    2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,
+    1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,
+    1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,
+    1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,
+    1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,
+    1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,
+    1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,
+    1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,
+    1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,
+    1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,  462,  462,  462,  462,
+    1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,  462,  462,  462,  462,
+    1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,  462,  462,  462,  462,
+    1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,  462,  462,  462,  462,
+    1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,  462,  462,  462,  462,  361,  361,  361,  361,
+    1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,  462,  462,  462,  462,  361,  361,  361,  361,
+    1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,  462,  462,  462,  462,  361,  361,  361,  361,
+    1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,  993,  993,  993,  993,  800,  800,  800,  800,  607,  607,  607,  607,  462,  462,  462,  462,  361,  361,  361,  361
+};
+
diff --git a/common/ihevc_quant_tables.h b/common/ihevc_quant_tables.h
new file mode 100644
index 0000000..76d1eea
--- /dev/null
+++ b/common/ihevc_quant_tables.h
@@ -0,0 +1,66 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_quant_tables.h
+*
+* @brief
+*  Tables for forward and inverse quantization
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_QUANT_TABLES_H_
+#define _IHEVC_QUANT_TABLES_H_
+
+extern const WORD16 gi2_flat_scale_mat_32x32[];
+
+extern const WORD16 gi2_intra_default_scale_mat_8x8[];
+
+extern const WORD16 gi2_inter_default_scale_mat_8x8[];
+
+extern const WORD16 gi2_intra_default_scale_mat_16x16[];
+
+extern const WORD16 gi2_inter_default_scale_mat_16x16[];
+
+extern const WORD16 gi2_intra_default_scale_mat_32x32[];
+
+extern const WORD16 gi2_inter_default_scale_mat_32x32[];
+
+
+extern const WORD16 gi2_flat_rescale_mat_32x32[];
+
+extern const WORD16 gi2_intra_default_rescale_mat_8x8[];
+
+extern const WORD16 gi2_inter_default_rescale_mat_8x8[];
+
+extern const WORD16 gi2_intra_default_rescale_mat_16x16[];
+
+extern const WORD16 gi2_inter_default_rescale_mat_16x16[];
+
+extern const WORD16 gi2_intra_default_rescale_mat_32x32[];
+
+extern const WORD16 gi2_inter_default_rescale_mat_32x32[];
+
+#endif /*_IHEVC_QUANT_TABLES_H_*/
diff --git a/common/ihevc_recon.c b/common/ihevc_recon.c
new file mode 100644
index 0000000..9d7015e
--- /dev/null
+++ b/common/ihevc_recon.c
@@ -0,0 +1,461 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_recon.c
+ *
+ * @brief
+ *  Function definitions for reconstruction
+ *
+ * @author
+ *  Ittiam
+ *
+ * @par List of Functions:
+ *  - ihevc_recon_4x4_ttype1()
+ *  - ihevc_recon_4x4()
+ *  - ihevc_recon_8x8()
+ *  - ihevc_recon_16x16()
+ *  - ihevc_recon_32x32()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions here are replicated from ihevc.c and modified to */
+/* include reconstruction */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs reconstruction for  4x4 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of a 4x4 input block by adding the prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Bitmask of zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_recon_4x4_ttype1(WORD16 *pi2_src,
+                            UWORD8 *pu1_pred,
+                            UWORD8 *pu1_dst,
+                            WORD32 src_strd,
+                            WORD32 pred_strd,
+                            WORD32 dst_strd,
+                            WORD32 zero_cols)
+{
+    WORD32 i, j;
+    WORD32 trans_size;
+
+    trans_size = TRANS_SIZE_4;
+
+    /* Reconstruction */
+
+    for(i = 0; i < trans_size; i++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+            }
+        }
+        else
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] =
+                                CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+            }
+        }
+        pi2_src++;
+        pu1_dst++;
+        pu1_pred++;
+        zero_cols = zero_cols >> 1;
+    }
+}
+
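+/* A minimal usage sketch (not part of the decoder) showing the calling
+ * convention, assuming a fully coded 4x4 residual so that zero_cols is 0;
+ * all names below are illustrative only: */
+#if 0
+static void example_recon_4x4_ttype1(void)
+{
+    WORD16 ai2_resid[4 * 4] = { 0 }; /* inverse-transform output, stride 4 */
+    UWORD8 au1_pred[4 * 4]  = { 0 }; /* prediction samples, stride 4 */
+    UWORD8 au1_dst[4 * 4];           /* reconstructed samples, stride 4 */
+
+    ihevc_recon_4x4_ttype1(ai2_resid, au1_pred, au1_dst, 4, 4, 4, 0);
+}
+#endif
+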
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs reconstruction for  4x4 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of a 4x4 input block by adding the prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Bitmask of zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_recon_4x4(WORD16 *pi2_src,
+                     UWORD8 *pu1_pred,
+                     UWORD8 *pu1_dst,
+                     WORD32 src_strd,
+                     WORD32 pred_strd,
+                     WORD32 dst_strd,
+                     WORD32 zero_cols)
+{
+    WORD32 i, j;
+    WORD32 trans_size;
+
+    trans_size = TRANS_SIZE_4;
+
+    /* Reconstruction */
+
+    for(i = 0; i < trans_size; i++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+            }
+        }
+        else
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] =
+                                CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+            }
+        }
+        pi2_src++;
+        pu1_dst++;
+        pu1_pred++;
+        zero_cols = zero_cols >> 1;
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs reconstruction for  8x8 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of an 8x8 input block by adding the prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 8x8 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ *  Output 8x8 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Bitmask of zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_recon_8x8(WORD16 *pi2_src,
+                     UWORD8 *pu1_pred,
+                     UWORD8 *pu1_dst,
+                     WORD32 src_strd,
+                     WORD32 pred_strd,
+                     WORD32 dst_strd,
+                     WORD32 zero_cols)
+{
+    WORD32 i, j;
+    WORD32 trans_size;
+
+    trans_size = TRANS_SIZE_8;
+
+    /* Reconstruction */
+
+    for(i = 0; i < trans_size; i++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+            }
+        }
+        else
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] =
+                                CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+            }
+        }
+        pi2_src++;
+        pu1_dst++;
+        pu1_pred++;
+        zero_cols = zero_cols >> 1;
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs reconstruction for  16x16 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of a 16x16 input block by adding the prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 16x16 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 16x16 block
+ *
+ * @param[out] pu1_dst
+ *  Output 16x16 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Bitmask of zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_recon_16x16(WORD16 *pi2_src,
+                       UWORD8 *pu1_pred,
+                       UWORD8 *pu1_dst,
+                       WORD32 src_strd,
+                       WORD32 pred_strd,
+                       WORD32 dst_strd,
+                       WORD32 zero_cols)
+{
+    WORD32 i, j;
+    WORD32 trans_size;
+
+    trans_size = TRANS_SIZE_16;
+
+    /* Reconstruction */
+
+    for(i = 0; i < trans_size; i++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+            }
+        }
+        else
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] =
+                                CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+            }
+        }
+        pi2_src++;
+        pu1_dst++;
+        pu1_pred++;
+        zero_cols = zero_cols >> 1;
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs reconstruction for  32x32 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of a 32x32 input block by adding the prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 32x32 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 32x32 block
+ *
+ * @param[out] pu1_dst
+ *  Output 32x32 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Bitmask of zero columns in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_recon_32x32(WORD16 *pi2_src,
+                       UWORD8 *pu1_pred,
+                       UWORD8 *pu1_dst,
+                       WORD32 src_strd,
+                       WORD32 pred_strd,
+                       WORD32 dst_strd,
+                       WORD32 zero_cols)
+{
+    WORD32 i, j;
+    WORD32 trans_size;
+
+    trans_size = TRANS_SIZE_32;
+
+    /* Reconstruction */
+
+    for(i = 0; i < trans_size; i++)
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+            }
+        }
+        else
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] =
+                                CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+            }
+        }
+        pi2_src++;
+        pu1_dst++;
+        pu1_pred++;
+        zero_cols = zero_cols >> 1;
+    }
+}
+
diff --git a/common/ihevc_recon.h b/common/ihevc_recon.h
new file mode 100644
index 0000000..37711ec
--- /dev/null
+++ b/common/ihevc_recon.h
@@ -0,0 +1,124 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_recon.h
+*
+* @brief
+*  Function declarations for reconstruction
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_RECON_H_
+#define _IHEVC_RECON_H_
+
+typedef void ihevc_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+                                       UWORD8 *pu1_pred,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 src_strd,
+                                       WORD32 pred_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 zero_cols);
+typedef void ihevc_hbd_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+                                           UWORD16 *pu2_pred,
+                                           UWORD16 *pu2_dst,
+                                           WORD32 src_strd,
+                                           WORD32 pred_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 zero_cols,
+                                           UWORD8 bit_depth);
+typedef void ihevc_recon_4x4_ft(WORD16 *pi2_src,
+                                UWORD8 *pu1_pred,
+                                UWORD8 *pu1_dst,
+                                WORD32 src_strd,
+                                WORD32 pred_strd,
+                                WORD32 dst_strd,
+                                WORD32 zero_cols);
+typedef void ihevc_hbd_recon_4x4_ft(WORD16 *pi2_src,
+                                    UWORD16 *pu2_pred,
+                                    UWORD16 *pu2_dst,
+                                    WORD32 src_strd,
+                                    WORD32 pred_strd,
+                                    WORD32 dst_strd,
+                                    WORD32 zero_cols,
+                                    UWORD8 bit_depth);
+typedef void ihevc_recon_8x8_ft(WORD16 *pi2_src,
+                                UWORD8 *pu1_pred,
+                                UWORD8 *pu1_dst,
+                                WORD32 src_strd,
+                                WORD32 pred_strd,
+                                WORD32 dst_strd,
+                                WORD32 zero_cols);
+typedef void ihevc_hbd_recon_8x8_ft(WORD16 *pi2_src,
+                                    UWORD16 *pu2_pred,
+                                    UWORD16 *pu2_dst,
+                                    WORD32 src_strd,
+                                    WORD32 pred_strd,
+                                    WORD32 dst_strd,
+                                    WORD32 zero_cols,
+                                    UWORD8 bit_depth);
+typedef void ihevc_recon_16x16_ft(WORD16 *pi2_src,
+                                  UWORD8 *pu1_pred,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 src_strd,
+                                  WORD32 pred_strd,
+                                  WORD32 dst_strd,
+                                  WORD32 zero_cols);
+typedef void ihevc_hbd_recon_16x16_ft(WORD16 *pi2_src,
+                                      UWORD16 *pu2_pred,
+                                      UWORD16 *pu2_dst,
+                                      WORD32 src_strd,
+                                      WORD32 pred_strd,
+                                      WORD32 dst_strd,
+                                      WORD32 zero_cols,
+                                      UWORD8 bit_depth);
+typedef void ihevc_recon_32x32_ft(WORD16 *pi2_src,
+                                  UWORD8 *pu1_pred,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 src_strd,
+                                  WORD32 pred_strd,
+                                  WORD32 dst_strd,
+                                  WORD32 zero_cols);
+typedef void ihevc_hbd_recon_32x32_ft(WORD16 *pi2_src,
+                                      UWORD16 *pu2_pred,
+                                      UWORD16 *pu2_dst,
+                                      WORD32 src_strd,
+                                      WORD32 pred_strd,
+                                      WORD32 dst_strd,
+                                      WORD32 zero_cols,
+                                      UWORD8 bit_depth);
+
+ihevc_recon_4x4_ttype1_ft ihevc_recon_4x4_ttype1;
+ihevc_hbd_recon_4x4_ttype1_ft ihevc_hbd_recon_4x4_ttype1;
+ihevc_recon_4x4_ft ihevc_recon_4x4;
+ihevc_hbd_recon_4x4_ft ihevc_hbd_recon_4x4;
+ihevc_recon_8x8_ft ihevc_recon_8x8;
+ihevc_hbd_recon_8x8_ft ihevc_hbd_recon_8x8;
+ihevc_recon_16x16_ft ihevc_recon_16x16;
+ihevc_hbd_recon_16x16_ft ihevc_hbd_recon_16x16;
+ihevc_recon_32x32_ft ihevc_recon_32x32;
+ihevc_hbd_recon_32x32_ft ihevc_hbd_recon_32x32;
+
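+/* The _ft typedefs above let a function selector bind an implementation at
+ * run time (e.g. C vs NEON); a sketch of that pattern, with the pointer
+ * name purely illustrative: */
+#if 0
+ihevc_recon_4x4_ft *pf_recon_4x4 = &ihevc_recon_4x4;
+/* later: pf_recon_4x4(pi2_src, pu1_pred, pu1_dst, 4, 4, 4, zero_cols); */
+#endif
+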
+#endif /*_IHEVC_RECON_H_*/
diff --git a/common/ihevc_sao.c b/common/ihevc_sao.c
new file mode 100644
index 0000000..3b41f0d
--- /dev/null
+++ b/common/ihevc_sao.c
@@ -0,0 +1,1374 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_sao.c
+*
+* @brief
+*  Contains leaf-level function definitions for the sample adaptive offset (SAO) process
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*   - ihevc_sao_band_offset_luma()
+*   - ihevc_sao_band_offset_chroma()
+*   - ihevc_sao_edge_offset_class0()
+*   - ihevc_sao_edge_offset_class0_chroma()
+*   - ihevc_sao_edge_offset_class1()
+*   - ihevc_sao_edge_offset_class1_chroma()
+*   - ihevc_sao_edge_offset_class2()
+*   - ihevc_sao_edge_offset_class2_chroma()
+*   - ihevc_sao_edge_offset_class3()
+*   - ihevc_sao_edge_offset_class3_chroma()
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_sao.h"
+
+#define NUM_BAND_TABLE  32
+
+const WORD32 gi4_ihevc_table_edge_idx[5] = { 1, 2, 0, 3, 4 };
+/**
+ * pu1_avail is an array of flags, one per neighbouring block, specifying whether that block is available
+ * pu1_avail[0] - left
+ * pu1_avail[1] - right
+ * pu1_avail[2] - top
+ * pu1_avail[3] - bottom
+ * pu1_avail[4] - top-left
+ * pu1_avail[5] - top-right
+ * pu1_avail[6] - bottom-left
+ * pu1_avail[7] - bottom-right
+ */
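+/* For example, a CTB at the top-left corner of the picture would have
+ * pu1_avail[0] (left), pu1_avail[2] (top), pu1_avail[4] (top-left),
+ * pu1_avail[5] (top-right) and pu1_avail[6] (bottom-left) all 0, with the
+ * remaining flags set. */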
+
+
+void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
+                                WORD32 src_strd,
+                                UWORD8 *pu1_src_left,
+                                UWORD8 *pu1_src_top,
+                                UWORD8 *pu1_src_top_left,
+                                WORD32 sao_band_pos,
+                                WORD8 *pi1_sao_offset,
+                                WORD32 wd,
+                                WORD32 ht)
+{
+    WORD32 band_shift;
+    WORD32 band_table[NUM_BAND_TABLE];
+    WORD32 i;
+    WORD32 row, col;
+
+    /* Update the left, top and top-left neighbour arrays */
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
+    }
+    pu1_src_top_left[0] = pu1_src_top[wd - 1];
+    for(col = 0; col < wd; col++)
+    {
+        pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
+    }
+
+    band_shift = BIT_DEPTH_LUMA - 5;
+    for(i = 0; i < NUM_BAND_TABLE; i++)
+    {
+        band_table[i] = 0;
+    }
+    for(i = 0; i < 4; i++)
+    {
+        band_table[(i + sao_band_pos) & 31] = i + 1;
+    }
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            WORD32 band_idx;
+
+            band_idx = band_table[pu1_src[col] >> band_shift];
+            pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[band_idx], 0, (1 << (band_shift + 5)) - 1);
+        }
+        pu1_src += src_strd;
+    }
+}
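+
+/* A worked example of the band classification above, assuming 8-bit luma
+ * (band_shift = 3) and sao_band_pos = 10, so offsets apply to bands 10..13:
+ * pixel 85  -> band 85 >> 3 = 10 -> band_table[10] = 1 -> pi1_sao_offset[1] added;
+ * pixel 200 -> band 200 >> 3 = 25 -> band_table[25] = 0 -> pi1_sao_offset[0]
+ * (zero by convention) added, leaving the pixel unchanged. */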
+
+
+
+/* input 'wd' has to be for the interleaved block and not for each color component */
+void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
+                                  WORD32 src_strd,
+                                  UWORD8 *pu1_src_left,
+                                  UWORD8 *pu1_src_top,
+                                  UWORD8 *pu1_src_top_left,
+                                  WORD32 sao_band_pos_u,
+                                  WORD32 sao_band_pos_v,
+                                  WORD8 *pi1_sao_offset_u,
+                                  WORD8 *pi1_sao_offset_v,
+                                  WORD32 wd,
+                                  WORD32 ht)
+{
+    WORD32 band_shift;
+    WORD32 band_table_u[NUM_BAND_TABLE];
+    WORD32 band_table_v[NUM_BAND_TABLE];
+    WORD32 i;
+    WORD32 row, col;
+
+    /* Update the left, top and top-left neighbour arrays */
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
+        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
+    }
+    pu1_src_top_left[0] = pu1_src_top[wd - 2];
+    pu1_src_top_left[1] = pu1_src_top[wd - 1];
+    for(col = 0; col < wd; col++)
+    {
+        pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
+    }
+
+
+    band_shift = BIT_DEPTH_CHROMA - 5;
+    for(i = 0; i < NUM_BAND_TABLE; i++)
+    {
+        band_table_u[i] = 0;
+        band_table_v[i] = 0;
+    }
+    for(i = 0; i < 4; i++)
+    {
+        band_table_u[(i + sao_band_pos_u) & 31] = i + 1;
+        band_table_v[(i + sao_band_pos_v) & 31] = i + 1;
+    }
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            WORD32 band_idx;
+            WORD8 *pi1_sao_offset;
+
+            pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;
+            band_idx = (0 == col % 2) ? band_table_u[pu1_src[col] >> band_shift] : band_table_v[pu1_src[col] >> band_shift];
+            pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[band_idx], 0, (1 << (band_shift + 5)) - 1);
+        }
+        pu1_src += src_strd;
+    }
+}
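+
+/* Note: pu1_src holds interleaved Cb/Cr samples (U in even columns, V in
+ * odd columns), which is why the band table and offsets are selected per
+ * column with (0 == col % 2). */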
+
+
+
+/* Horizontal filtering */
+void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
+                                  WORD32 src_strd,
+                                  UWORD8 *pu1_src_left,
+                                  UWORD8 *pu1_src_top,
+                                  UWORD8 *pu1_src_top_left,
+                                  UWORD8 *pu1_src_top_right,
+                                  UWORD8 *pu1_src_bot_left,
+                                  UWORD8 *pu1_avail,
+                                  WORD8 *pi1_sao_offset,
+                                  WORD32 wd,
+                                  WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 au1_mask[MAX_CTB_SIZE];
+    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE];
+    WORD8 u1_sign_left, u1_sign_right;
+    WORD32 bit_depth;
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+    bit_depth = BIT_DEPTH_LUMA;
+
+    /* Initialize the mask values */
+    memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+    /* Update the top and top-left arrays and save the left column for later */
+    *pu1_src_top_left = pu1_src_top[wd - 1];
+    for(row = 0; row < ht; row++)
+    {
+        au1_src_left_tmp[row] = pu1_src[row * src_strd + wd - 1];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
+    }
+
+    /* Update masks based on the availability flags */
+    if(0 == pu1_avail[0])
+    {
+        au1_mask[0] = 0;
+    }
+    if(0 == pu1_avail[1])
+    {
+        au1_mask[wd - 1] = 0;
+    }
+
+    /* Filtering is done in place on the source buffer; the neighbour context needed later was saved above */
+    {
+        for(row = 0; row < ht; row++)
+        {
+            u1_sign_left = SIGN(pu1_src[0] - pu1_src_left[row]);
+            for(col = 0; col < wd; col++)
+            {
+                WORD32 edge_idx;
+
+                u1_sign_right = SIGN(pu1_src[col] - pu1_src[col + 1]);
+                edge_idx = 2 + u1_sign_left + u1_sign_right;
+                u1_sign_left = -u1_sign_right;
+
+                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];
+
+                if(0 != edge_idx)
+                {
+                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+                }
+            }
+
+            pu1_src += src_strd;
+        }
+    }
+
+    /* Update left array */
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[row] = au1_src_left_tmp[row];
+    }
+
+}
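+
+/* A worked example of the edge classification above: for a horizontal
+ * triple (left, cur, right) = (100, 90, 100), SIGN(90 - 100) = -1 on both
+ * sides, so edge_idx = 2 - 1 - 1 = 0 and gi4_ihevc_table_edge_idx[0] = 1,
+ * i.e. the sample is a local minimum and pi1_sao_offset[1] is applied. */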
+
+
+
+
+/* input 'wd' has to be for the interleaved block and not for each color component */
+void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src,
+                                         WORD32 src_strd,
+                                         UWORD8 *pu1_src_left,
+                                         UWORD8 *pu1_src_top,
+                                         UWORD8 *pu1_src_top_left,
+                                         UWORD8 *pu1_src_top_right,
+                                         UWORD8 *pu1_src_bot_left,
+                                         UWORD8 *pu1_avail,
+                                         WORD8 *pi1_sao_offset_u,
+                                         WORD8 *pi1_sao_offset_v,
+                                         WORD32 wd,
+                                         WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 au1_mask[MAX_CTB_SIZE];
+    UWORD8 au1_src_left_tmp[2 * MAX_CTB_SIZE];
+    WORD8 u1_sign_left_u, u1_sign_right_u;
+    WORD8 u1_sign_left_v, u1_sign_right_v;
+    WORD32 bit_depth;
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+    bit_depth = BIT_DEPTH_CHROMA;
+
+    /* Initialize the mask values */
+    memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+    /* Update left, top and top-left arrays */
+    pu1_src_top_left[0] = pu1_src_top[wd - 2];
+    pu1_src_top_left[1] = pu1_src_top[wd - 1];
+    for(row = 0; row < ht; row++)
+    {
+        au1_src_left_tmp[2 * row] = pu1_src[row * src_strd + wd - 2];
+        au1_src_left_tmp[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
+    }
+
+    /* Update masks based on the availability flags */
+    if(0 == pu1_avail[0])
+    {
+        au1_mask[0] = 0;
+    }
+    if(0 == pu1_avail[1])
+    {
+        au1_mask[(wd - 1) >> 1] = 0;
+    }
+
+    /* Filtering is done in place on the source buffer; the neighbour context needed later was saved above */
+    {
+        for(row = 0; row < ht; row++)
+        {
+            u1_sign_left_u = SIGN(pu1_src[0] - pu1_src_left[2 * row]);
+            u1_sign_left_v = SIGN(pu1_src[1] - pu1_src_left[2 * row + 1]);
+            for(col = 0; col < wd; col++)
+            {
+                WORD32 edge_idx;
+                WORD8 *pi1_sao_offset;
+
+                if(0 == col % 2)
+                {
+                    pi1_sao_offset = pi1_sao_offset_u;
+                    u1_sign_right_u = SIGN(pu1_src[col] - pu1_src[col + 2]);
+                    edge_idx = 2 + u1_sign_left_u + u1_sign_right_u;
+                    u1_sign_left_u = -u1_sign_right_u;
+                }
+                else
+                {
+                    pi1_sao_offset = pi1_sao_offset_v;
+                    u1_sign_right_v = SIGN(pu1_src[col] - pu1_src[col + 2]);
+                    edge_idx = 2 + u1_sign_left_v + u1_sign_right_v;
+                    u1_sign_left_v = -u1_sign_right_v;
+                }
+
+                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];
+
+                if(0 != edge_idx)
+                {
+                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+                }
+            }
+
+            pu1_src += src_strd;
+        }
+    }
+
+    for(row = 0; row < 2 * ht; row++)
+    {
+        pu1_src_left[row] = au1_src_left_tmp[row];
+    }
+
+}
+
+
+
+/* Vertical filtering */
+void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
+                                  WORD32 src_strd,
+                                  UWORD8 *pu1_src_left,
+                                  UWORD8 *pu1_src_top,
+                                  UWORD8 *pu1_src_top_left,
+                                  UWORD8 *pu1_src_top_right,
+                                  UWORD8 *pu1_src_bot_left,
+                                  UWORD8 *pu1_avail,
+                                  WORD8 *pi1_sao_offset,
+                                  WORD32 wd,
+                                  WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 au1_mask[MAX_CTB_SIZE];
+    UWORD8 au1_src_top_tmp[MAX_CTB_SIZE];
+    WORD8 au1_sign_up[MAX_CTB_SIZE];
+    WORD8 u1_sign_down;
+    WORD32 bit_depth;
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+
+    bit_depth = BIT_DEPTH_LUMA;
+
+    /* Initialize the mask values */
+    memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+    /* Update left, top and top-left arrays */
+    *pu1_src_top_left = pu1_src_top[wd - 1];
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[row] = pu1_src[row * src_strd + wd - 1];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+    }
+
+    /* Update height and source pointers based on the availability flags */
+    if(0 == pu1_avail[2])
+    {
+        pu1_src += src_strd;
+        ht--;
+        for(col = 0; col < wd; col++)
+        {
+            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - src_strd]);
+        }
+    }
+    else
+    {
+        for(col = 0; col < wd; col++)
+        {
+            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col]);
+        }
+    }
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+    }
+
+    /* Filtering is done in place on the source buffer; the neighbour context needed later was saved above */
+    {
+        for(row = 0; row < ht; row++)
+        {
+            for(col = 0; col < wd; col++)
+            {
+                WORD32 edge_idx;
+
+                u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + src_strd]);
+                edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
+                au1_sign_up[col] = -u1_sign_down;
+
+                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];
+
+                if(0 != edge_idx)
+                {
+                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+                }
+            }
+
+            pu1_src += src_strd;
+        }
+    }
+
+    for(col = 0; col < wd; col++)
+    {
+        pu1_src_top[col] = au1_src_top_tmp[col];
+    }
+
+}
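+
+/* Note on the sign reuse above: since SIGN(a - b) == -SIGN(b - a), the
+ * u1_sign_down computed for row r is negated and reused as au1_sign_up[col]
+ * for row r + 1, halving the number of SIGN evaluations per sample. */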
+
+
+
+/* input 'wd' has to be for the interleaved block and not for each color component */
+void ihevc_sao_edge_offset_class1_chroma(UWORD8 *pu1_src,
+                                         WORD32 src_strd,
+                                         UWORD8 *pu1_src_left,
+                                         UWORD8 *pu1_src_top,
+                                         UWORD8 *pu1_src_top_left,
+                                         UWORD8 *pu1_src_top_right,
+                                         UWORD8 *pu1_src_bot_left,
+                                         UWORD8 *pu1_avail,
+                                         WORD8 *pi1_sao_offset_u,
+                                         WORD8 *pi1_sao_offset_v,
+                                         WORD32 wd,
+                                         WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 au1_mask[MAX_CTB_SIZE];
+    UWORD8 au1_src_top_tmp[MAX_CTB_SIZE];
+    WORD8 au1_sign_up[MAX_CTB_SIZE];
+    WORD8 u1_sign_down;
+    WORD32 bit_depth;
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+
+    bit_depth = BIT_DEPTH_CHROMA;
+
+    /* Initialize the mask values */
+    memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+    /* Update left, top and top-left arrays */
+    pu1_src_top_left[0] = pu1_src_top[wd - 2];
+    pu1_src_top_left[1] = pu1_src_top[wd - 1];
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[2 * row] = pu1_src[row * src_strd + wd - 2];
+        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+    }
+
+    /* Update height and source pointers based on the availability flags */
+    if(0 == pu1_avail[2])
+    {
+        pu1_src += src_strd;
+        ht--;
+        for(col = 0; col < wd; col++)
+        {
+            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - src_strd]);
+        }
+    }
+    else
+    {
+        for(col = 0; col < wd; col++)
+        {
+            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col]);
+        }
+    }
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+    }
+
+    /* Filtering is done in place on the source buffer; the neighbour context needed later was saved above */
+    {
+        for(row = 0; row < ht; row++)
+        {
+            for(col = 0; col < wd; col++)
+            {
+                WORD32 edge_idx;
+                WORD8 *pi1_sao_offset;
+
+                pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;
+
+                u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + src_strd]);
+                edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
+                au1_sign_up[col] = -u1_sign_down;
+
+                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];
+
+                if(0 != edge_idx)
+                {
+                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+                }
+            }
+
+            pu1_src += src_strd;
+        }
+    }
+
+    for(col = 0; col < wd; col++)
+    {
+        pu1_src_top[col] = au1_src_top_tmp[col];
+    }
+
+}
+
+
+
+/* 135 degree filtering */
+void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
+                                  WORD32 src_strd,
+                                  UWORD8 *pu1_src_left,
+                                  UWORD8 *pu1_src_top,
+                                  UWORD8 *pu1_src_top_left,
+                                  UWORD8 *pu1_src_top_right,
+                                  UWORD8 *pu1_src_bot_left,
+                                  UWORD8 *pu1_avail,
+                                  WORD8 *pi1_sao_offset,
+                                  WORD32 wd,
+                                  WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 au1_mask[MAX_CTB_SIZE];
+    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE], au1_src_top_tmp[MAX_CTB_SIZE];
+    UWORD8 u1_src_top_left_tmp;
+    WORD8 au1_sign_up[MAX_CTB_SIZE + 1], au1_sign_up_tmp[MAX_CTB_SIZE + 1];
+    WORD8 u1_sign_down;
+    WORD8 *pu1_sign_up;
+    WORD8 *pu1_sign_up_tmp;
+    UWORD8 *pu1_src_left_cpy;
+
+    WORD32 bit_depth;
+    UWORD8 u1_pos_0_0_tmp;
+    UWORD8 u1_pos_wd_ht_tmp;
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+
+    bit_depth = BIT_DEPTH_LUMA;
+    pu1_sign_up = au1_sign_up;
+    pu1_sign_up_tmp = au1_sign_up_tmp;
+    pu1_src_left_cpy = pu1_src_left;
+
+    /* Initialize the mask values */
+    memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+    /* Update left, top and top-left arrays */
+    u1_src_top_left_tmp = pu1_src_top[wd - 1];
+    for(row = 0; row < ht; row++)
+    {
+        au1_src_left_tmp[row] = pu1_src[row * src_strd + wd - 1];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+    }
+
+
+    /* If top-left is available, process separately */
+    if(0 != pu1_avail[4])
+    {
+        WORD32 edge_idx;
+
+        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
+                        SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_0_tmp = pu1_src[0];
+        }
+    }
+    else
+    {
+        u1_pos_0_0_tmp = pu1_src[0];
+    }
+
+    /* If bottom-right is available, process separately */
+    if(0 != pu1_avail[7])
+    {
+        WORD32 edge_idx;
+
+        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
+                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
+        }
+    }
+    else
+    {
+        u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
+    }
+
+    /* If Left is not available */
+    if(0 == pu1_avail[0])
+    {
+        au1_mask[0] = 0;
+    }
+
+    /* If Top is not available */
+    if(0 == pu1_avail[2])
+    {
+        pu1_src += src_strd;
+        ht--;
+        pu1_src_left_cpy += 1;
+        for(col = 1; col < wd; col++)
+        {
+            pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - 1 - src_strd]);
+        }
+    }
+    else
+    {
+        for(col = 1; col < wd; col++)
+        {
+            pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col - 1]);
+        }
+    }
+
+    /* If Right is not available */
+    if(0 == pu1_avail[1])
+    {
+        au1_mask[wd - 1] = 0;
+    }
+
+    /* If Bottom is not available */
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+    }
+
+    /* Filtering is done in place on the source buffer; the neighbour context needed later was saved above */
+    {
+        for(row = 0; row < ht; row++)
+        {
+            pu1_sign_up[0] = SIGN(pu1_src[0] - pu1_src_left_cpy[row - 1]);
+            for(col = 0; col < wd; col++)
+            {
+                WORD32 edge_idx;
+
+                u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + 1 + src_strd]);
+                edge_idx = 2 + pu1_sign_up[col] + u1_sign_down;
+                pu1_sign_up_tmp[col + 1] = -u1_sign_down;
+
+                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];
+
+                if(0 != edge_idx)
+                {
+                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+                }
+            }
+
+            /* Swapping pu1_sign_up_tmp and pu1_sign_up */
+            {
+                WORD8 *pu1_swap_tmp = pu1_sign_up;
+                pu1_sign_up = pu1_sign_up_tmp;
+                pu1_sign_up_tmp = pu1_swap_tmp;
+            }
+
+            pu1_src += src_strd;
+        }
+
+        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd] = u1_pos_0_0_tmp;
+        pu1_src[(pu1_avail[3] ? wd - 1 - src_strd : wd - 1)] = u1_pos_wd_ht_tmp;
+    }
+
+    if(0 == pu1_avail[2])
+        ht++;
+    if(0 == pu1_avail[3])
+        ht++;
+    *pu1_src_top_left = u1_src_top_left_tmp;
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[row] = au1_src_left_tmp[row];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        pu1_src_top[col] = au1_src_top_tmp[col];
+    }
+
+}
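+
+/* Note on the two sign buffers above: along the 135-degree direction the
+ * sign computed at column c of row r becomes the "up" sign of column c + 1
+ * in row r + 1, so it is staged in pu1_sign_up_tmp and the buffers are
+ * swapped once per row rather than shifted in place. */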
+
+
+
+
+/* 135 degree filtering */
+void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
+                                         WORD32 src_strd,
+                                         UWORD8 *pu1_src_left,
+                                         UWORD8 *pu1_src_top,
+                                         UWORD8 *pu1_src_top_left,
+                                         UWORD8 *pu1_src_top_right,
+                                         UWORD8 *pu1_src_bot_left,
+                                         UWORD8 *pu1_avail,
+                                         WORD8 *pi1_sao_offset_u,
+                                         WORD8 *pi1_sao_offset_v,
+                                         WORD32 wd,
+                                         WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 au1_mask[MAX_CTB_SIZE];
+    UWORD8 au1_src_left_tmp[2 * MAX_CTB_SIZE], au1_src_top_tmp[MAX_CTB_SIZE];
+    UWORD8 au1_src_top_left_tmp[2];
+    WORD8 au1_sign_up[MAX_CTB_SIZE + 2], au1_sign_up_tmp[MAX_CTB_SIZE + 2];
+    WORD8 u1_sign_down;
+    WORD8 *pu1_sign_up;
+    WORD8 *pu1_sign_up_tmp;
+    UWORD8 *pu1_src_left_cpy;
+
+    WORD32 bit_depth;
+
+    UWORD8 u1_pos_0_0_tmp_u;
+    UWORD8 u1_pos_0_0_tmp_v;
+    UWORD8 u1_pos_wd_ht_tmp_u;
+    UWORD8 u1_pos_wd_ht_tmp_v;
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+
+
+    bit_depth = BIT_DEPTH_CHROMA;
+    pu1_sign_up = au1_sign_up;
+    pu1_sign_up_tmp = au1_sign_up_tmp;
+    pu1_src_left_cpy = pu1_src_left;
+
+    /* Initialize the mask values */
+    memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+    /* Update left, top and top-left arrays */
+    au1_src_top_left_tmp[0] = pu1_src_top[wd - 2];
+    au1_src_top_left_tmp[1] = pu1_src_top[wd - 1];
+    for(row = 0; row < ht; row++)
+    {
+        au1_src_left_tmp[2 * row] = pu1_src[row * src_strd + wd - 2];
+        au1_src_left_tmp[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+    }
+
+
+    /* If top-left is available, process separately */
+    if(0 != pu1_avail[4])
+    {
+        WORD32 edge_idx;
+
+        /* U */
+        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
+                        SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_0_tmp_u = pu1_src[0];
+        }
+
+        /* V */
+        edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
+                        SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_0_tmp_v = pu1_src[1];
+        }
+    }
+    else
+    {
+        u1_pos_0_0_tmp_u = pu1_src[0];
+        u1_pos_0_0_tmp_v = pu1_src[1];
+    }
+
+    /* If bottom-right is available, process separately */
+    if(0 != pu1_avail[7])
+    {
+        WORD32 edge_idx;
+
+        /* U */
+        edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
+                        SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+        }
+
+        /* V */
+        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
+                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+        }
+    }
+    else
+    {
+        u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+        u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+    }
+
+    /* If Left is not available */
+    if(0 == pu1_avail[0])
+    {
+        au1_mask[0] = 0;
+    }
+
+    /* If Top is not available */
+    if(0 == pu1_avail[2])
+    {
+        pu1_src += src_strd;
+        pu1_src_left_cpy += 2;
+        ht--;
+        for(col = 2; col < wd; col++)
+        {
+            pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - 2 - src_strd]);
+        }
+    }
+    else
+    {
+        for(col = 2; col < wd; col++)
+        {
+            pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col - 2]);
+        }
+    }
+
+    /* If Right is not available */
+    if(0 == pu1_avail[1])
+    {
+        au1_mask[(wd - 1) >> 1] = 0;
+    }
+
+    /* If Bottom is not available */
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+    }
+
+    /* Filtering is done in place on the source buffer; the neighbour context needed later was saved above */
+    {
+        for(row = 0; row < ht; row++)
+        {
+            pu1_sign_up[0] = SIGN(pu1_src[0] - pu1_src_left_cpy[2 * (row - 1)]);
+            pu1_sign_up[1] = SIGN(pu1_src[1] - pu1_src_left_cpy[2 * (row - 1) + 1]);
+            for(col = 0; col < wd; col++)
+            {
+                WORD32 edge_idx;
+                WORD8 *pi1_sao_offset;
+
+                pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;
+
+                u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + 2 + src_strd]);
+                edge_idx = 2 + pu1_sign_up[col] + u1_sign_down;
+                pu1_sign_up_tmp[col + 2] = -u1_sign_down;
+
+                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];
+
+                if(0 != edge_idx)
+                {
+                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+                }
+            }
+
+            /* Swapping pu1_sign_up_tmp and pu1_sign_up */
+            {
+                WORD8 *pu1_swap_tmp = pu1_sign_up;
+                pu1_sign_up = pu1_sign_up_tmp;
+                pu1_sign_up_tmp = pu1_swap_tmp;
+            }
+
+            pu1_src += src_strd;
+        }
+
+        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd] = u1_pos_0_0_tmp_u;
+        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + 1] = u1_pos_0_0_tmp_v;
+        pu1_src[(pu1_avail[3] ? wd - 2 - src_strd : wd - 2)] = u1_pos_wd_ht_tmp_u;
+        pu1_src[(pu1_avail[3] ? wd - 1 - src_strd : wd - 1)] = u1_pos_wd_ht_tmp_v;
+    }
+
+    if(0 == pu1_avail[2])
+        ht++;
+    if(0 == pu1_avail[3])
+        ht++;
+    pu1_src_top_left[0] = au1_src_top_left_tmp[0];
+    pu1_src_top_left[1] = au1_src_top_left_tmp[1];
+    for(row = 0; row < 2 * ht; row++)
+    {
+        pu1_src_left[row] = au1_src_left_tmp[row];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        pu1_src_top[col] = au1_src_top_tmp[col];
+    }
+
+}
+
+
+
+
+/* 45 degree filtering */
+void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
+                                  WORD32 src_strd,
+                                  UWORD8 *pu1_src_left,
+                                  UWORD8 *pu1_src_top,
+                                  UWORD8 *pu1_src_top_left,
+                                  UWORD8 *pu1_src_top_right,
+                                  UWORD8 *pu1_src_bot_left,
+                                  UWORD8 *pu1_avail,
+                                  WORD8 *pi1_sao_offset,
+                                  WORD32 wd,
+                                  WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 au1_mask[MAX_CTB_SIZE];
+    UWORD8 au1_src_top_tmp[MAX_CTB_SIZE];
+    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE];
+    UWORD8 u1_src_top_left_tmp;
+    WORD8 au1_sign_up[MAX_CTB_SIZE];
+    UWORD8 *pu1_src_left_cpy;
+    WORD8 u1_sign_down;
+    WORD32 bit_depth;
+
+    UWORD8 u1_pos_0_ht_tmp;
+    UWORD8 u1_pos_wd_0_tmp;
+
+    bit_depth = BIT_DEPTH_LUMA;
+    pu1_src_left_cpy = pu1_src_left;
+
+    /* Initialize the mask values */
+    memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+    /* Update left, top and top-left arrays */
+    u1_src_top_left_tmp = pu1_src_top[wd - 1];
+    for(row = 0; row < ht; row++)
+    {
+        au1_src_left_tmp[row] = pu1_src[row * src_strd + wd - 1];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+    }
+
+    /* If top-right is available, process separately */
+    if(0 != pu1_avail[5])
+    {
+        WORD32 edge_idx;
+
+        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
+                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_0_tmp = pu1_src[wd - 1];
+        }
+    }
+    else
+    {
+        u1_pos_wd_0_tmp = pu1_src[wd - 1];
+    }
+
+    /* If bottom-left is available, process separately */
+    if(0 != pu1_avail[6])
+    {
+        WORD32 edge_idx;
+
+        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
+                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
+        }
+    }
+    else
+    {
+        u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
+    }
+
+    /* If Left is not available */
+    if(0 == pu1_avail[0])
+    {
+        au1_mask[0] = 0;
+    }
+
+    /* If Top is not available */
+    if(0 == pu1_avail[2])
+    {
+        pu1_src += src_strd;
+        ht--;
+        pu1_src_left_cpy += 1;
+        for(col = 0; col < wd - 1; col++)
+        {
+            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col + 1 - src_strd]);
+        }
+    }
+    else
+    {
+        for(col = 0; col < wd - 1; col++)
+        {
+            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col + 1]);
+        }
+    }
+
+    /* If Right is not available */
+    if(0 == pu1_avail[1])
+    {
+        au1_mask[wd - 1] = 0;
+    }
+
+    /* If Bottom is not available */
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+    }
+
+    /* Filtering is done in place on the source buffer; the neighbour context needed later was saved above */
+    {
+        for(row = 0; row < ht; row++)
+        {
+            au1_sign_up[wd - 1] = SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 + 1 - src_strd]);
+            for(col = 0; col < wd; col++)
+            {
+                WORD32 edge_idx;
+
+                u1_sign_down = SIGN(pu1_src[col] - ((col == 0) ? pu1_src_left_cpy[row + 1] :
+                                                                 pu1_src[col - 1 + src_strd]));
+                edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
+                if(col > 0)
+                    au1_sign_up[col - 1] = -u1_sign_down;
+
+                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];
+
+                if(0 != edge_idx)
+                {
+                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+                }
+            }
+
+            pu1_src += src_strd;
+        }
+
+        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + wd - 1] = u1_pos_wd_0_tmp;
+        pu1_src[(pu1_avail[3] ?  (-src_strd) : 0)] = u1_pos_0_ht_tmp;
+    }
+
+    if(0 == pu1_avail[2])
+        ht++;
+    if(0 == pu1_avail[3])
+        ht++;
+    *pu1_src_top_left = u1_src_top_left_tmp;
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[row] = au1_src_left_tmp[row];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        pu1_src_top[col] = au1_src_top_tmp[col];
+    }
+
+}
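+
+/* Note: along the 45-degree direction the down-left neighbour of column 0
+ * lies in the CTB to the left, hence the pu1_src_left_cpy[row + 1] special
+ * case in the loop above. */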
+
+
+
+
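+/* 45 degree filtering */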
+void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
+                                         WORD32 src_strd,
+                                         UWORD8 *pu1_src_left,
+                                         UWORD8 *pu1_src_top,
+                                         UWORD8 *pu1_src_top_left,
+                                         UWORD8 *pu1_src_top_right,
+                                         UWORD8 *pu1_src_bot_left,
+                                         UWORD8 *pu1_avail,
+                                         WORD8 *pi1_sao_offset_u,
+                                         WORD8 *pi1_sao_offset_v,
+                                         WORD32 wd,
+                                         WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 au1_mask[MAX_CTB_SIZE];
+    UWORD8 au1_src_left_tmp[2 * MAX_CTB_SIZE], au1_src_top_tmp[MAX_CTB_SIZE];
+    UWORD8 au1_src_top_left_tmp[2];
+    WORD8 au1_sign_up[MAX_CTB_SIZE];
+    UWORD8 *pu1_src_left_cpy;
+    WORD8 u1_sign_down;
+    WORD32 bit_depth;
+
+    UWORD8 u1_pos_wd_0_tmp_u;
+    UWORD8 u1_pos_wd_0_tmp_v;
+    UWORD8 u1_pos_0_ht_tmp_u;
+    UWORD8 u1_pos_0_ht_tmp_v;
+
+    bit_depth = BIT_DEPTH_CHROMA;
+    pu1_src_left_cpy = pu1_src_left;
+
+    /* Initialize the mask values */
+    memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+    /* Update left, top and top-left arrays */
+    au1_src_top_left_tmp[0] = pu1_src_top[wd - 2];
+    au1_src_top_left_tmp[1] = pu1_src_top[wd - 1];
+    for(row = 0; row < ht; row++)
+    {
+        au1_src_left_tmp[2 * row] = pu1_src[row * src_strd + wd - 2];
+        au1_src_left_tmp[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+    }
+
+
+    /* If top-right is available, process separately */
+    if(0 != pu1_avail[5])
+    {
+        WORD32 edge_idx;
+
+        /* U */
+        edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
+                        SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+        }
+
+        /* V */
+        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
+                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+        }
+    }
+    else
+    {
+        u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+        u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+    }
+
+    /* If bottom-left is available, process separately */
+    if(0 != pu1_avail[6])
+    {
+        WORD32 edge_idx;
+
+        /* U */
+        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
+                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+        }
+
+        /* V */
+        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
+                        SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
+
+        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+        }
+    }
+    else
+    {
+        u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+        u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+    }
+
+    /* If Left is not available */
+    if(0 == pu1_avail[0])
+    {
+        au1_mask[0] = 0;
+    }
+
+    /* If Top is not available */
+    if(0 == pu1_avail[2])
+    {
+        pu1_src += src_strd;
+        ht--;
+        pu1_src_left_cpy += 2;
+        for(col = 0; col < wd - 2; col++)
+        {
+            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col + 2 - src_strd]);
+        }
+    }
+    else
+    {
+        for(col = 0; col < wd - 2; col++)
+        {
+            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col + 2]);
+        }
+    }
+
+    /* If Right is not available */
+    if(0 == pu1_avail[1])
+    {
+        au1_mask[(wd - 1) >> 1] = 0;
+    }
+
+    /* If Bottom is not available */
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+    }
+
+    /* Processing is done in place on the source buffer; the left/top neighbour arrays are updated at the end from the copies saved above */
+    {
+        for(row = 0; row < ht; row++)
+        {
+            au1_sign_up[wd - 2] = SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 + 2 - src_strd]);
+            au1_sign_up[wd - 1] = SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 + 2 - src_strd]);
+            for(col = 0; col < wd; col++)
+            {
+                WORD32 edge_idx;
+                WORD8 *pi1_sao_offset;
+
+                pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;
+
+                u1_sign_down = SIGN(pu1_src[col] - ((col < 2) ? pu1_src_left_cpy[2 * (row + 1) + col] :
+                                                                pu1_src[col - 2 + src_strd]));
+                edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
+                if(col > 1)
+                    au1_sign_up[col - 2] = -u1_sign_down;
+
+                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];
+
+                if(0 != edge_idx)
+                {
+                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+                }
+            }
+
+            pu1_src += src_strd;
+        }
+
+        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + wd - 2] = u1_pos_wd_0_tmp_u;
+        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + wd - 1] = u1_pos_wd_0_tmp_v;
+        pu1_src[(pu1_avail[3] ?  (-src_strd) : 0)] = u1_pos_0_ht_tmp_u;
+        pu1_src[(pu1_avail[3] ?  (-src_strd) : 0) + 1] = u1_pos_0_ht_tmp_v;
+    }
+
+    if(0 == pu1_avail[2])
+        ht++;
+    if(0 == pu1_avail[3])
+        ht++;
+    pu1_src_top_left[0] = au1_src_top_left_tmp[0];
+    pu1_src_top_left[1] = au1_src_top_left_tmp[1];
+    for(row = 0; row < 2 * ht; row++)
+    {
+        pu1_src_left[row] = au1_src_left_tmp[row];
+    }
+    for(col = 0; col < wd; col++)
+    {
+        pu1_src_top[col] = au1_src_top_tmp[col];
+    }
+
+}
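+
+/*************************************************************************/
+/* Illustrative usage sketch (disabled): one possible way to drive the   */
+/* class3 chroma kernel above for a single interleaved-UV CTB. The       */
+/* buffer contents, sizes and availability flags are assumptions for     */
+/* the example, not the decoder's actual call sequence.                  */
+/*************************************************************************/
+#if 0
+static void example_sao_class3_chroma(UWORD8 *pu1_ctb, WORD32 strd)
+{
+    /* These must hold the reconstructed neighbours of the CTB on entry; */
+    /* the kernel updates them for the CTBs processed next               */
+    UWORD8 au1_left[2 * MAX_CTB_SIZE];  /* left column, U/V interleaved  */
+    UWORD8 au1_top[MAX_CTB_SIZE];       /* top row, U/V interleaved      */
+    UWORD8 au1_top_left[2], au1_top_right[2], au1_bot_left[2];
+    UWORD8 au1_avail[8];
+    WORD8 ai1_off_u[5] = { 0, 1, 2, -1, -2 };  /* index 0 is unused      */
+    WORD8 ai1_off_v[5] = { 0, 1, 2, -1, -2 };
+
+    memset(au1_avail, 1, sizeof(au1_avail));   /* assume all neighbours  */
+    ihevc_sao_edge_offset_class3_chroma(pu1_ctb, strd, au1_left, au1_top,
+                                        au1_top_left, au1_top_right,
+                                        au1_bot_left, au1_avail,
+                                        ai1_off_u, ai1_off_v,
+                                        32 /* wd: 16 UV pairs */,
+                                        16 /* ht */);
+}
+#endif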
diff --git a/common/ihevc_sao.h b/common/ihevc_sao.h
new file mode 100644
index 0000000..7d6fafa
--- /dev/null
+++ b/common/ihevc_sao.h
@@ -0,0 +1,402 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+******************************************************************************
+* @file ihevc_sao.h
+*
+* @brief
+*  This file contains enumerations, macros and extern declarations for the
+*  HEVC SAO functions
+*
+* @author
+*  Ittiam
+******************************************************************************
+*/
+
+#ifndef _IHEVC_SAO_H_
+#define _IHEVC_SAO_H_
+
+enum
+{
+    SAO_NONE,
+
+    SAO_BAND,
+
+    SAO_EDGE_0_DEG,
+
+    SAO_EDGE_90_DEG,
+
+    SAO_EDGE_135_DEG,
+
+    SAO_EDGE_45_DEG
+};
+
+static const WORD32 gi4_ihevc_hbd_table_edge_idx[5] = { 1, 2, 0, 3, 4 };
+
+typedef void ihevc_sao_band_offset_luma_ft(UWORD8 *pu1_src,
+                                           WORD32 src_strd,
+                                           UWORD8 *pu1_src_left,
+                                           UWORD8 *pu1_src_top,
+                                           UWORD8 *pu1_src_top_left,
+                                           WORD32 sao_band_pos,
+                                           WORD8 *pi4_sao_offset,
+                                           WORD32 wd,
+                                           WORD32 ht);
+
+typedef void ihevc_hbd_sao_band_offset_luma_ft(UWORD16 *pu2_src,
+                                               WORD32 src_strd,
+                                               UWORD16 *pu2_src_left,
+                                               UWORD16 *pu2_src_top,
+                                               UWORD16 *pu2_src_top_left,
+                                               WORD32 sao_band_pos,
+                                               WORD8 *pi1_sao_offset,
+                                               WORD32 wd,
+                                               WORD32 ht,
+                                               UWORD32 bitdepth);
+
+typedef void ihevc_sao_band_offset_chroma_ft(UWORD8 *pu1_src,
+                                             WORD32 src_strd,
+                                             UWORD8 *pu1_src_left,
+                                             UWORD8 *pu1_src_top,
+                                             UWORD8 *pu1_src_top_left,
+                                             WORD32 sao_band_pos_u,
+                                             WORD32 sao_band_pos_v,
+                                             WORD8 *pi4_sao_offset_u,
+                                             WORD8 *pi4_sao_offset_v,
+                                             WORD32 wd,
+                                             WORD32 ht);
+
+typedef void ihevc_hbd_sao_band_offset_chroma_ft(UWORD16 *pu2_src,
+                                                 WORD32 src_strd,
+                                                 UWORD16 *pu2_src_left,
+                                                 UWORD16 *pu2_src_top,
+                                                 UWORD16 *pu2_src_top_left,
+                                                 WORD32 sao_band_pos_u,
+                                                 WORD32 sao_band_pos_v,
+                                                 WORD8 *pi1_sao_offset_u,
+                                                 WORD8 *pi1_sao_offset_v,
+                                                 WORD32 wd,
+                                                 WORD32 ht,
+                                                 UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class0_ft(UWORD8 *pu1_src,
+                                             WORD32 src_strd,
+                                             UWORD8 *pu1_src_left,
+                                             UWORD8 *pu1_src_top,
+                                             UWORD8 *pu1_src_top_left,
+                                             UWORD8 *pu1_src_top_right,
+                                             UWORD8 *pu1_src_bot_left,
+                                             UWORD8 *pu1_avail,
+                                             WORD8 *pi4_sao_offset,
+                                             WORD32 wd,
+                                             WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class0_ft(UWORD16 *pu2_src,
+                                                 WORD32 src_strd,
+                                                 UWORD16 *pu2_src_left,
+                                                 UWORD16 *pu2_src_top,
+                                                 UWORD16 *pu2_src_top_left,
+                                                 UWORD16 *pu2_src_top_right,
+                                                 UWORD16 *pu2_src_bot_left,
+                                                 UWORD8 *pu1_avail,
+                                                 WORD8 *pi1_sao_offset,
+                                                 WORD32 wd,
+                                                 WORD32 ht,
+                                                 UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class0_chroma_ft(UWORD8 *pu1_src,
+                                                    WORD32 src_strd,
+                                                    UWORD8 *pu1_src_left,
+                                                    UWORD8 *pu1_src_top,
+                                                    UWORD8 *pu1_src_top_left,
+                                                    UWORD8 *pu1_src_top_right,
+                                                    UWORD8 *pu1_src_bot_left,
+                                                    UWORD8 *pu1_avail,
+                                                    WORD8 *pi4_sao_offset_u,
+                                                    WORD8 *pi4_sao_offset_v,
+                                                    WORD32 wd,
+                                                    WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class0_chroma_ft(UWORD16 *pu2_src,
+                                                        WORD32 src_strd,
+                                                        UWORD16 *pu2_src_left,
+                                                        UWORD16 *pu2_src_top,
+                                                        UWORD16 *pu2_src_top_left,
+                                                        UWORD16 *pu2_src_top_right,
+                                                        UWORD16 *pu2_src_bot_left,
+                                                        UWORD8 *pu1_avail,
+                                                        WORD8 *pi1_sao_offset_u,
+                                                        WORD8 *pi1_sao_offset_v,
+                                                        WORD32 wd,
+                                                        WORD32 ht,
+                                                        UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class1_ft(UWORD8 *pu1_src,
+                                             WORD32 src_strd,
+                                             UWORD8 *pu1_src_left,
+                                             UWORD8 *pu1_src_top,
+                                             UWORD8 *pu1_src_top_left,
+                                             UWORD8 *pu1_src_top_right,
+                                             UWORD8 *pu1_src_bot_left,
+                                             UWORD8 *pu1_avail,
+                                             WORD8 *pi4_sao_offset,
+                                             WORD32 wd,
+                                             WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class1_ft(UWORD16 *pu2_src,
+                                                 WORD32 src_strd,
+                                                 UWORD16 *pu2_src_left,
+                                                 UWORD16 *pu2_src_top,
+                                                 UWORD16 *pu2_src_top_left,
+                                                 UWORD16 *pu2_src_top_right,
+                                                 UWORD16 *pu2_src_bot_left,
+                                                 UWORD8 *pu1_avail,
+                                                 WORD8 *pi1_sao_offset,
+                                                 WORD32 wd,
+                                                 WORD32 ht,
+                                                 UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class1_chroma_ft(UWORD8 *pu1_src,
+                                                    WORD32 src_strd,
+                                                    UWORD8 *pu1_src_left,
+                                                    UWORD8 *pu1_src_top,
+                                                    UWORD8 *pu1_src_top_left,
+                                                    UWORD8 *pu1_src_top_right,
+                                                    UWORD8 *pu1_src_bot_left,
+                                                    UWORD8 *pu1_avail,
+                                                    WORD8 *pi4_sao_offset_u,
+                                                    WORD8 *pi4_sao_offset_v,
+                                                    WORD32 wd,
+                                                    WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class1_chroma_ft(UWORD16 *pu2_src,
+                                                        WORD32 src_strd,
+                                                        UWORD16 *pu2_src_left,
+                                                        UWORD16 *pu2_src_top,
+                                                        UWORD16 *pu2_src_top_left,
+                                                        UWORD16 *pu2_src_top_right,
+                                                        UWORD16 *pu2_src_bot_left,
+                                                        UWORD8 *pu1_avail,
+                                                        WORD8 *pi1_sao_offset_u,
+                                                        WORD8 *pi1_sao_offset_v,
+                                                        WORD32 wd,
+                                                        WORD32 ht,
+                                                        UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class2_ft(UWORD8 *pu1_src,
+                                             WORD32 src_strd,
+                                             UWORD8 *pu1_src_left,
+                                             UWORD8 *pu1_src_top,
+                                             UWORD8 *pu1_src_top_left,
+                                             UWORD8 *pu1_src_top_right,
+                                             UWORD8 *pu1_src_bot_left,
+                                             UWORD8 *pu1_avail,
+                                             WORD8 *pi4_sao_offset,
+                                             WORD32 wd,
+                                             WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class2_ft(UWORD16 *pu2_src,
+                                                 WORD32 src_strd,
+                                                 UWORD16 *pu2_src_left,
+                                                 UWORD16 *pu2_src_top,
+                                                 UWORD16 *pu2_src_top_left,
+                                                 UWORD16 *pu2_src_top_right,
+                                                 UWORD16 *pu2_src_bot_left,
+                                                 UWORD8 *pu1_avail,
+                                                 WORD8 *pi1_sao_offset,
+                                                 WORD32 wd,
+                                                 WORD32 ht,
+                                                 UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class2_chroma_ft(UWORD8 *pu1_src,
+                                                    WORD32 src_strd,
+                                                    UWORD8 *pu1_src_left,
+                                                    UWORD8 *pu1_src_top,
+                                                    UWORD8 *pu1_src_top_left,
+                                                    UWORD8 *pu1_src_top_right,
+                                                    UWORD8 *pu1_src_bot_left,
+                                                    UWORD8 *pu1_avail,
+                                                    WORD8 *pi4_sao_offset_u,
+                                                    WORD8 *pi4_sao_offset_v,
+                                                    WORD32 wd,
+                                                    WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class2_chroma_ft(UWORD16 *pu2_src,
+                                                        WORD32 src_strd,
+                                                        UWORD16 *pu2_src_left,
+                                                        UWORD16 *pu2_src_top,
+                                                        UWORD16 *pu2_src_top_left,
+                                                        UWORD16 *pu2_src_top_right,
+                                                        UWORD16 *pu2_src_bot_left,
+                                                        UWORD8 *pu1_avail,
+                                                        WORD8 *pi1_sao_offset_u,
+                                                        WORD8 *pi1_sao_offset_v,
+                                                        WORD32 wd,
+                                                        WORD32 ht,
+                                                        UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class3_ft(UWORD8 *pu1_src,
+                                             WORD32 src_strd,
+                                             UWORD8 *pu1_src_left,
+                                             UWORD8 *pu1_src_top,
+                                             UWORD8 *pu1_src_top_left,
+                                             UWORD8 *pu1_src_top_right,
+                                             UWORD8 *pu1_src_bot_left,
+                                             UWORD8 *pu1_avail,
+                                             WORD8 *pi4_sao_offset,
+                                             WORD32 wd,
+                                             WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class3_ft(UWORD16 *pu2_src,
+                                                 WORD32 src_strd,
+                                                 UWORD16 *pu2_src_left,
+                                                 UWORD16 *pu2_src_top,
+                                                 UWORD16 *pu2_src_top_left,
+                                                 UWORD16 *pu2_src_top_right,
+                                                 UWORD16 *pu2_src_bot_left,
+                                                 UWORD8 *pu1_avail,
+                                                 WORD8 *pi1_sao_offset,
+                                                 WORD32 wd,
+                                                 WORD32 ht,
+                                                 UWORD32 bit_depth);
+typedef void ihevc_sao_edge_offset_class3_chroma_ft(UWORD8 *pu1_src,
+                                                    WORD32 src_strd,
+                                                    UWORD8 *pu1_src_left,
+                                                    UWORD8 *pu1_src_top,
+                                                    UWORD8 *pu1_src_top_left,
+                                                    UWORD8 *pu1_src_top_right,
+                                                    UWORD8 *pu1_src_bot_left,
+                                                    UWORD8 *pu1_avail,
+                                                    WORD8 *pi4_sao_offset_u,
+                                                    WORD8 *pi4_sao_offset_v,
+                                                    WORD32 wd,
+                                                    WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class3_chroma_ft(UWORD16 *pu2_src,
+                                                        WORD32 src_strd,
+                                                        UWORD16 *pu2_src_left,
+                                                        UWORD16 *pu2_src_top,
+                                                        UWORD16 *pu2_src_top_left,
+                                                        UWORD16 *pu2_src_top_right,
+                                                        UWORD16 *pu2_src_bot_left,
+                                                        UWORD8 *pu1_avail,
+                                                        WORD8 *pi1_sao_offset_u,
+                                                        WORD8 *pi1_sao_offset_v,
+                                                        WORD32 wd,
+                                                        WORD32 ht,
+                                                        UWORD32 bit_depth);
+/* C function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma;
+
+/* NEONINTR function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma_neonintr;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma_neonintr;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0_neonintr;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma_neonintr;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1_neonintr;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma_neonintr;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2_neonintr;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma_neonintr;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3_neonintr;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma_neonintr;
+
+/* A9Q function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma_a9q;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma_a9q;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0_a9q;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma_a9q;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1_a9q;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma_a9q;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2_a9q;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma_a9q;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3_a9q;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma_a9q;
+
+/* A9A (Apple) function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma_a9a;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma_a9a;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0_a9a;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma_a9a;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1_a9a;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma_a9a;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2_a9a;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma_a9a;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3_a9a;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma_a9a;
+
+/* SSSE3 function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma_ssse3;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma_ssse3;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0_ssse3;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma_ssse3;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1_ssse3;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma_ssse3;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2_ssse3;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma_ssse3;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3_ssse3;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma_ssse3;
+
+/* SSE4 function declarations */
+
+/* C high bit depth function declarations */
+ihevc_hbd_sao_band_offset_luma_ft ihevc_hbd_sao_band_offset_luma;
+ihevc_hbd_sao_band_offset_chroma_ft ihevc_hbd_sao_band_offset_chroma;
+ihevc_hbd_sao_edge_offset_class0_ft ihevc_hbd_sao_edge_offset_class0;
+ihevc_hbd_sao_edge_offset_class0_chroma_ft ihevc_hbd_sao_edge_offset_class0_chroma;
+ihevc_hbd_sao_edge_offset_class1_ft ihevc_hbd_sao_edge_offset_class1;
+ihevc_hbd_sao_edge_offset_class1_chroma_ft ihevc_hbd_sao_edge_offset_class1_chroma;
+ihevc_hbd_sao_edge_offset_class2_ft ihevc_hbd_sao_edge_offset_class2;
+ihevc_hbd_sao_edge_offset_class2_chroma_ft ihevc_hbd_sao_edge_offset_class2_chroma;
+ihevc_hbd_sao_edge_offset_class3_ft ihevc_hbd_sao_edge_offset_class3;
+ihevc_hbd_sao_edge_offset_class3_chroma_ft ihevc_hbd_sao_edge_offset_class3_chroma;
+
+/* SSE4.2 HBD function declarations */
+ihevc_hbd_sao_band_offset_luma_ft ihevc_hbd_sao_band_offset_luma_sse42;
+ihevc_hbd_sao_band_offset_chroma_ft ihevc_hbd_sao_band_offset_chroma_sse42;
+ihevc_hbd_sao_edge_offset_class0_ft ihevc_hbd_sao_edge_offset_class0_sse42;
+ihevc_hbd_sao_edge_offset_class0_chroma_ft ihevc_hbd_sao_edge_offset_class0_chroma_sse42;
+ihevc_hbd_sao_edge_offset_class1_ft ihevc_hbd_sao_edge_offset_class1_sse42;
+ihevc_hbd_sao_edge_offset_class1_chroma_ft ihevc_hbd_sao_edge_offset_class1_chroma_sse42;
+ihevc_hbd_sao_edge_offset_class2_ft ihevc_hbd_sao_edge_offset_class2_sse42;
+ihevc_hbd_sao_edge_offset_class2_chroma_ft ihevc_hbd_sao_edge_offset_class2_chroma_sse42;
+ihevc_hbd_sao_edge_offset_class3_ft ihevc_hbd_sao_edge_offset_class3_sse42;
+ihevc_hbd_sao_edge_offset_class3_chroma_ft ihevc_hbd_sao_edge_offset_class3_chroma_sse42;
+
+/* armv8 function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma_av8;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma_av8;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0_av8;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma_av8;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1_av8;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma_av8;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2_av8;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma_av8;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3_av8;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma_av8;
+
+#endif /* _IHEVC_SAO_H_ */
diff --git a/common/ihevc_structs.h b/common/ihevc_structs.h
new file mode 100644
index 0000000..26857d8
--- /dev/null
+++ b/common/ihevc_structs.h
@@ -0,0 +1,2884 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_structs.h
+ *
+ * @brief
+ *  Structure definitions used in the code
+ *
+ * @author
+ *  Ittiam
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVC_STRUCTS_H_
+#define _IHEVC_STRUCTS_H_
+
+/**
+ * Picture buffer
+ */
+typedef struct
+{
+    UWORD8 *pu1_luma;
+    UWORD8 *pu1_chroma;
+
+    WORD32 i4_abs_poc;
+    WORD32 i4_poc_lsb;
+    /** Used to store the display timestamp for the current buffer */
+    WORD32 u4_ts;
+    UWORD8 u1_used_as_ref;
+
+    UWORD8 u1_free_delay_cnt;
+
+    /**
+     * buffer ID from buffer manager
+     */
+    UWORD8 u1_buf_id;
+
+}pic_buf_t;
+
+
+/**
+ * Reference List
+ */
+typedef struct
+{
+    void *pv_pic_buf;
+
+    void *pv_mv_buf;
+
+    UWORD8 u1_used_as_ref;
+
+}ref_list_t;
+
+
+/**
+ * SAO
+ */
+typedef struct
+{
+    /**
+     * sao_type_idx_luma
+     */
+    UWORD32      b3_y_type_idx   : 3;
+
+    /**
+     * luma SaoOffsetVal[1]
+     */
+    WORD32      b4_y_offset_1   : 4;
+
+    /**
+     * luma SaoOffsetVal[2]
+     */
+    WORD32      b4_y_offset_2   : 4;
+
+    /**
+     * luma SaoOffsetVal[3]
+     */
+    WORD32      b4_y_offset_3   : 4;
+
+    /**
+     * luma SaoOffsetVal[4]
+     */
+    WORD32      b4_y_offset_4   : 4;
+
+    /**
+     * luma sao_band_position
+     */
+    UWORD32      b5_y_band_pos   : 5;
+
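+    /* Zero-width bit-field: forces the following fields to start on a   */
+    /* fresh WORD32 boundary (same idiom is used throughout this struct) */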
+    WORD32                      : 0;
+
+    /**
+     * sao_type_idx_chroma
+     */
+    UWORD32      b3_cb_type_idx  : 3;
+
+    /**
+     * chroma SaoOffsetVal[1]
+     */
+    WORD32      b4_cb_offset_1  : 4;
+
+    /**
+     * chroma SaoOffsetVal[2]
+     */
+    WORD32      b4_cb_offset_2  : 4;
+
+    /**
+     * chroma SaoOffsetVal[3]
+     */
+    WORD32      b4_cb_offset_3  : 4;
+
+    /**
+     * chroma SaoOffsetVal[4]
+     */
+    WORD32      b4_cb_offset_4  : 4;
+
+    /**
+     * cb sao_band_position
+     */
+    UWORD32      b5_cb_band_pos  : 5;
+
+    WORD32                      : 0;
+
+    /**
+     * sao_type_idx_chroma
+     */
+    UWORD32      b3_cr_type_idx  : 3;
+
+    /**
+     * chroma SaoOffsetVal[1]
+     */
+    WORD32      b4_cr_offset_1  : 4;
+
+    /**
+     * chroma SaoOffsetVal[2]
+     */
+    WORD32      b4_cr_offset_2  : 4;
+
+    /**
+     * chroma SaoOffsetVal[3]
+     */
+    WORD32      b4_cr_offset_3  : 4;
+
+    /**
+     * chroma SaoOffsetVal[4]
+     */
+    WORD32      b4_cr_offset_4  : 4;
+
+    /**
+     * cr sao_band_position
+     */
+    UWORD32      b5_cr_band_pos  : 5;
+
+    WORD32                      : 0;
+
+}sao_t;
+
+/**
+ * SAO (high bit depth variant with 8-bit offset fields)
+ */
+typedef struct
+{
+    /**
+     * sao_type_idx_luma
+     */
+    UWORD32      b3_y_type_idx   : 3;
+
+    /**
+     * luma SaoOffsetVal[1]
+     */
+    WORD32      b8_y_offset_1   : 8;
+
+    /**
+     * luma SaoOffsetVal[2]
+     */
+    WORD32      b8_y_offset_2   : 8;
+
+    /**
+     * luma SaoOffsetVal[3]
+     */
+    WORD32      b8_y_offset_3   : 8;
+
+    /**
+     * luma SaoOffsetVal[4]
+     */
+    WORD32      b8_y_offset_4   : 8;
+
+    /**
+     * luma sao_band_position
+     */
+    UWORD32      b5_y_band_pos   : 5;
+
+    WORD32                      : 0;
+
+    /**
+     * sao_type_idx_chroma
+     */
+    UWORD32      b3_cb_type_idx  : 3;
+
+    /**
+     * chroma SaoOffsetVal[1]
+     */
+    WORD32      b8_cb_offset_1  : 8;
+
+    /**
+     * chroma SaoOffsetVal[2]
+     */
+    WORD32      b8_cb_offset_2  : 8;
+
+    /**
+     * chroma SaoOffsetVal[3]
+     */
+    WORD32      b8_cb_offset_3  : 8;
+
+    /**
+     * chroma SaoOffsetVal[4]
+     */
+    WORD32      b8_cb_offset_4  : 8;
+
+    /**
+     * cb sao_band_position
+     */
+    UWORD32      b5_cb_band_pos  : 5;
+
+    WORD32                      : 0;
+
+    /**
+     * sao_type_idx_chroma
+     */
+    UWORD32      b3_cr_type_idx  : 3;
+
+    /**
+     * chroma SaoOffsetVal[1]
+     */
+    WORD32      b8_cr_offset_1  : 8;
+
+    /**
+     * chroma SaoOffsetVal[2]
+     */
+    WORD32      b8_cr_offset_2  : 8;
+
+    /**
+     * chroma SaoOffsetVal[3]
+     */
+    WORD32      b8_cr_offset_3  : 8;
+
+    /**
+     * chroma SaoOffsetVal[4]
+     */
+    WORD32      b8_cr_offset_4  : 8;
+
+    /**
+     * cr sao_band_position
+     */
+    UWORD32      b5_cr_band_pos  : 5;
+
+    WORD32                      : 0;
+
+}sao_10bd_t;
+
+/**
+ * Motion vector
+ */
+typedef struct
+{
+    /**
+     * Horizontal Motion Vector
+     */
+    WORD16 i2_mvx;
+
+    /**
+     * Vertical Motion Vector
+     */
+    WORD16 i2_mvy;
+}mv_t;
+
+/*****************************************************************************/
+/* The following results in a packed 48-bit structure per reference list.   */
+/* If mv_t included ref_pic_buf_id, then 8 bits would be wasted for each    */
+/* mv due to alignment. mv_t is also used as an element directly instead    */
+/* of a pointer to the l0 and l1 mvs: since a pointer takes 4 bytes and an  */
+/* MV itself is 4 bytes, using pointers would not really help.              */
+/*****************************************************************************/
+
+/**
+ * PU Motion Vector info
+ */
+typedef struct
+{
+    /**
+     *  L0 Motion Vector
+     */
+    mv_t s_l0_mv;
+
+    /**
+     *  L1 Motion Vector
+     */
+    mv_t s_l1_mv;
+
+    /**
+     *  L0 Ref index
+     */
+    WORD8   i1_l0_ref_idx;
+
+    /**
+     *  L1 Ref index
+     */
+    WORD8   i1_l1_ref_idx;
+
+    /**
+     *  L0 Ref Pic Buf ID
+     */
+    WORD8 i1_l0_ref_pic_buf_id;
+
+    /**
+     *  L1 Ref Pic Buf ID
+     */
+    WORD8 i1_l1_ref_pic_buf_id;
+
+}pu_mv_t;
+
+/**
+ * PU information
+ */
+typedef struct
+{
+
+    /**
+     *  PU motion vectors
+     */
+    pu_mv_t     mv;
+
+    /**
+     *  PU X position in terms of min PU (4x4) units
+     */
+    UWORD32     b4_pos_x        : 4;
+
+    /**
+     *  PU Y position in terms of min PU (4x4) units
+     */
+    UWORD32     b4_pos_y        : 4;
+
+    /**
+     *  PU width in pixels = (b4_wd + 1) << 2
+     */
+    UWORD32     b4_wd           : 4;
+
+    /**
+     *  PU height in pixels = (b4_ht + 1) << 2
+     */
+    UWORD32     b4_ht           : 4;
+
+    /**
+     *  Intra or Inter flag for each partition - 0 or 1
+     */
+    UWORD32     b1_intra_flag   : 1;
+
+
+    /**
+     *  PRED_L0, PRED_L1, PRED_BI - Initialized in parsing only for MVP case
+     */
+    UWORD32     b2_pred_mode    : 2;
+
+
+    /**
+     *  Merge flag for each partition - 0 or 1
+     */
+    UWORD32     b1_merge_flag   : 1;
+
+    /**
+     *  Merge index for each partition - 0 to 4
+     */
+    UWORD32     b3_merge_idx    : 3;
+
+    /*************************************************************************/
+    /* Following two flags can be overloaded with b3_merge_idx if there      */
+    /* is need for additional bits                                           */
+    /*************************************************************************/
+
+    /**
+     *  If merge is zero, following gives presence of mvd for L0 MV
+     */
+    UWORD32     b1_l0_mvp_idx   : 1;
+
+    /**
+     *  If merge is zero, following gives presence of mvd for L1 MV
+     */
+    UWORD32     b1_l1_mvp_idx   : 1;
+
+    /**
+     * Partition mode - Needed during MV merge stage
+     * Note: Part mode can be derived using pu_wd, pu_ht and minCB size
+     * If there is a need for bits, the following can be removed at the cost
+     * of more control code in MV Merge
+     */
+    UWORD32      b3_part_mode    : 3;
+
+    /**
+     * Partition index - Needed during MV merge stage
+     */
+    UWORD32      b2_part_idx     : 2;
+
+
+}pu_t;
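+
+/* A minimal sketch (illustrative helper names, not decoder code) of how */
+/* the packed PU geometry above decodes to pixels:                       */
+#if 0
+static WORD32 example_pu_wd_pixels(const pu_t *ps_pu)
+{
+    return (ps_pu->b4_wd + 1) << 2;   /* e.g. b4_wd = 7 -> 32 pixels  */
+}
+
+static WORD32 example_pu_ht_pixels(const pu_t *ps_pu)
+{
+    return (ps_pu->b4_ht + 1) << 2;   /* e.g. b4_ht = 15 -> 64 pixels */
+}
+#endif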
+
+/**
+ * TU information
+ */
+typedef struct
+{
+    /**
+     *  TU X position in terms of min TU (4x4) units
+     */
+    UWORD32      b4_pos_x            : 4;
+
+    /**
+     *  TU Y position in terms of min TU (4x4) units
+     */
+    UWORD32     b4_pos_y            : 4;
+
+
+    /*************************************************************************/
+    /* Luma TU size (width or height) = 1 << (b3_size + 2)                   */
+    /*   i.e. 0: 4, 1: 8, 2: 16, 3: 32, 4: 64                                */
+    /* Note: Though 64 x 64 TU is not possible, this size is supported to    */
+    /* signal SKIP CUs or PCM CUs etc where transform is not called          */
+    /* Chroma width will be half of luma except for 4x4 luma                 */
+    /*************************************************************************/
+    /**
+     * Luma TU size (width or height)
+     */
+    UWORD32     b3_size             : 3; //To be changed.
+
+    /*************************************************************************/
+    /* Chroma present : For 4x4 Luma TUs only the fourth one contains Cb     */
+    /* Cr info. For the first three TUs in 8x8 (for 4x4 luma) this will      */
+    /* be zero. For all the other cases this will be 1                       */
+    /*************************************************************************/
+
+    /**
+     * 4x4 Luma TUs only the fourth one contains cb,cr
+     * TODO: Check if this is really needed, cb_cbf and cr_cbf should be enough
+     */
+    //UWORD32      b1_chroma_present   : 1;
+
+    /**
+     *  Y CBF
+     */
+    UWORD32      b1_y_cbf            : 1;
+
+    /**
+     *  Cb CBF
+     */
+    UWORD32      b1_cb_cbf           : 1;
+
+    /**
+     *  Cr CBF
+     */
+    UWORD32     b1_cr_cbf           : 1;
+
+
+    /**
+     *  Flag to indicate if it is the first TU in a CU
+     */
+    UWORD32     b1_first_tu_in_cu       : 1;
+
+    /**
+     *  Transform quant bypass flag
+     */
+    UWORD32     b1_transquant_bypass  : 1;
+
+    /**
+     *  Y Qp
+     */
+    //UWORD32     b6_qp               : 6; // BUG_FIX related to neighbour QPs in case of negative QP for HBD.
+    WORD32     b7_qp               : 7;
+
+
+    /**
+     *  Luma Intra Mode 0 - 34
+     */
+    UWORD32    b6_luma_intra_mode      : 6;
+
+    /*************************************************************************/
+    /* Chroma Intra Mode Index 0 - 4: Actual mode (0, 1, 10, 26, 34, X) to be*/
+    /* derived using luma_intra_mode and the following                       */
+    /*************************************************************************/
+    /**
+     * Chroma Intra Mode Index 0 - 4
+     */
+    UWORD32    b3_chroma_intra_mode_idx    : 3;
+
+
+}tu_t;
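+
+/* A small sketch (illustrative, not decoder code) of the size encoding  */
+/* documented above: luma TU size = 1 << (b3_size + 2).                  */
+#if 0
+static WORD32 example_tu_size_luma(const tu_t *ps_tu)
+{
+    /* b3_size: 0 -> 4, 1 -> 8, 2 -> 16, 3 -> 32, 4 -> 64 */
+    return 1 << (ps_tu->b3_size + 2);
+}
+#endif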
+
+/**
+ * CU information
+ */
+typedef struct
+{
+
+    /**
+     *  CU X position in terms of min CU (8x8) units
+     */
+    UWORD32 b3_cu_pos_x :3;
+
+    /**
+     *  CU Y position in terms of min CU (8x8) units
+     */
+    UWORD32 b3_cu_pos_y :3;
+
+    /**
+     *  CU size in terms of min CU (8x8) units
+     */
+    UWORD32 b4_cu_size :4;
+
+    /**
+     *  transquant bypass flag; 0 for this encoder
+     */
+    UWORD32 b1_tq_bypass_flag :1;
+
+    /**
+     *  CU skip flag
+     */
+    UWORD32 b1_skip_flag :1;
+
+    /**
+     *  intra / inter CU flag
+     */
+    UWORD32 b1_pred_mode_flag :1;
+
+    /**
+     *  Indicates partition information for the CU
+     *  For intra: 0 for 2Nx2N, 1 for NxN (only when CU = minCBsize)
+     *  For inter: @sa PART_SIZE_E
+     */
+    UWORD32 b3_part_mode :3;
+
+    /**
+     *  0 for this encoder
+     */
+    UWORD32 b1_pcm_flag :1;
+
+    /**
+     *  only applicable for intra cu
+     */
+    UWORD32 b3_chroma_intra_pred_mode :3;
+
+    /**
+     * only applicable for intra cu
+     */
+    UWORD32 b1_prev_intra_luma_pred_flag0 :1;
+
+    /**
+     * only applicable for intra cu and pred_mode=NxN
+     */
+    UWORD32 b1_prev_intra_luma_pred_flag1 :1;
+
+    /**
+     * only applicable for intra cu and pred_mode=NxN
+     */
+    UWORD32 b1_prev_intra_luma_pred_flag2 :1;
+
+    /**
+     * only applicable for intra cu and pred_mode=NxN
+     */
+    UWORD32 b1_prev_intra_luma_pred_flag3 :1;
+
+    /**
+     *  only applicable for luma intra cu
+     */
+    UWORD32 b2_mpm_idx0 :2;
+
+    /**
+     *  only applicable for intra cu and pred_mode=NxN
+     */
+    UWORD32 b2_mpm_idx1 :2;
+
+    /**
+     *  only applicable for intra cu and pred_mode=NxN
+     */
+    UWORD32 b2_mpm_idx2 :2;
+
+    /**
+     *  only applicable for intra cu and pred_mode=NxN
+     */
+    UWORD32 b2_mpm_idx3 :2;
+
+    /**
+     *  only applicable for intra cu
+     */
+    UWORD32 b5_rem_intra_pred_mode0 :5;
+
+    /**
+     *  only applicable for intra cu and pred_mode=NxN
+     */
+    UWORD32 b5_rem_intra_pred_mode1 :5;
+
+    /**
+     *  only applicable for intra cu and pred_mode=NxN
+     */
+    UWORD32 b5_rem_intra_pred_mode2 :5;
+
+    /**
+     *  only applicable for intra cu and pred_mode=NxN
+     */
+    UWORD32 b5_rem_intra_pred_mode3 :5;
+
+    /**
+     *  no residue flag for cu
+     */
+    UWORD32 b1_no_residual_syntax_flag :1;
+
+}cu_t;
+
+/*****************************************************************************/
+/* Since the following data will be accessed linearly (no random access      */
+/*  is needed for this) there is no need to store a frame level offset for   */
+/*  each CTB's TU data. Only a pointer to this is stored in CTB's structure  */
+/*****************************************************************************/
+
+typedef struct
+{
+    /*************************************************************************/
+    /* Number of TUs filled in as_tu                                         */
+    /* Having the first entry as 32 bit data helps in keeping each of        */
+    /* the structures aligned to 32 bits at CTB level                        */
+    /*************************************************************************/
+    /**
+     * Number of TUs filled in as_tu
+     */
+    WORD32 i4_tu_cnt;
+
+    /**
+     *  Array to map each min TU unit to a corresponding entry in as_tu
+     */
+    UWORD8 au1_tu_map[MAX_TU_IN_CTB];
+
+    /*************************************************************************/
+    /* TU level information                                                  */
+    /* Though the allocation for as_tu is done to handle worst case data,    */
+    /* only the valid number of TUs will be filled in the following array.   */
+    /* The next CTB starts after the valid as_tu entries                     */
+    /*************************************************************************/
+    /**
+     *  TU level information
+     */
+    tu_t as_tu[MAX_TU_IN_CTB];
+
+}ctb_tu_list_t;
+
+/*****************************************************************************/
+/* Info from the last TU row of a CTB is stored in a row-level neighbour     */
+/* buffer, which is then used for Boundary Strength computation              */
+/*****************************************************************************/
+/**
+ *  CTB neighbor info
+ */
+typedef struct
+{
+    /**
+     *  Slice index of the ctb
+     */
+    UWORD16 u2_slice_idx;
+
+    /*************************************************************************/
+    /* CBF of bottom TU row (replicated at 4 pixel boundary)                 */
+    /* MSB contains CBF of first TU in the last row and LSB contains CBF     */
+    /* of last TU in the last row                                            */
+    /*************************************************************************/
+    /**
+     * CBF of bottom TU row
+     */
+    UWORD16 u2_packed_cbf;
+
+    /*************************************************************************/
+    /* QP of bottom TU row (replicated at 8 pixel boundary, since QP         */
+    /* cannot change at less than min CU granularity)                        */
+    /*************************************************************************/
+    /**
+     * QP of bottom TU row
+     */
+    UWORD8 au1_qp[MAX_CU_IN_CTB_ROW];
+
+}ctb_top_ny_info_t;
+
+/**
+ *  CTB level info
+ */
+typedef struct _ctb_t
+{
+    /*************************************************************************/
+    /* Tile boundary can be detected by looking at tile start x and tile     */
+    /* start y.  And based on the tile, slice and frame boundary the         */
+    /* following will be initialized.                                        */
+    /*************************************************************************/
+    /**
+     *  Pointer to left CTB
+     */
+    /*  If not available, this will be set to NULL   */
+    struct _ctb_t *ps_ctb_left;
+
+    /**
+     *  Pointer to top-left CTB
+     */
+    /* If not available, this will be set to NULL   */
+    ctb_top_ny_info_t *ps_ctb_ny_topleft;
+
+    /**
+     *  Pointer to top CTB
+     */
+    /* If not available, this will be set to NULL  */
+    ctb_top_ny_info_t *ps_ctb_ny_top;
+
+    /**
+     *  Pointer to top-right CTB
+     */
+    /* If not available, this will be set to NULL */
+    ctb_top_ny_info_t *ps_ctb_ny_topright;
+
+    /*************************************************************************/
+    /* Pointer to PU data.                                                   */
+    /* This points to a MV Bank stored at frame level. Though this           */
+    /* pointer can be derived by reading offset at frame level, it is        */
+    /* stored here for faster access. Can be removed if storage of CTB       */
+    /* structure is critical                                                 */
+    /*************************************************************************/
+    /**
+     * Pointer to PU data
+     */
+    pu_t *ps_pu;
+
+    /*************************************************************************/
+    /* Pointer to a PU map stored at frame level,                            */
+    /* Though this pointer can be derived by multiplying CTB address with    */
+    /* number of minTUs in a CTB, it is stored here for faster access.       */
+    /* Can be removed if storage of CTB structure is critical                */
+    /*************************************************************************/
+    /**
+     * Pointer to a PU map stored at frame level
+     */
+    UWORD8 *pu1_pu_map;
+
+    /**
+     *  Number of TUs filled in as_tu
+     */
+    /*************************************************************************/
+    /* Having the first entry as 32 bit data helps in keeping each of        */
+    /* the structures aligned to 32 bits at CTB level                        */
+    /*************************************************************************/
+    WORD32 i4_tu_cnt;
+
+    /**
+     *  Array to map each min TU unit to a corresponding entry in as_tu
+     */
+    UWORD8 *pu1_tu_map;
+
+    /**
+     *  TU level information
+     */
+    /*************************************************************************/
+    /* Though the allocation for as_tu is done to handle worst case data,    */
+    /* only the valid number of TUs will be filled in the following array.   */
+    /* The next CTB starts after the valid as_tu entries                     */
+    /*************************************************************************/
+    tu_t *ps_tu;
+
+    /**
+     *  Pointer to transform coeff data
+     */
+    /*************************************************************************/
+    /* Following format is repeated for every coded TU                       */
+    /* Luma Block                                                            */
+    /* num_coeffs      : 16 bits                                             */
+    /* zero_cols       : 8 bits ( 1 bit per 4 columns)                       */
+    /* sig_coeff_map   : ((TU Size * TU Size) + 31) >> 5 number of WORD32s   */
+    /* coeff_data      : Non zero coefficients                               */
+    /* Cb Block (only for last TU in 4x4 case else for every luma TU)        */
+    /* num_coeffs      : 16 bits                                             */
+    /* zero_cols       : 8 bits ( 1 bit per 4 columns)                       */
+    /* sig_coeff_map   : ((TU Size * TU Size) + 31) >> 5 number of WORD32s   */
+    /* coeff_data      : Non zero coefficients                               */
+    /* Cr Block (only for last TU in 4x4 case else for every luma TU)        */
+    /* num_coeffs      : 16 bits                                             */
+    /* zero_cols       : 8 bits ( 1 bit per 4 columns)                       */
+    /* sig_coeff_map   : ((TU Size * TU Size) + 31) >> 5 number of WORD32s   */
+    /* coeff_data      : Non zero coefficients                               */
+    /*************************************************************************/
+    void            *pv_coeff_data;
+
+    /**
+     *  Slice to which the CTB belongs
+     */
+    WORD32 i4_slice_idx;
+
+    /**
+     *  CTB column position
+     */
+    WORD32 i4_pos_x;
+
+    /**
+     *  CTB row position
+     */
+    WORD32 i4_pos_y;
+
+    /**
+     *  Number of PUs filled in ps_pu
+     */
+    WORD32 i4_pu_cnt;
+
+    /**
+     *  Index of current PU being processed in ps_pu
+     */
+    /*  Scratch variable set to 0 at the start of any PU processing function */
+    WORD32 i4_pu_idx;
+
+    /**
+     * Vertical Boundary strength
+     */
+    /* Two bits per edge, stored in the format BS[15] | BS[14] | ... | BS[0] */
+    UWORD32 *pu4_vert_bs;
+
+    /**
+     * Horizontal Boundary strength
+     */
+
+    /* Two bits per edge, stored in the format BS[15] | BS[14] | ... | BS[0] */
+    UWORD32 *pu4_horz_bs;
+
+    /**
+     *  Qp array stored for each 8x8 pixels
+     */
+    UWORD8 *pu1_qp;
+
+    /**
+     *  Pointer to current frame's pu_t array
+     */
+    pu_t *ps_frm_pu;
+
+    /**
+     * Pointer to current frame's pu_t index array, which stores starting index
+     * of pu_t for every CTB
+     */
+    UWORD32 *pu4_frm_pu_idx;
+
+    /**
+     *  Pointer to current frame's pu map array
+     */
+    UWORD8 *pu1_frm_pu_map;
+
+    /*************************************************************************/
+    /* Need to add encoder specific elements for identifying the order of    */
+    /* coding for CU, TU and PU if any                                       */
+    /*************************************************************************/
+}ctb_t;
+
+/*****************************************************************************/
+/* The following can be used to typecast coefficient data that is stored     */
+/*  per subblock. Note that though i2_level is shown as an array that        */
+/*  holds 16 coefficients, only the first few entries will be valid. Next    */
+/*  subblocks data starts after the valid number of coefficients. Number     */
+/*  of non-zero coefficients will be derived using number of non-zero bits   */
+/*  in sig coeff map                                                         */
+/*****************************************************************************/
+/**
+ * Structure to hold coefficient info for a 4x4 subblock
+ */
+typedef struct
+{
+    /**
+     * sub block position
+     */
+    UWORD16 u2_subblk_pos;
+
+    /**
+     * significant coefficient map
+     */
+    UWORD16 u2_sig_coeff_map;
+
+    /**
+     * holds 16 coefficients
+     */
+    WORD16  ai2_level[SUBBLK_COEFF_CNT];
+}tu_sblk_coeff_data_t;
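+
+/*************************************************************************/
+/* A sketch (an illustrative assumption, not the decoder's parser) of    */
+/* how a packed subblock is stepped over: only as many ai2_level         */
+/* entries as there are set bits in u2_sig_coeff_map are stored, and     */
+/* the next subblock starts immediately after them.                      */
+/*************************************************************************/
+#if 0
+static void *example_next_subblk(void *pv_cur)
+{
+    tu_sblk_coeff_data_t *ps_sblk = (tu_sblk_coeff_data_t *)pv_cur;
+    UWORD16 u2_map = ps_sblk->u2_sig_coeff_map;
+    WORD32 i4_num_coeffs = 0;
+
+    while(u2_map)                  /* population count of the sig map */
+    {
+        i4_num_coeffs += u2_map & 1;
+        u2_map >>= 1;
+    }
+    /* skip the two 16-bit headers plus the valid coefficients */
+    return (UWORD8 *)pv_cur + 2 * sizeof(UWORD16)
+                            + i4_num_coeffs * sizeof(WORD16);
+}
+#endif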
+
+
+
+/*************************************************************************/
+/* The following describes how each of the CU cases is handled           */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For SKIP CU                                                           */
+/* One Inter PU with appropriate MV                                      */
+/* One TU which says Y, Cb and Cr CBF is zero with size equal to CB size */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For Inter CU                                                          */
+/* M Inter PU with appropriate MVs (M between 1 to 4)                    */
+/* N TU (N is number of TU in CU)                                        */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For Intra CU                                                          */
+/* N TUs (N is the number of TUs in the CU)                              */
+/* N Intra PUs with appropriate pred modes for luma and chroma           */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For Intra PCM CU                                                      */
+/* One TU which says transquant bypass is 1 with size equal to CB size   */
+/* 1 Intra PU with pcm flag set to 1 (so that no intra pred is done)     */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For a CU where cu_transquant_bypass_flag is 1                         */
+/* One TU which says transquant bypass is 1 with size equal to CB size   */
+/* N Intra/Inter PUs                                                     */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For a CU where no_residual_syntax_flag is 1                           */
+/* One TU which says Y, Cb, Cr CBF is 0 with size equal to CB size       */
+/* N Inter PUs                                                           */
+/*************************************************************************/
+
+#if 0
+
+/*************************************************************************/
+/* Keeping the following as arrays instead of pointers helps in          */
+/* reducing the number of indirections and hence gives faster access to  */
+/* the data. The downside is that this results in unused memory holes    */
+/* after each array, since the allocation is for the worst case but the  */
+/* number of valid CUs, TUs and PUs will be much smaller than that.      */
+/* Since there are only three such holes per CTB, this should not be     */
+/* much of a problem.                                                    */
+/*************************************************************************/
+
+/* CU level information */
+/* TODO: If there is not much data that is stored at CU level, then the
+ following will be removed */
+cu_t as_cu[MAX_CU_IN_CTB];
+
+#endif
+
+/**
+ * Structure giving information about the tile
+ */
+typedef struct
+{
+    /* X position of the tile in the current frame in CTB units */
+    UWORD8 u1_pos_x;
+
+    /* Y position of the tile in the current frame in CTB units */
+    UWORD8 u1_pos_y;
+
+    /* Tile width in CTB units */
+    UWORD16 u2_wd;
+
+    /* Tile height in CTB units */
+    UWORD16 u2_ht;
+
+}tile_t;
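+
+/* Illustrative sketch (the uniform-spacing tile width derivation from the
+ * HEVC spec; the helper itself is not a function from this decoder): with
+ * uniform_spacing_flag set, the width of tile column `col` in CTB units
+ * follows from the picture width and the number of tile columns. */
+static inline WORD32 example_uniform_tile_col_wd(WORD32 col, WORD32 num_cols,
+                                                 WORD32 pic_wd_in_ctb)
+{
+    /* Integer division distributes the remainder across the columns */
+    return ((col + 1) * pic_wd_in_ctb) / num_cols -
+                    (col * pic_wd_in_ctb) / num_cols;
+}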
+
+/**
+ * Structure to hold NAL header info
+ */
+
+typedef struct
+{
+    /**
+     *  NAL unit type
+     */
+    WORD8 i1_nal_unit_type;
+
+    /**
+     *  NAL temporal id
+     */
+    WORD8 i1_nuh_temporal_id;
+}nal_header_t;
+
+/**
+ * Structure to hold Profile tier level info for a given layer
+ */
+
+typedef struct
+{
+    /**
+     *  profile_space
+     */
+    WORD8 i1_profile_space;
+
+    /**
+     *  tier_flag
+     */
+    WORD8 i1_tier_flag;
+
+    /**
+     *  profile_idc
+     */
+    WORD8 i1_profile_idc;
+
+    /**
+     *  profile_compatibility_flag[]
+     */
+    WORD8 ai1_profile_compatibility_flag[MAX_PROFILE_COMPATBLTY];
+
+    /**
+     * progressive_source_flag
+     */
+    WORD8 i1_general_progressive_source_flag;
+
+    /**
+     * interlaced_source_flag
+     */
+    WORD8 i1_general_interlaced_source_flag;
+
+    /**
+     * non_packed_constraint_flag
+     */
+    WORD8 i1_general_non_packed_constraint_flag;
+
+    /**
+     * frame_only_constraint_flag
+     */
+    WORD8 i1_frame_only_constraint_flag;
+
+    /**
+     *  level_idc
+     */
+    UWORD8 u1_level_idc;
+}profile_tier_lvl_t;
+
+/**
+ * Structure to hold Profile tier level info for all layers
+ */
+typedef struct
+{
+    /**
+     *  Profile and tier information for general
+     */
+    profile_tier_lvl_t s_ptl_gen;
+
+    /**
+     *  sub_layer_profile_present_flag[]
+     */
+    WORD8 ai1_sub_layer_profile_present_flag[VPS_MAX_SUB_LAYERS - 1];
+
+    /**
+     *  sub_layer_level_present_flag[]
+     */
+    WORD8 ai1_sub_layer_level_present_flag[VPS_MAX_SUB_LAYERS - 1];
+
+    /**
+     *  Profile and tier information for sub layers
+     */
+    profile_tier_lvl_t as_ptl_sub[VPS_MAX_SUB_LAYERS - 1];
+
+}profile_tier_lvl_info_t;
+
+/**
+ * Structure to hold short term reference picture set info
+ */
+typedef struct
+{
+    /**
+     *  delta_poc_s0_minus1[ i ] and delta_poc_s1_minus1[ i ]
+     */
+    WORD16 ai2_delta_poc[MAX_DPB_SIZE];
+
+    /**
+     *  inter_ref_pic_set_prediction_flag
+     */
+    WORD8 i1_inter_ref_pic_set_prediction_flag;
+
+    /**
+     *  num_negative_pics
+     */
+    WORD8 i1_num_neg_pics;
+
+    /**
+     *  num_positive_pics
+     */
+    WORD8 i1_num_pos_pics;
+
+    /**
+     *  used_by_curr_pic_s0_flag[ i ] and used_by_curr_pic_s1_flag[i]
+     */
+    WORD8 ai1_used[MAX_DPB_SIZE];
+
+    /**
+     *  Ref Idc
+     */
+    WORD8 ai1_ref_idc[MAX_DPB_SIZE];
+
+    /**
+     *  Sum of positive and negative pics for each reference
+     */
+    WORD8 i1_num_delta_pocs;
+
+    /**
+     *  Number of ref_idc
+     */
+    WORD8 i1_num_ref_idc;
+}stref_picset_t;
+
+/**
+ * Structure to hold weighted prediction info such as weights and offsets
+ */
+typedef struct
+{
+    /** luma_log2_weight_denom */
+    WORD8 i1_luma_log2_weight_denom;
+
+    /** delta_chroma_log2_weight_denom */
+    WORD8 i1_chroma_log2_weight_denom;
+
+    /** luma_weight_l0_flag[ i ] */
+    WORD8 i1_luma_weight_l0_flag[MAX_DPB_SIZE];
+
+    /** chroma_weight_l0_flag[ i ] */
+    WORD8 i1_chroma_weight_l0_flag[MAX_DPB_SIZE];
+
+    /** delta_luma_weight_l0[ i ] */
+    WORD16 i2_luma_weight_l0[MAX_DPB_SIZE];
+
+    /** luma_offset_l0[ i ] */
+    WORD16 i2_luma_offset_l0[MAX_DPB_SIZE];
+
+    /** delta_chroma_weight_l0[ i ][ j ] */
+    WORD16 i2_chroma_weight_l0_cb[MAX_DPB_SIZE];
+
+    /** delta_chroma_offset_l0[ i ][ j ] */
+    WORD16 i2_chroma_offset_l0_cb[MAX_DPB_SIZE];
+
+    /** delta_chroma_weight_l0[ i ][ j ] */
+    WORD16 i2_chroma_weight_l0_cr[MAX_DPB_SIZE];
+
+    /** delta_chroma_offset_l0[ i ][ j ] */
+    WORD16 i2_chroma_offset_l0_cr[MAX_DPB_SIZE];
+
+    /** luma_weight_l1_flag[ i ] */
+    WORD8 i1_luma_weight_l1_flag[MAX_DPB_SIZE];
+
+    /** chroma_weight_l1_flag[ i ] */
+    WORD8 i1_chroma_weight_l1_flag[MAX_DPB_SIZE];
+
+    /** delta_luma_weight_l1[ i ] */
+    WORD16 i2_luma_weight_l1[MAX_DPB_SIZE];
+
+    /** luma_offset_l1[ i ] */
+    WORD16 i2_luma_offset_l1[MAX_DPB_SIZE];
+
+    /** delta_chroma_weight_l1[ i ][ j ] */
+    WORD16 i2_chroma_weight_l1_cb[MAX_DPB_SIZE];
+
+    /** delta_chroma_offset_l1[ i ][ j ] */
+    WORD16 i2_chroma_offset_l1_cb[MAX_DPB_SIZE];
+
+    /** delta_chroma_weight_l1[ i ][ j ] */
+    WORD16 i2_chroma_weight_l1_cr[MAX_DPB_SIZE];
+
+    /** delta_chroma_offset_l1[ i ][ j ] */
+    WORD16 i2_chroma_offset_l1_cr[MAX_DPB_SIZE];
+
+}pred_wt_ofst_t;
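+
+/* Illustrative sketch (simplified uni-directional luma case following the
+ * HEVC weighted-prediction equation, assuming log2_weight_denom >= 1 and
+ * 8-bit samples; the helper is not a function from this decoder). */
+static inline WORD32 example_weight_luma_sample(pred_wt_ofst_t *ps_wt,
+                                                WORD32 ref_idx,
+                                                WORD32 sample)
+{
+    WORD32 shift = ps_wt->i1_luma_log2_weight_denom;
+    WORD32 w0 = ps_wt->i2_luma_weight_l0[ref_idx];
+    WORD32 o0 = ps_wt->i2_luma_offset_l0[ref_idx];
+    WORD32 pred = ((w0 * sample + (1 << (shift - 1))) >> shift) + o0;
+
+    /* Clip to the 8-bit sample range assumed here */
+    return pred < 0 ? 0 : (pred > 255 ? 255 : pred);
+}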
+
+
+/**
+ * Structure to hold Reference picture list modification info
+ */
+typedef struct
+{
+    /* ref_pic_list_modification_flag_l0 */
+    WORD8 i1_ref_pic_list_modification_flag_l0;
+
+    /* list_entry_l0[ i ] */
+    WORD8 i1_list_entry_l0[16];
+
+    /* ref_pic_list_modification_flag_l1 */
+    WORD8 i1_ref_pic_list_modification_flag_l1;
+
+    /* list_entry_l1[ i ] */
+    WORD8 i1_list_entry_l1[16];
+}rplm_t;
+
+
+/**
+ * Structure to hold VPS info
+ */
+typedef struct
+{
+    /**
+     *  video_parameter_set_id
+     */
+    WORD8 i1_vps_id;
+
+    /**
+     *  vps_temporal_id_nesting_flag
+     */
+    WORD8 i1_vps_temporal_id_nesting_flag;
+    /**
+     * sub_layer_ordering_info_present_flag
+     */
+    WORD8 i1_sub_layer_ordering_info_present_flag;
+    /**
+     *  vps_max_sub_layers_minus1
+     */
+    WORD8 i1_vps_max_sub_layers;
+
+    /**
+     *  vps_max_dec_pic_buffering
+     */
+    WORD8 ai1_vps_max_dec_pic_buffering[VPS_MAX_SUB_LAYERS];
+
+    /**
+     *  vps_max_num_reorder_pics
+     */
+    WORD8 ai1_vps_max_num_reorder_pics[VPS_MAX_SUB_LAYERS];
+
+    /**
+     *  vps_max_latency_increase
+     */
+    WORD8 ai1_vps_max_latency_increase[VPS_MAX_SUB_LAYERS];
+
+    /**
+     *  vps_num_hrd_parameters
+     */
+    WORD8 i1_vps_num_hrd_parameters;
+
+    /**
+     * vps_max_nuh_reserved_zero_layer_id
+     */
+    WORD8 i1_vps_max_nuh_reserved_zero_layer_id;
+
+    /**
+     * vps_num_op_sets
+     */
+    WORD8 i1_vps_num_op_sets;
+
+    /**
+     * layer_id_included_flag
+     */
+    //WORD8 ai1_layer_id_included_flag[2][MAX_NUH_LAYERS];
+    /**
+     *  Profile, Tier and Level info
+     */
+    profile_tier_lvl_info_t s_ptl;
+
+    /**
+     * bit_rate_info_present_flag[i]
+     */
+    WORD8 ai1_bit_rate_info_present_flag[VPS_MAX_SUB_LAYERS];
+
+
+    /**
+     * pic_rate_info_present_flag[i]
+     */
+    WORD8 ai1_pic_rate_info_present_flag[VPS_MAX_SUB_LAYERS];
+
+    /**
+     * avg_bit_rate[i]
+     */
+    UWORD16 au2_avg_bit_rate[VPS_MAX_SUB_LAYERS];
+    /**
+     * max_bit_rate[i]
+     */
+    UWORD16 au2_max_bit_rate[VPS_MAX_SUB_LAYERS];
+    /**
+     * constant_pic_rate_idc[i]
+     */
+    WORD8 ai1_constant_pic_rate_idc[VPS_MAX_SUB_LAYERS];
+    /**
+     * avg_pic_rate[i]
+     */
+    UWORD16 au2_avg_pic_rate[VPS_MAX_SUB_LAYERS];
+}vps_t;
+
+/**
+ * Sub-layer HRD parameters Info
+ */
+typedef struct
+{
+    /**
+    *  (together with bit_rate_scale) specifies the
+    *  maximum input bit rate for the i-th CPB
+    */
+    UWORD32 au4_bit_rate_value_minus1[32];
+    /**
+    *  (together with cpb_size_scale) specifies the
+    *  CPB size when the CPB operates at the access unit level.
+    */
+    UWORD32 au4_cpb_size_value_minus1[32];
+
+    /**
+    * (together with cpb_size_du_scale) specifies the CPB size
+    * when the CPB operates at the sub-picture level
+    */
+    UWORD32 au4_cpb_size_du_value_minus1[32];
+
+    /**
+    * specifies the maximum input bit rate for the i-th CPB when the CPB
+    * operates at the sub-picture level. bit_rate_du_value_minus1[ i ]
+    * shall be in the range of 0 to 2^32 - 2
+    */
+    UWORD32 au4_bit_rate_du_value_minus1[32];
+
+    /**
+    * if 1, specifies that the HSS operates in a constant bit rate (CBR) mode;
+    * if 0, specifies that the HSS operates in an intermittent bit rate (non-CBR) mode
+    */
+    UWORD8  au1_cbr_flag[32];
+
+}sub_lyr_hrd_params_t;
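+
+/* Illustrative note (per the HRD derivations in Annex E of the HEVC spec,
+ * not code from this decoder): the actual bit rate and CPB size follow from
+ * the *_minus1 values above together with the scale fields of hrd_params_t:
+ *
+ *     BitRate[i] = (au4_bit_rate_value_minus1[i] + 1) << (6 + u4_bit_rate_scale)
+ *     CpbSize[i] = (au4_cpb_size_value_minus1[i] + 1) << (4 + u4_cpb_size_scale)
+ */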
+
+/**
+ * HRD parameters Info
+ */
+typedef struct
+{
+
+    /**
+    *   Indicates the presence of the
+    *   num_units_in_tick and time_scale syntax elements
+    */
+    UWORD8 u1_timing_info_present_flag;
+
+    /**
+    *   Number of units that
+    *   correspond to one increment of the
+    *   clock. Indicates the resolution
+    */
+    UWORD32 u4_num_units_in_tick;
+
+    /**
+    *   The number of time units that pass in one second
+    */
+    UWORD32 u4_time_scale;
+
+    /**
+    * Nal- hrd parameters flag
+    */
+    UWORD8 u1_nal_hrd_parameters_present_flag;
+
+    /**
+    * VCL- hrd parameters flag
+    */
+    UWORD8 u1_vcl_hrd_parameters_present_flag;
+
+    /**
+    * Indicates the presence of NAL-HRD params or VCL_HRD params
+    * in the bitstream
+    */
+    UWORD8 u1_cpbdpb_delays_present_flag;
+
+    /**
+    * specifies that sub-picture level CPB removal delay parameters are
+    * present in picture timing SEI messages
+    */
+    UWORD8 u1_sub_pic_cpb_params_present_flag;
+
+    /**
+    * specifies the clock sub-tick
+    * (the minimum interval of time that can be represented in the coded data when sub_pic_cpb_params_present_flag is equal to 1)
+    */
+    UWORD8 u1_tick_divisor_minus2;
+
+    /**
+    * specifies the length, in bits, of the DU CPB removal delay syntax element in the picture timing SEI
+    */
+    UWORD8 u1_du_cpb_removal_delay_increment_length_minus1;
+
+    /**
+    * Indicates presence of sub_pic_cpb_params in pic timing sei
+    */
+    UWORD8 u1_sub_pic_cpb_params_in_pic_timing_sei_flag;
+
+    /**
+    * specifies the length, in bits, of the pic_dpb_output_du_delay syntax
+    * element in the picture timing SEI message and the
+    * pic_spt_dpb_output_du_delay syntax element in the decoding unit
+    * information SEI message
+     */
+    UWORD8 u1_dpb_output_delay_du_length_minus1;
+
+    /**
+    * (together with bit_rate_value_minus1) specifies the
+    * maximum input bit rate of the i-th CPB
+    */
+    UWORD32 u4_bit_rate_scale;
+
+    /**
+    * (together with cpb_size_value_minus1) specifies
+    * CPB size of the i-th CPB when the CPB operates
+    * at the access unit level
+    */
+    UWORD32 u4_cpb_size_scale;
+
+    /**
+    * (together with cpb_size_du_value_minus1) specifies
+    * CPB size of the i-th CPB when the CPB operates
+    * at the sub-picture level
+    */
+    UWORD32 u4_cpb_size_du_scale;
+
+
+    /**
+    * specifies the length, in bits, of the initial CPB removal delay (NAL/VCL) syntax elements in the buffering period SEI
+    */
+    UWORD8  u1_initial_cpb_removal_delay_length_minus1;
+
+    /**
+    * specifies the length, in bits, of the AU CPB removal delay syntax element in the picture timing SEI
+    */
+    UWORD8  u1_au_cpb_removal_delay_length_minus1;
+
+    /**
+    * specifies the length, in bits, of the pic_dpb_output_delay syntax element in the pt SEI message
+    */
+    UWORD8  u1_dpb_output_delay_length_minus1;
+
+    /**
+    * if 1, for the highest temporal sub-layers, the temporal distance between the HRD output times
+    *  of consecutive pictures in output order is constrained; refer to Table E-6
+    */
+    UWORD8 au1_fixed_pic_rate_general_flag[6];
+
+    UWORD8 au1_fixed_pic_rate_within_cvs_flag[6];
+
+    /**
+    * if 1, for the highest temporal sub-layers, the temporal distance (in clock ticks) between the
+    * elemental units that specify HRD output times of consecutive pictures in output order is
+    * constrained; refer to Table E-6
+    */
+    UWORD8 au1_elemental_duration_in_tc_minus1[6];
+
+    /**
+    * specifies the HRD operational mode
+    */
+    UWORD8 au1_low_delay_hrd_flag[6];
+
+    /**
+    * plus 1 specifies the number of alternative CPB specifications in the
+    * bitstream of the cvs when HighestTid is equal to i
+    */
+    UWORD8 au1_cpb_cnt_minus1[6];
+
+
+    /**
+    *  VUI level Sub-layer HRD parameters
+    */
+    sub_lyr_hrd_params_t as_sub_layer_hrd_params[6];
+
+}hrd_params_t;
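+
+/* Illustrative note (standard timing derivation, not code from this decoder):
+ * the clock tick duration follows from the timing fields above as
+ *
+ *     ClockTick = u4_num_units_in_tick / (double)u4_time_scale;
+ *
+ * e.g. num_units_in_tick = 1001 with time_scale = 30000 gives a picture rate
+ * of 30000 / 1001 = 29.97 pictures per second. */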
+
+
+/**
+ * Structure to hold VUI parameters Info
+ */
+typedef struct
+{
+    /**
+    *  indicates the presence of aspect_ratio
+    */
+    UWORD8 u1_aspect_ratio_info_present_flag;
+
+    /**
+    *  specifies the aspect ratio of the luma samples
+    */
+    UWORD8 u1_aspect_ratio_idc;
+
+    /**
+    *  horizontal size of the sample aspect ratio (user dependent)
+    */
+    UWORD16 u2_sar_width;
+
+    /**
+    *  vertical size of the sample aspect ratio (user dependent)
+    */
+    UWORD16 u2_sar_height;
+
+    /**
+    * if 1, specifies that the overscan_appropriate_flag is present;
+    * if 0, the preferred display method for the video signal is unspecified
+    */
+    UWORD8 u1_overscan_info_present_flag;
+
+    /**
+    * if 1, indicates that the cropped decoded pictures output
+    * are suitable for display using overscan
+    */
+    UWORD8 u1_overscan_appropriate_flag;
+
+    /**
+    * if 1 specifies that video_format, video_full_range_flag and
+    * colour_description_present_flag are present
+    */
+    UWORD8 u1_video_signal_type_present_flag;
+
+    /**
+    * video_format
+    */
+    UWORD8 u1_video_format;
+
+    /**
+    * indicates the black level and range of the luma and chroma signals
+    */
+    UWORD8 u1_video_full_range_flag;
+
+    /**
+    * if 1, specifies that colour_primaries, transfer_characteristics
+    * and matrix_coefficients are present
+    */
+    UWORD8 u1_colour_description_present_flag;
+
+    /**
+    * indicates the chromaticity coordinates of the source primaries
+    */
+    UWORD8 u1_colour_primaries;
+
+    /**
+    * indicates the opto-electronic transfer characteristic of the source picture
+    */
+    UWORD8 u1_transfer_characteristics;
+
+    /**
+    * the matrix coefficients used in deriving luma and chroma signals
+    * from the green, blue, and red primaries
+    */
+    UWORD8 u1_matrix_coefficients;
+
+    /**
+    * if 1, specifies that chroma_sample_loc_type_top_field and
+    * chroma_sample_loc_type_bottom_field are present
+    */
+    UWORD8 u1_chroma_loc_info_present_flag;
+
+    /**
+    * location of chroma samples
+    */
+    UWORD8 u1_chroma_sample_loc_type_top_field;
+
+    UWORD8 u1_chroma_sample_loc_type_bottom_field;
+
+    /**
+    * if 1, indicates that the value of all decoded chroma samples is
+    * equal to 1 << ( BitDepthC - 1 )
+    */
+    UWORD8 u1_neutral_chroma_indication_flag;
+
+    /**
+    *  1 indicates that the coded video sequence conveys pictures that represent fields;
+    *  0 indicates that it conveys pictures that represent frames
+    */
+    UWORD8 u1_field_seq_flag;
+
+    /**
+    * specifies that picture timing SEI messages are present for every picture
+    */
+    UWORD8 u1_frame_field_info_present_flag;
+
+    /**
+    * 1 indicates that the default display window parameters follow next in the VUI
+    */
+    UWORD8 u1_default_display_window_flag;
+
+    /**
+    * specify the samples of the pictures in the coded video sequence
+    * that are within the default display window,
+    * in terms of a rectangular region specified in picture coordinates for display
+    */
+    UWORD32 u4_def_disp_win_left_offset;
+
+    UWORD32 u4_def_disp_win_right_offset;
+
+    UWORD32 u4_def_disp_win_top_offset;
+
+    UWORD32 u4_def_disp_win_bottom_offset;
+
+    /**
+    *  if 1, specifies that the hrd_parameters syntax structure is present in the vui_parameters syntax structure
+    */
+    UWORD8 u1_vui_hrd_parameters_present_flag;
+
+    /**
+    *  VUI level HRD parameters
+    */
+    hrd_params_t s_vui_hrd_parameters;
+
+    /**
+    *   Indicates the presence of the
+    *   num_units_in_tick and time_scale syntax elements
+    */
+    UWORD8 u1_vui_timing_info_present_flag;
+
+    /**
+    *   Number of units that
+    *   correspond to one increment of the
+    *   clock. Indicates the resolution
+    */
+    UWORD32 u4_vui_num_units_in_tick;
+
+    /**
+    *   The number of time units that pass in one second
+    */
+    UWORD32 u4_vui_time_scale;
+    /**
+    * if 1, indicates that the POC for each picture in the coded video sequence (cvs) (not the first picture), in decoding order,
+    * is proportional to the output time of the picture relative to that of the first picture in the cvs
+    */
+    UWORD8 u1_poc_proportional_to_timing_flag;
+
+    /**
+    * num_ticks_poc_diff_one_minus1 plus 1 specifies the number of clock ticks
+    * corresponding to a difference of poc values equal to 1
+    */
+    UWORD8 u1_num_ticks_poc_diff_one_minus1;
+
+    /**
+    * 1, specifies that the following cvs bitstream restriction parameters are present
+    */
+    UWORD8 u1_bitstream_restriction_flag;
+
+    /**
+    *  if 1, indicates that each pps that is active in the cvs has
+    *  the same value of the tile syntax elements
+    */
+    UWORD8 u1_tiles_fixed_structure_flag;
+
+    /**
+    * if 0, indicates that no pel outside the pic boundaries and
+    * no sub-pels derived using pels outside the pic boundaries are used for inter prediction
+    */
+    UWORD8 u1_motion_vectors_over_pic_boundaries_flag;
+
+    /**
+    * if 1, indicates
+    * all P/B slices belonging to the same pic have an identical refpic list0,
+    * all B slices that belong to the same picture have an identical refpic list1.
+    */
+    UWORD8 u1_restricted_ref_pic_lists_flag;
+
+    /**
+    *   min_spatial_segmentation_idc, when not equal to 0, establishes a bound on the maximum possible size of distinct
+    *   coded spatial segmentation regions in the pictures of the CVS. When min_spatial_segmentation_idc is not present, it is
+    *   inferred to be equal to 0. The value of min_spatial_segmentation_idc shall be in the range of 0 to 4095, inclusive.
+    *
+    *   It can be used by a decoder to calculate the maximum number of luma samples to be processed by one
+    *   processing thread, where minSpatialSegmentationTimes4 = min_spatial_segmentation_idc + 4:
+    *
+    *   If tiles=0 and entropy_sync=0 then
+    *       no slice shall exceed ( 4 * PicSizeInSamplesY ) / minSpatialSegmentationTimes4 luma samples
+    *
+    *   If tiles=1 and entropy_sync=0 then
+    *       no tile shall exceed ( 4 * PicSizeInSamplesY ) / minSpatialSegmentationTimes4 luma samples
+    *
+    *   If tiles=0 and entropy_sync=1 then
+    *       ( 2 * pic_height_in_luma_samples + pic_width_in_luma_samples ) * CtbSizeY
+    *               <= ( 4 * PicSizeInSamplesY ) / minSpatialSegmentationTimes4
+    */
+    UWORD32 u4_min_spatial_segmentation_idc;
+    /**
+    * Indicates a number of bytes not exceeded by the sum of the sizes of the VCL NAL units
+    * associated with any coded picture
+    */
+    UWORD8 u1_max_bytes_per_pic_denom;
+
+    /**
+    *  Indicates an upper bound for the number of bits of coding_unit() data
+    */
+    UWORD8 u1_max_bits_per_mincu_denom;
+
+    /**
+    * Indicate the maximum absolute value of a decoded horizontal MV component
+    * in quarter-pel luma units
+    */
+    UWORD8 u1_log2_max_mv_length_horizontal;
+
+    /**
+    * Indicate the maximum absolute value of a decoded vertical MV component
+    * in quarter-pel luma units
+    */
+    UWORD8 u1_log2_max_mv_length_vertical;
+
+
+}vui_t;
+
+
+/**
+ * Structure to hold SPS info
+ */
+typedef struct
+{
+    /**
+     * pic_width_in_luma_samples
+     */
+    WORD16 i2_pic_width_in_luma_samples;
+
+    /**
+     *  pic_height_in_luma_samples
+     */
+    WORD16 i2_pic_height_in_luma_samples;
+
+    /**
+     *  pic_crop_left_offset
+     */
+    WORD16 i2_pic_crop_left_offset;
+
+    /**
+     *  pic_crop_right_offset
+     */
+    WORD16 i2_pic_crop_right_offset;
+
+    /**
+     *  pic_crop_top_offset
+     */
+    WORD16 i2_pic_crop_top_offset;
+
+    /**
+     *  pic_crop_bottom_offset
+     */
+    WORD16 i2_pic_crop_bottom_offset;
+
+    /**
+     *  seq_parameter_set_id
+     */
+    WORD8 i1_sps_id;
+
+    /**
+     *  video_parameter_set_id
+     */
+    WORD8 i1_vps_id;
+
+    /**
+     *  sps_max_sub_layers_minus1
+     */
+    WORD8 i1_sps_max_sub_layers;
+
+    /**
+     *  chroma_format_idc
+     */
+    WORD8 i1_chroma_format_idc;
+
+    /**
+     * Bit depth of luma samples
+     */
+    WORD8 i1_bit_depth_luma_minus8;
+
+    /**
+     * Bit depth of chroma samples
+     */
+    WORD8 i1_bit_depth_chroma_minus8;
+
+    /* separate_colour_plane_flag */
+    WORD8 i1_separate_colour_plane_flag;
+
+    /**
+     *  pic_cropping_flag
+     */
+    WORD8 i1_pic_cropping_flag;
+
+    /**
+     *  pcm_enabled_flag
+     */
+    WORD8 i1_pcm_enabled_flag;
+
+    /**
+     *  pcm_sample_bit_depth_luma
+     */
+    WORD8 i1_pcm_sample_bit_depth_luma;
+
+    /**
+     *  pcm_sample_bit_depth_chroma
+     */
+    WORD8 i1_pcm_sample_bit_depth_chroma;
+
+    /**
+     *  log2_max_pic_order_cnt_lsb_minus4
+     */
+    WORD8 i1_log2_max_pic_order_cnt_lsb;
+    /**
+     * sps_sub_layer_ordering_info_present_flag
+     */
+    WORD8 i1_sps_sub_layer_ordering_info_present_flag;
+    /**
+     *  sps_max_dec_pic_buffering
+     */
+    WORD8 ai1_sps_max_dec_pic_buffering[SPS_MAX_SUB_LAYERS];
+
+    /**
+     *  sps_max_num_reorder_pics
+     */
+    WORD8 ai1_sps_max_num_reorder_pics[SPS_MAX_SUB_LAYERS];
+
+    /**
+     *  sps_max_latency_increase
+     */
+    WORD8 ai1_sps_max_latency_increase[SPS_MAX_SUB_LAYERS];
+
+    /**
+     *  log2_min_coding_block_size_minus3
+     */
+    WORD8 i1_log2_min_coding_block_size;
+
+    /**
+     *  log2_diff_max_min_coding_block_size
+     */
+    WORD8 i1_log2_diff_max_min_coding_block_size;
+
+    /**
+     *  log2_min_transform_block_size_minus2
+     */
+    WORD8 i1_log2_min_transform_block_size;
+
+    /**
+     *  log2_diff_max_min_transform_block_size
+     */
+    WORD8 i1_log2_diff_max_min_transform_block_size;
+
+    /**
+     *  log2_min_pcm_coding_block_size_minus3
+     */
+    WORD8 i1_log2_min_pcm_coding_block_size;
+
+    /**
+     *  log2_diff_max_min_pcm_coding_block_size
+     */
+    WORD8 i1_log2_diff_max_min_pcm_coding_block_size;
+
+    /**
+     *  max_transform_hierarchy_depth_inter
+     */
+    WORD8 i1_max_transform_hierarchy_depth_inter;
+
+    /**
+     *  max_transform_hierarchy_depth_intra
+     */
+    WORD8 i1_max_transform_hierarchy_depth_intra;
+
+    /**
+     *  scaling_list_enable_flag
+     */
+    WORD8 i1_scaling_list_enable_flag;
+
+    /**
+     *  sps_scaling_list_data_present_flag
+     */
+    WORD8 i1_sps_scaling_list_data_present_flag;
+
+    /**
+     *  amp_enabled_flag
+     */
+    WORD8 i1_amp_enabled_flag;
+
+    /**
+     *  sample_adaptive_offset_enabled_flag
+     */
+    WORD8 i1_sample_adaptive_offset_enabled_flag;
+
+    /**
+     *  pcm_loop_filter_disable_flag
+     */
+    WORD8 i1_pcm_loop_filter_disable_flag;
+
+    /**
+     *  sps_temporal_id_nesting_flag
+     */
+    WORD8 i1_sps_temporal_id_nesting_flag;
+
+    /**
+     *  num_short_term_ref_pic_sets
+     */
+    WORD8 i1_num_short_term_ref_pic_sets;
+
+    /**
+     *  long_term_ref_pics_present_flag
+     */
+    WORD8 i1_long_term_ref_pics_present_flag;
+
+    /**
+     *  num_long_term_ref_pics_sps
+     */
+    WORD8 i1_num_long_term_ref_pics_sps;
+
+    /**
+     *  lt_ref_pic_poc_lsb_sps[]
+     */
+    WORD8 ai1_lt_ref_pic_poc_lsb_sps[MAX_LTREF_PICS_SPS];
+
+    /**
+     *  used_by_curr_pic_lt_sps_flag[]
+     */
+    WORD8 ai1_used_by_curr_pic_lt_sps_flag[MAX_LTREF_PICS_SPS];
+
+    /**
+     *  sps_temporal_mvp_enable_flag
+     */
+    WORD8 i1_sps_temporal_mvp_enable_flag;
+
+    /**
+     * strong_intra_smoothing_enable_flag
+     */
+    WORD8 i1_strong_intra_smoothing_enable_flag;
+
+    /**
+     *  vui_parameters_present_flag
+     */
+    WORD8 i1_vui_parameters_present_flag;
+
+    /**
+     * vui parameters Structure info
+     */
+    vui_t s_vui_parameters;
+
+    /**
+     *  Log2(CTB Size) in luma units
+     */
+
+    WORD8 i1_log2_ctb_size;
+
+    /**
+     * Maximum transform block size
+     */
+    WORD8 i1_log2_max_transform_block_size;
+
+    /**
+     *  Picture width in CTB units
+     */
+
+    WORD16 i2_pic_wd_in_ctb;
+
+    /**
+     *  Picture height in CTB units
+     */
+
+    WORD16 i2_pic_ht_in_ctb;
+
+    /**
+     * Picture width in min CB units
+     */
+
+    WORD16 i2_pic_wd_in_min_cb;
+
+    /**
+     *  Picture height in min CB units
+     */
+
+    WORD16 i2_pic_ht_in_min_cb;
+
+    /**
+     *  Picture size in CTB units
+     */
+    WORD32 i4_pic_size_in_ctb;
+
+    /**
+     *  Profile, Tier and Level info
+     */
+
+    profile_tier_lvl_info_t s_ptl;
+
+    /**
+     *  Short term reference pic set
+     */
+    stref_picset_t as_stref_picset[MAX_STREF_PICS_SPS];
+
+    /**
+     *  Pointer to scaling matrix
+     */
+    /*************************************************************************/
+    /* Contains the matrices in the following order in a 1D buffer           */
+    /* Intra 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
+    /* Inter 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
+    /* Intra 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
+    /* Inter 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
+    /* Intra 16x16 Y, 16x16 U, 16x16 V                                       */
+    /* Inter 16x16 Y, 16x16 U, 16x16 V                                       */
+    /* Intra 32x32 Y                                                         */
+    /* Inter 32x32 Y                                                         */
+    /*************************************************************************/
+    WORD16 *pi2_scaling_mat;
+
+    /*
+     * Flag indicating if the SPS is parsed
+     */
+    WORD8 i1_sps_valid;
+
+}sps_t;
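+
+/* Illustrative sketch (follows the HEVC size derivations; the helper itself
+ * is an assumption, not a function from this decoder): the derived fields at
+ * the end of sps_t follow from the parsed syntax elements as below. */
+static inline void example_derive_ctb_fields(sps_t *ps_sps)
+{
+    WORD32 ctb_size;
+
+    /* CtbLog2SizeY = log2 min CB size + log2 diff max-min CB size */
+    ps_sps->i1_log2_ctb_size = (WORD8)(ps_sps->i1_log2_min_coding_block_size +
+                    ps_sps->i1_log2_diff_max_min_coding_block_size);
+    ctb_size = 1 << ps_sps->i1_log2_ctb_size;
+
+    /* Picture dimensions in CTB units, rounded up */
+    ps_sps->i2_pic_wd_in_ctb = (WORD16)((ps_sps->i2_pic_width_in_luma_samples +
+                    ctb_size - 1) >> ps_sps->i1_log2_ctb_size);
+    ps_sps->i2_pic_ht_in_ctb = (WORD16)((ps_sps->i2_pic_height_in_luma_samples +
+                    ctb_size - 1) >> ps_sps->i1_log2_ctb_size);
+    ps_sps->i4_pic_size_in_ctb = ps_sps->i2_pic_wd_in_ctb *
+                    ps_sps->i2_pic_ht_in_ctb;
+}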
+
+/**
+ * Structure to hold PPS info
+ */
+typedef struct
+{
+    /**
+     *  Pointer to scaling matrix
+     */
+    /*************************************************************************/
+    /* Contains the matrices in the following order in a 1D buffer           */
+    /* Intra 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
+    /* Inter 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
+    /* Intra 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
+    /* Inter 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
+    /* Intra 16x16 Y, 16x16 U, 16x16 V                                       */
+    /* Inter 16x16 Y, 16x16 U, 16x16 V                                       */
+    /* Intra 32x32 Y                                                         */
+    /* Inter 32x32 Y                                                         */
+    /*************************************************************************/
+    WORD16 *pi2_scaling_mat;
+
+    /**
+     *  Pointer to an array containing tile info such as position, width, height
+     *  of each tile
+     */
+
+    /* column_width_minus1[ i ] and row_height_minus1[ i ] */
+    tile_t *ps_tile;
+
+    /**
+     *  pic_parameter_set_id
+     */
+    WORD8 i1_pps_id;
+
+    /**
+     *  seq_parameter_set_id
+     */
+    WORD8 i1_sps_id;
+
+    /**
+     *  sign_data_hiding_flag
+     */
+    WORD8 i1_sign_data_hiding_flag;
+
+    /**
+     *  cabac_init_present_flag
+     */
+    WORD8 i1_cabac_init_present_flag;
+
+    /**
+     *  num_ref_idx_l0_default_active_minus1
+     */
+    WORD8 i1_num_ref_idx_l0_default_active;
+
+    /**
+     * num_ref_idx_l1_default_active_minus1
+     */
+    WORD8 i1_num_ref_idx_l1_default_active;
+
+    /**
+     *  pic_init_qp_minus26
+     */
+    WORD8 i1_pic_init_qp;
+
+    /**
+     *  constrained_intra_pred_flag
+     */
+    WORD8 i1_constrained_intra_pred_flag;
+
+    /**
+     *  transform_skip_enabled_flag
+     */
+    WORD8 i1_transform_skip_enabled_flag;
+
+    /**
+     *  cu_qp_delta_enabled_flag
+     */
+    WORD8 i1_cu_qp_delta_enabled_flag;
+
+    /**
+     * diff_cu_qp_delta_depth
+     */
+    WORD8 i1_diff_cu_qp_delta_depth;
+
+    /**
+     *  pic_cb_qp_offset
+     */
+    WORD8 i1_pic_cb_qp_offset;
+
+    /**
+     *  pic_cr_qp_offset
+     */
+    WORD8 i1_pic_cr_qp_offset;
+
+    /**
+     *  pic_slice_level_chroma_qp_offsets_present_flag
+     */
+    WORD8 i1_pic_slice_level_chroma_qp_offsets_present_flag;
+
+    /**
+     *  weighted_pred_flag
+     */
+    WORD8 i1_weighted_pred_flag;
+
+    /**
+     *  weighted_bipred_flag
+     */
+    WORD8 i1_weighted_bipred_flag;
+
+    /**
+     *  output_flag_present_flag
+     */
+    WORD8 i1_output_flag_present_flag;
+
+    /**
+     *  transquant_bypass_enable_flag
+     */
+    WORD8 i1_transquant_bypass_enable_flag;
+
+    /**
+     *  dependent_slice_enabled_flag
+     */
+    WORD8 i1_dependent_slice_enabled_flag;
+
+    /**
+     *  tiles_enabled_flag
+     */
+    WORD8 i1_tiles_enabled_flag;
+
+    /**
+     *  entropy_coding_sync_enabled_flag
+     */
+    WORD8 i1_entropy_coding_sync_enabled_flag;
+
+    /**
+     * entropy_slice_enabled_flag
+     */
+    WORD8 i1_entropy_slice_enabled_flag;
+
+    /**
+     *  num_tile_columns_minus1
+     */
+    WORD8 i1_num_tile_columns;
+
+    /**
+     *  num_tile_rows_minus1
+     */
+    WORD8 i1_num_tile_rows;
+
+    /**
+     *  uniform_spacing_flag
+     */
+    WORD8 i1_uniform_spacing_flag;
+
+    /**
+     *  loop_filter_across_tiles_enabled_flag
+     */
+    WORD8 i1_loop_filter_across_tiles_enabled_flag;
+
+    /**
+     *  loop_filter_across_slices_enabled_flag
+     */
+    WORD8 i1_loop_filter_across_slices_enabled_flag;
+
+    /**
+     *  deblocking_filter_control_present_flag
+     */
+    WORD8 i1_deblocking_filter_control_present_flag;
+
+    /**
+     *  deblocking_filter_override_enabled_flag
+     */
+    WORD8 i1_deblocking_filter_override_enabled_flag;
+
+    /**
+     *  pic_disable_deblocking_filter_flag
+     */
+    WORD8 i1_pic_disable_deblocking_filter_flag;
+
+    /**
+     *  beta_offset_div2
+     */
+    WORD8 i1_beta_offset_div2;
+
+    /**
+     *  tc_offset_div2
+     */
+    WORD8 i1_tc_offset_div2;
+
+    /**
+     *  pps_scaling_list_data_present_flag
+     */
+    WORD8 i1_pps_scaling_list_data_present_flag;
+
+    /**
+     * lists_modification_present_flag
+     */
+    WORD8 i1_lists_modification_present_flag;
+
+    /**
+     * num_extra_slice_header_bits
+     */
+    WORD8 i1_num_extra_slice_header_bits;
+
+    /**
+     *  log2_parallel_merge_level_minus2
+     */
+    WORD8 i1_log2_parallel_merge_level;
+
+    /**
+     *  slice_header_extension_present_flag
+     */
+    WORD8 i1_slice_header_extension_present_flag;
+
+    /**
+     *  slice_extension_present_flag
+     */
+    WORD8 i1_slice_extension_present_flag;
+
+    /**
+     *  scaling_list_dc_coef_minus8
+     */
+    /*************************************************************************/
+    /* DC value of the scaling list                                          */
+    /* Only 16 x 16 and 32 x 32 scaling lists have valid entries.            */
+    /* Entries stored for all sizes for uniformity.                          */
+    /* Remaining will be initialized to default values if used               */
+    /*************************************************************************/
+    UWORD8 au1_scaling_list_dc_coef[TOTAL_SCALE_MAT_COUNT];
+
+    /**
+     * Log2MinCuQpDeltaSize, derived as CtbLog2SizeY - diff_cu_qp_delta_depth
+     */
+    WORD8 i1_log2_min_cu_qp_delta_size;
+
+    /*
+     * Flag indicating if the PPS is parsed
+     */
+    WORD8 i1_pps_valid;
+
+}pps_t;
+
+
+
+/**
+ * Buffering Period SEI parameters Info
+ */
+typedef struct
+{
+    /**
+    * specifies the SPS Id active for the coded picture associated
+    * with the buffering period SEI message.
+    */
+    UWORD8  u1_sps_id;
+
+    /**
+    * Derived from Hrd parameters
+    */
+    UWORD8  u1_sub_pic_cpb_params_present_flag;
+
+    /**
+    * specifies the presence of the initial_alt_cpb_removal_delay[ i ]
+    * and initial_alt_cpb_removal_offset[ i ] syntax elements
+    */
+    UWORD8  u1_rap_cpb_params_present_flag;
+
+    /**
+    * CPB removal delay offset used in the buffering period SEI
+    */
+    UWORD32 cpb_delay_offset;
+
+    /**
+    * DPB output delay offset used in the buffering period SEI
+    */
+    UWORD32 dpb_delay_offset;
+
+    /**
+    * concatenation flag
+    */
+    UWORD8 concatenation_flag;
+
+    /**
+    * delta CPB removal delay
+    */
+    UWORD32 au_cpb_removal_delay_delta_minus1;
+
+    /**
+    * specify the default initial CPB removal delays, respectively,
+    * for the CPB when the NAL HRD parameters are in use
+    */
+    UWORD32 au4_nal_initial_cpb_removal_delay[32];
+
+    /**
+    * specify the alternate initial CPB removal delays, respectively,
+    * for the CPB when the NAL HRD parameters are in use
+    */
+    UWORD32 au4_nal_initial_alt_cpb_removal_delay[32];
+
+    /**
+    * specify the initial CPB removal delay offset, respectively,
+    * for the CPB when the NAL HRD parameters are in use
+    */
+    UWORD32 au4_nal_initial_cpb_removal_delay_offset[32];
+
+    /**
+    * specify the alternate initial CPB removal delays offsets, respectively,
+    * for the CPB when the NAL HRD parameters are in use
+    */
+    UWORD32 au4_nal_initial_alt_cpb_removal_delay_offset[32];
+
+    /**
+    * specify the default initial CPB removal delays, respectively,
+    * for the CPB when the VCL HRD parameters are in use
+    */
+    UWORD32 au4_vcl_initial_cpb_removal_delay[32];
+
+    /**
+    * specify the initial alt CPB removal delays , respectively,
+    * for the CPB when the VCL HRD parameters are in use
+    */
+    UWORD32 au4_vcl_initial_alt_cpb_removal_delay[32];
+
+    /**
+    * specify the initial CPB removal delay offset, respectively,
+    * for the CPB when the VCL HRD parameters are in use
+    */
+    UWORD32 au4_vcl_initial_cpb_removal_delay_offset[32];
+
+    /**
+    * specify the alternate initial CPB removal delays offsets, respectively,
+    * for the CPB when the VCL HRD parameters are in use
+    */
+    UWORD32 au4_vcl_initial_alt_cpb_removal_delay_offset[32];
+
+    /**
+    *  Initial CPB removal delay length
+    */
+    UWORD32 u4_initial_cpb_removal_delay_length;
+
+    /**
+    *  CPB count for the corresponding sub-layer
+    */
+    UWORD32 u4_cpb_cnt;
+
+
+    /**
+    * VBV buffer size used in buffering period SEI
+    */
+    UWORD32 u4_buffer_size_sei;
+
+    /**
+    * Encoder buffer fullness  used in buffering period SEI
+    */
+    UWORD32 u4_ebf_sei;
+
+    /**
+    * target bitrate used in buffering period SEI
+    */
+    UWORD32 u4_target_bit_rate_sei;
+
+
+
+
+}buf_period_sei_params_t;
+
+
+/**
+ * Picture Timing SEI parameters Info
+ */
+typedef struct
+{
+    /**
+    * derived from vui parameters
+    */
+    UWORD8 u1_frame_field_info_present_flag;
+
+    /**
+    * indicates whether a picture should be displayed as a
+    * frame or as one or more fields
+    */
+    UWORD32 u4_pic_struct;
+
+    UWORD8  u1_num_clk_ticks;
+
+    /**
+    * indicates whether the scan type of the pic should be interpreted
+    * as progressive or interlaced
+    */
+    UWORD8 u1_progressive_source_idc;
+
+    /**
+    * if 1, indicates that the current pic is a duplicate pic in output order
+    */
+    UWORD8 u1_duplicate_flag;
+
+    /**
+    * specifies the number of clock ticks between the nominal CPB removal time
+    * of the au associated with the pt SEI message and
+    * the preceding au in decoding order that contained a bp SEI message
+    */
+    UWORD32 u4_au_cpb_removal_delay_minus1;
+
+    /**
+    * compute the DPB output time of the picture
+    */
+    UWORD32 u4_pic_dpb_output_delay;
+
+    UWORD32 u4_pic_dpb_output_du_delay;
+
+    /**
+    * specifies the number of decoding units in the access unit
+    * the picture timing SEI message is associated with
+    */
+    UWORD32 u4_num_decoding_units_minus1;
+
+    /**
+    * if 1 specifies that the du_common_cpb_removal_delay_increment_minus1 is present
+    */
+    UWORD32 u4_du_common_cpb_removal_delay_flag;
+
+    /**
+    * specifies the duration, in units of clock sub-ticks,
+    * between the nominal CPB removal times of any two consecutive decoding units
+    * in decoding order in the access unit associated with the pt_SEI message
+    */
+    UWORD32 u4_du_common_cpb_removal_delay_increment_minus1; //same as u4_du_cpb_removal_delay_increment_minus1
+
+    /**
+    * specifies the number of NAL units in the decoding unit of the access unit
+    * the picture timing SEI message is associated with.
+    * ranges from 0 to (pic size in CTBs - 1)
+    */
+    UWORD32 u4_num_nalus_in_du_minus1;
+
+    /**
+    * specifies the duration, in units of clock sub-ticks,
+    * between the nominal CPB removal times of the ( i + 1 )-th decoding unit and the i-th decoding unit,
+    * in decoding order, in the access unit associated with the pt_SEI message
+    */
+    UWORD32 u4_du_cpb_removal_delay_increment_minus1;
+
+
+}pic_timing_sei_params_t;
+
+/**
+ * Structure to hold Recovery point SEI parameters Info
+ */
+typedef struct
+{
+    /**
+    * specifies the recovery point of output pictures in output order
+    */
+    WORD32 i4_recovery_poc_cnt;
+
+    UWORD8 u1_exact_match_flag;
+
+    /**
+    * indicates the presence or absence of a broken link in the NAL unit
+    * stream at the location of the recovery point SEI message
+    */
+
+    UWORD8 u1_broken_link_flag;
+
+}recovery_point_sei_params_t;
+/**
+ * Structure to hold active parameter set SEI parameters info
+ */
+typedef struct
+{
+    /*
+    * active vps id
+    */
+
+    UWORD8 u1_active_video_parameter_set_id;
+
+    /*
+    * default set to zero.
+    */
+    UWORD8 u1_self_contained_cvs_flag;
+
+    UWORD8 u1_no_parameter_set_update_flag;
+
+    UWORD8 u1_num_sps_ids_minus1;
+
+    /*
+    * active sps id
+    */
+    UWORD8 au1_active_seq_parameter_set_id[15];
+
+}active_parameter_set_sei_param_t;
+
+/**
+ * Structure to hold SEI parameters Info
+ */
+typedef struct
+{
+
+    WORD8 i1_sei_parameters_present_flag;
+
+    WORD8 i1_aud_present_flag;
+
+    WORD8 i1_buf_period_params_present_flag;
+
+    WORD8 i1_pic_timing_params_present_flag;
+
+    WORD8 i1_recovery_point_params_present_flag;
+
+    buf_period_sei_params_t  s_buf_period_sei_params;
+
+    pic_timing_sei_params_t  s_pic_timing_sei_params;
+
+    recovery_point_sei_params_t s_recovery_point_params;
+
+    active_parameter_set_sei_param_t s_active_parameter_set_sei_params;
+
+
+}sei_params_t;
+
+
+
+/**
+ * Structure to hold slice header info
+ */
+typedef struct
+{
+    /**
+     *  entry_point_offset[ i ]
+     */
+    WORD32 *pi4_entry_point_offset;
+
+    /**
+     *  poc_lsb_lt[ i ]
+     */
+    WORD32 ai4_poc_lsb_lt[MAX_DPB_SIZE];
+
+    /**
+     *  slice_header_extension_length
+     */
+    WORD16 i2_slice_header_extension_length;
+
+    /**
+     *  slice_address
+     */
+    WORD16 i2_slice_address;
+
+    /**
+     *  first_slice_in_pic_flag
+     */
+    WORD8 i1_first_slice_in_pic_flag;
+
+    /* PPS id */
+    WORD8 i1_pps_id;
+    /**
+     *  no_output_of_prior_pics_flag
+     */
+    WORD8 i1_no_output_of_prior_pics_flag;
+
+    /**
+     *  dependent_slice_flag
+     */
+    WORD8 i1_dependent_slice_flag;
+
+    /**
+     *  slice_type
+     */
+    WORD8 i1_slice_type;
+
+    /**
+     *  pic_output_flag
+     */
+    WORD8 i1_pic_output_flag;
+
+    /**
+     *  colour_plane_id
+     */
+    WORD8 i1_colour_plane_id;
+
+    /**
+     *  pic_order_cnt_lsb
+     */
+    WORD32 i4_pic_order_cnt_lsb;
+
+    /**
+     *  absolute pic_order_cnt
+     */
+    WORD32 i4_abs_pic_order_cnt;
+
+    /**
+     *  short_term_ref_pic_set_sps_flag
+     */
+    WORD8 i1_short_term_ref_pic_set_sps_flag;
+
+    /**
+     *  short_term_ref_pic_set_idx
+     */
+    WORD8 i1_short_term_ref_pic_set_idx;
+
+    /**
+     *  num_long_term_sps
+     */
+    WORD8 i1_num_long_term_sps;
+
+    /**
+     *  num_long_term_pics
+     */
+    WORD8 i1_num_long_term_pics;
+
+    /**
+     *  lt_idx_sps[ i ]
+     */
+    WORD8 ai1_lt_idx_sps[MAX_DPB_SIZE];
+
+    /**
+     *  used_by_curr_pic_lt_flag[ i ]
+     */
+    WORD8 ai1_used_by_curr_pic_lt_flag[MAX_DPB_SIZE];
+
+    /**
+     *  delta_poc_msb_present_flag[ i ]
+     */
+    WORD8 ai1_delta_poc_msb_present_flag[MAX_DPB_SIZE];
+
+    /**
+     *  delta_poc_msb_cycle_lt[ i ]
+     */
+    WORD8 ai1_delta_poc_msb_cycle_lt[MAX_DPB_SIZE];
+
+    /**
+     *  slice_sao_luma_flag
+     */
+    WORD8 i1_slice_sao_luma_flag;
+
+    /**
+     *  slice_sao_chroma_flag
+     */
+    WORD8 i1_slice_sao_chroma_flag;
+
+    /**
+     *  slice_temporal_mvp_enable_flag
+     */
+    WORD8 i1_slice_temporal_mvp_enable_flag;
+
+    /**
+     *  num_ref_idx_active_override_flag
+     */
+    WORD8 i1_num_ref_idx_active_override_flag;
+
+    /**
+     *  num_ref_idx_l0_active_minus1
+     */
+    WORD8 i1_num_ref_idx_l0_active;
+
+    /**
+     *  num_ref_idx_l1_active_minus1
+     */
+    WORD8 i1_num_ref_idx_l1_active;
+
+    /**
+     *  mvd_l1_zero_flag
+     */
+    WORD8 i1_mvd_l1_zero_flag;
+
+    /**
+     *  cabac_init_flag
+     */
+    WORD8 i1_cabac_init_flag;
+
+    /**
+     *  collocated_from_l0_flag
+     */
+    WORD8 i1_collocated_from_l0_flag;
+
+    /**
+     *  collocated_ref_idx
+     */
+    WORD8 i1_collocated_ref_idx;
+
+    /**
+     * five_minus_max_num_merge_cand
+     */
+    WORD8 i1_max_num_merge_cand;
+
+    /**
+     *  slice_qp_delta
+     */
+    WORD8 i1_slice_qp_delta;
+
+    /**
+     *  slice_cb_qp_offset
+     */
+    WORD8 i1_slice_cb_qp_offset;
+
+    /**
+     *  slice_cr_qp_offset
+     */
+    WORD8 i1_slice_cr_qp_offset;
+
+    /**
+     *  deblocking_filter_override_flag
+     */
+    WORD8 i1_deblocking_filter_override_flag;
+
+    /**
+     *  slice_disable_deblocking_filter_flag
+     */
+    WORD8 i1_slice_disable_deblocking_filter_flag;
+
+    /**
+     *  beta_offset_div2
+     */
+    WORD8 i1_beta_offset_div2;
+
+    /**
+     *  tc_offset_div2
+     */
+    WORD8 i1_tc_offset_div2;
+
+    /**
+     *  slice_loop_filter_across_slices_enabled_flag
+     */
+    WORD8 i1_slice_loop_filter_across_slices_enabled_flag;
+
+    /**
+     *  Number of entry point offsets
+     */
+    WORD32 i4_num_entry_point_offsets;
+
+    /**
+     *  offset_len_minus1
+     */
+    WORD8 i1_offset_len;
+
+    /**
+     *  Entry point offsets
+     */
+    WORD32 *pu4_entry_point_offset;
+
+    /**
+     * Short term reference picture set
+     */
+    stref_picset_t s_stref_picset;
+
+    /**
+     *  Weight and offset info for Weighted prediction
+     */
+    pred_wt_ofst_t s_wt_ofst;
+
+    /**
+     *  Reference prediction list modification
+     */
+    rplm_t s_rplm;
+
+    /**
+     *  First CTB's X pos : slice_address % i2_pic_wd_in_ctb
+     */
+    WORD16 i2_ctb_x;
+
+    /**
+     *  First CTB's Y pos : slice_address / i2_pic_wd_in_ctb
+     */
+    WORD16 i2_ctb_y;
+
+    /**
+     * L0 Reference pic lists
+     */
+    ref_list_t as_ref_pic_list0[MAX_DPB_SIZE];
+
+    /**
+     * L1 Reference pic lists
+     */
+    ref_list_t as_ref_pic_list1[MAX_DPB_SIZE];
+
+    /**
+     * NAL unit type of the slice
+     */
+    WORD8 i1_nal_unit_type;
+
+    /**
+     * Low delay check flag
+     */
+    WORD8 i1_low_delay_flag;
+
+    /**
+     * The last independent slice's start ctb_x
+     * If the current slice is independent, it is the same as the current slice's ctb_x
+     */
+    WORD16 i2_independent_ctb_x;
+
+    /**
+     * The last independent slice's start ctb_y
+     * If the current slice is independent, it is the same as the current slice's ctb_y
+     */
+    WORD16 i2_independent_ctb_y;
+
+    UWORD8 u1_parse_data_init_done;
+
+}slice_header_t;
+
+
+#if 0
+
+typedef struct
+{
+
+    /* scaling_list_pred_mode_flag */
+    WORD8 i1_scaling_list_pred_mode_flag;
+
+    /* scaling_list_pred_matrix_id_delta */
+    WORD8 i1_scaling_list_pred_matrix_id_delta;
+
+}sld_t;
+
+typedef struct
+{
+    /* scaling_list_dc_coef_minus8[ sizeID - 2 ][ matrixID ] */
+    WORD8 i1_scaling_list_dc_coef[ sizeID - 2 ][ matrixID ];
+
+    /* scaling_list_delta_coef */
+    WORD8 i1_scaling_list_delta_coef;
+
+}slm_t;
+
+typedef struct
+{
+
+    /* last_payload_type_byte */
+    UWORD8 i1_last_payload_type_byte;
+
+    /* last_payload_size_byte */
+    UWORD8 last_payload_size_byte;
+}sei_t;
+
+typedef struct
+{
+    /* pic_type*/
+    WORD8 pic_type;
+}aud_t;
+
+typedef struct
+{
+    /* slice_extention_flag */
+    WORD8 i1_slice_extention_flag;
+
+    /* slice_extension_data_flag */
+    WORD8 i1_slice_extension_data_flag;
+
+}slr_t;
+
+typedef struct
+{
+    /* op_num_layer_id_values_minus1[ opIdx ] */
+    WORD8 i1_op_num_layer_id_values_minus1[VPS_MAX_HRD_PARAMS];
+
+    /* op_layer_id[ opIdx ][ i ] */
+    WORD8 i1_op_layer_id[VPS_MAX_HRD_PARAMS][VPS_MAX_OP_LAYERS];
+}op_point_t;
+
+
+typedef struct
+{
+}sds_t;
+
+#endif
+
+
+
+
+
+
+#endif /* _IHEVC_STRUCTS_H_ */
diff --git a/common/ihevc_tables_x86_intr.h b/common/ihevc_tables_x86_intr.h
new file mode 100644
index 0000000..4ded3ea
--- /dev/null
+++ b/common/ihevc_tables_x86_intr.h
@@ -0,0 +1,70 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_tables_x86_intr.h
+*
+* @brief
+*  Declarations of the tables used by the x86 intrinsic versions of the intra prediction and deblocking functions
+*
+* @author
+*  Mamatha
+*
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVC_TABLES_X86_INTR_H_
+#define IHEVC_TABLES_X86_INTR_H_
+
+
+//Luma intra pred
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY1[16];
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY2[16];
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY3[16];
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASK4[16];
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASK5[16];
+//Chroma intra pred
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY7[16];
+
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY8[16];
+
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY9[16];
+
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY11[16];
+
+extern MEM_ALIGN16 const UWORD8 inv_angle_shuffle[7][32];
+// DEBLOCK TABLES
+extern MEM_ALIGN16 const WORD8 coef_d[16];
+extern MEM_ALIGN16 const WORD8 coef_de1[16];
+extern MEM_ALIGN16 const WORD8 coef_dep1[16];
+extern MEM_ALIGN16 const WORD32 shuffle_d[4];
+extern const WORD32 shuffle0[2];
+extern MEM_ALIGN16 const WORD32 shuffle1[4];
+extern MEM_ALIGN16 const WORD32 shuffle2[4];
+extern MEM_ALIGN16 const WORD32 shuffle3[4];
+
+extern MEM_ALIGN16 const WORD8 delta0[16];
+extern MEM_ALIGN16 const WORD8 delta1[16];
+extern MEM_ALIGN16 const WORD32 shuffle_uv[4];
+
+#endif /*IHEVC_TABLES_X86_INTR_H_*/
diff --git a/common/ihevc_trans.h b/common/ihevc_trans.h
new file mode 100644
index 0000000..45cc6b8
--- /dev/null
+++ b/common/ihevc_trans.h
@@ -0,0 +1,75 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_trans.h
+*
+* @brief
+*  Function declarations for the forward transform
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_TRANS_H_
+#define _IHEVC_TRANS_H_
+
+typedef void ihevc_trans_4x4_ttype1_ft(WORD16 *pi2_src,
+                                       WORD16 *pi2_dst,
+                                       WORD32 i4_src_strd,
+                                       WORD32 i4_dst_strd,
+                                       WORD32 i4_shift,
+                                       WORD32 i4_zero_rows);
+typedef void ihevc_trans_4x4_ft(WORD16 *pi2_src,
+                                WORD16 *pi2_dst,
+                                WORD32 i4_src_strd,
+                                WORD32 i4_dst_strd,
+                                WORD32 i4_shift,
+                                WORD32 i4_zero_rows);
+typedef void ihevc_trans_8x8_ft(WORD16 *pi2_src,
+                                WORD16 *pi2_dst,
+                                WORD32 i4_src_strd,
+                                WORD32 i4_dst_strd,
+                                WORD32 i4_shift,
+                                WORD32 i4_zero_rows);
+typedef void ihevc_trans_16x16_ft(WORD16 *pi2_src,
+                                  WORD16 *pi2_dst,
+                                  WORD32 i4_src_strd,
+                                  WORD32 i4_dst_strd,
+                                  WORD32 i4_shift,
+                                  WORD32 i4_zero_rows);
+typedef void ihevc_trans_32x32_ft(WORD16 *pi2_src,
+                                  WORD16 *pi2_dst,
+                                  WORD32 i4_src_strd,
+                                  WORD32 i4_dst_strd,
+                                  WORD32 i4_shift,
+                                  WORD32 i4_zero_rows);
+
+ihevc_trans_4x4_ttype1_ft ihevc_trans_4x4_ttype1;
+ihevc_trans_4x4_ft ihevc_trans_4x4;
+ihevc_trans_8x8_ft ihevc_trans_8x8;
+ihevc_trans_16x16_ft ihevc_trans_16x16;
+ihevc_trans_32x32_ft ihevc_trans_32x32;
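+
+/* Illustrative usage note (an assumption about how the _ft typedefs are
+ * meant to be used; the pointer name is hypothetical): an architecture-
+ * specific implementation can be selected at run time through a function
+ * pointer of the matching type, e.g.
+ *
+ *     ihevc_trans_16x16_ft *pf_trans_16x16 = &ihevc_trans_16x16;
+ *     pf_trans_16x16(pi2_src, pi2_dst, src_strd, dst_strd, shift, zero_rows);
+ */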
+
+
+#endif /*_IHEVC_TRANS_H_*/
diff --git a/common/ihevc_trans_macros.h b/common/ihevc_trans_macros.h
new file mode 100644
index 0000000..079784d
--- /dev/null
+++ b/common/ihevc_trans_macros.h
@@ -0,0 +1,182 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_trans_macros.h
+*
+* @brief
+*  Macros used in the forward transform and inverse transform functions
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef IHEVC_TRANS_MACROS_H_
+#define IHEVC_TRANS_MACROS_H_
+
+#define QUANT(out, inp, quant_coeff, qp_div, log2_trans_size, q_add) \
+{                                                                                                                                                                \
+    LWORD64 tmp;                                                                                                                                                  \
+    WORD32 sign;                                                                                                                                                 \
+    WORD32 bit_depth,transform_shift;                                                                                                                            \
+    WORD32  q_bits, quant_multiplier;                                                                                                                            \
+                                                                                                                                                                 \
+    /* q_bits and q_add calculation*/                                                                                                                            \
+    /* To be moved outside in neon. To be computed once per transform call */                                                                                    \
+    bit_depth = 8;                                                                                                                                               \
+    transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size;                                                                                        \
+    quant_multiplier = 4 ; /* because quant_coeff are multiplied by 16. Instead of multiplying, we can reduce the division factor q_bits by 4 */                 \
+    q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT - quant_multiplier ;                                                                       \
+                                                                                                                                                                 \
+    sign = (inp)<0 ? -1:1;                                                                                                                                       \
+                                                                                                                                                                 \
+    tmp = (LWORD64)(abs(inp));                                                                                                                                    \
+    tmp = tmp * (quant_coeff);                                                                                                                                   \
+    tmp = tmp + (((LWORD64)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q));                                                                                            \
+    tmp = tmp >> q_bits;                                                                                                                                         \
+                                                                                                                                                                 \
+    tmp = tmp * sign;                                                                                                                                            \
+    out = (WORD16) CLIP_S16(tmp);                                                                                                                                \
+}
+
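+/*
+ * Net effect of QUANT (illustrative summary, not part of the original source):
+ *
+ *     out = clip_s16( sign(inp) * ((|inp| * quant_coeff + round) >> q_bits) )
+ *
+ * where round = q_add << (q_bits - QUANT_ROUND_FACTOR_Q). For example, for an
+ * 8x8 transform (log2_trans_size = 3) at bit_depth = 8, and assuming
+ * MAX_TR_DYNAMIC_RANGE is 15 as in the HM reference, transform_shift = 4.
+ */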
+#define QUANT_HBD(out, inp, quant_coeff, qp_div, log2_trans_size, q_add, bit_depth) \
+{                                                                                                                                                                \
+    LWORD64 tmp;                                                                                                                                                  \
+    WORD32 sign;                                                                                                                                                 \
+    WORD32 transform_shift;                                                                                                                                      \
+    WORD32  q_bits, quant_multiplier;                                                                                                                            \
+                                                                                                                                                                 \
+    /* q_bits and q_add calculation */                                                                                                                          \
+    /* To be moved outside in neon. To be computed once per transform call */                                                                                   \
+                                                                                                                                                                 \
+    transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size;                                                                                        \
+    quant_multiplier = 4; /* quant_coeff values are pre-multiplied by 16; instead of multiplying here, we reduce the shift q_bits by 4 */                       \
+    q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT - quant_multiplier ;                                                                       \
+                                                                                                                                                                 \
+    sign = (inp)<0 ? -1:1;                                                                                                                                       \
+                                                                                                                                                                 \
+    tmp = (LWORD64)(abs(inp));                                                                                                                                    \
+    tmp = tmp * (quant_coeff);                                                                                                                                   \
+    tmp = tmp + (((LWORD64)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q));                                                                                            \
+    tmp = tmp >> q_bits;                                                                                                                                         \
+                                                                                                                                                                 \
+    tmp = tmp * sign;                                                                                                                                            \
+    out = (WORD16) CLIP_S16(tmp);                                                                                                                                \
+}
+/* added by 100028 */
+#define QUANT_NO_WEIGHTMAT(out, inp, quant_coeff, qp_div, log2_trans_size, q_add) \
+{                                                                                                                                                                \
+    WORD32 tmp;                                                                                                                                                  \
+    WORD32 sign;                                                                                                                                                 \
+    WORD32 bit_depth,transform_shift;                                                                                                                            \
+    WORD32  q_bits, quant_multiplier;                                                                                                                            \
+                                                                                                                                                                 \
+    /* q_bits and q_add calculation */                                                                                                                          \
+    /* To be moved outside in neon. To be computed once per transform call */                                                                                   \
+    bit_depth = 8;                                                                                                                                               \
+    transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size;                                                                                        \
+    quant_multiplier = 4; /* quant_coeff values are pre-multiplied by 16; instead of multiplying here, we reduce the shift q_bits by 4 */                       \
+    q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT - quant_multiplier - FLAT_RESCALE_MAT_Q_SHIFT /* 2048 */;                                                                       \
+                                                                                                                                                                 \
+    sign = (inp)<0 ? -1:1;                                                                                                                                       \
+                                                                                                                                                                 \
+    tmp = (WORD32)(abs(inp));                                                                                                                                    \
+    tmp = tmp * (quant_coeff);                                                                                                                                   \
+    tmp = tmp + (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q));                                                                                            \
+    tmp = tmp >> q_bits;                                                                                                                                         \
+                                                                                                                                                                 \
+    tmp = tmp * sign;                                                                                                                                            \
+    out = (WORD16) CLIP_S16(tmp);                                                                                                                                \
+}
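+/*
+ * Note (editorial): the no-weight-matrix variants (this macro and the HBD one
+ * below) differ from QUANT in two ways: the flat scaling-matrix factor is
+ * folded into q_bits via FLAT_RESCALE_MAT_Q_SHIFT, and the intermediate
+ * product is held in WORD32 rather than LWORD64, presumably because the
+ * smaller effective multiplier keeps the product within 32-bit range.
+ */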
+
+#define QUANT_NO_WEIGHTMAT_HBD(out, inp, quant_coeff, qp_div, log2_trans_size, q_add, bit_depth) \
+{                                                                                                                                                                \
+    WORD32 tmp;                                                                                                                                                  \
+    WORD32 sign;                                                                                                                                                 \
+    WORD32 transform_shift;                                                                                                                                      \
+    WORD32  q_bits, quant_multiplier;                                                                                                                            \
+                                                                                                                                                                 \
+    /* q_bits and q_add calculation */                                                                                                                          \
+    /* To be moved outside in neon. To be computed once per transform call */                                                                                   \
+                                                                                                                                                                 \
+    transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size;                                                                                        \
+    quant_multiplier = 4; /* quant_coeff values are pre-multiplied by 16; instead of multiplying here, we reduce the shift q_bits by 4 */                       \
+    q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT - quant_multiplier - FLAT_RESCALE_MAT_Q_SHIFT /* 2048 */;                                                                       \
+                                                                                                                                                                 \
+    sign = (inp)<0 ? -1:1;                                                                                                                                       \
+                                                                                                                                                                 \
+    tmp = (WORD32)(abs(inp));                                                                                                                                    \
+    tmp = tmp * (quant_coeff);                                                                                                                                   \
+    tmp = tmp + (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q));                                                                                            \
+    tmp = tmp >> q_bits;                                                                                                                                         \
+                                                                                                                                                                 \
+    tmp = tmp * sign;                                                                                                                                            \
+    out = (WORD16) CLIP_S16(tmp);                                                                                                                                \
+}
+/* Reference inverse quantization: "pi2_src" (coefficients) will be clipped to 15 or 14 bits when (qp_div > shift_iq). The spec does not mention any such clip. */
+
+/* Inverse quantization for transform sizes other than 4x4 */
+/* No clipping of "pi2_src" (coefficients) is needed */
+#define IQUANT(res, coeff /*pi2_src[index*src_strd]*/, dequant_coeff /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */, shift_iq, qp_div)       \
+{                                                                                                                                              \
+    WORD32 tmp, add_iq;                                                                                                                        \
+                                                                                                                                               \
+    add_iq = SHL_NEG(1 , (shift_iq - qp_div - 1));  /* To be moved outside in neon. To be computed once per transform call */                  \
+                                                                                                                                               \
+    tmp = coeff * dequant_coeff ;                                                                                                              \
+    tmp = tmp + add_iq;                                                                                                                        \
+    tmp = SHR_NEG(tmp,(shift_iq - qp_div));                                                                                                    \
+                                                                                                                                               \
+    res = CLIP_S16(tmp);                                                                                                                       \
+}
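+/*
+ * Net effect of IQUANT (illustrative): res = clip_s16((coeff * dequant_coeff
+ * + add_iq) >> (shift_iq - qp_div)). SHL_NEG/SHR_NEG are assumed to be
+ * direction-aware shifts: when (shift_iq - qp_div) is negative the shift
+ * becomes a left shift and add_iq evaluates to 0, since no rounding is
+ * needed before a left shift.
+ */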
+
+/* 4x4 inverse quantization */
+/* Options: */
+/* 1. Clip "pi2_src" (coefficients) to 10 bits if (qp_div > shift_iq), or to 16 bits otherwise */
+/* 2. Increase the precision of "pi2_src" (coefficients) to 64 bits */
+/* The macro below implements option 1. */
+
+#define IQUANT_4x4(res, coeff /*pi2_src[index*src_strd]*/, dequant_coeff /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */, shift_iq, qp_div)   \
+{                                                                                                                                              \
+    WORD32 clip_coeff, tmp;                                                                                                                    \
+    WORD32 coeff_min,coeff_max;                                                                                                                \
+    WORD32 coeff_bit_range;                                                                                                                    \
+    WORD32 add_iq;                                                                                                                             \
+    add_iq = SHL_NEG(1 , (shift_iq - qp_div - 1));  /* To be moved outside in neon. To be computed once per transform call */                  \
+                                                                                                                                               \
+    coeff_bit_range = 16;                                                                                                                      \
+    if(qp_div > shift_iq)                                                                                                                      \
+        coeff_bit_range = 10;                                                                                                                  \
+                                                                                                                                               \
+    coeff_min = -(1<<(coeff_bit_range-1));                                                                                                     \
+    coeff_max = (1<<(coeff_bit_range-1)) - 1;                                                                                                  \
+                                                                                                                                               \
+    clip_coeff = CLIP3(coeff,coeff_min,coeff_max);                                                                                             \
+                                                                                                                                               \
+    tmp = clip_coeff * dequant_coeff ;                                                                                                         \
+    tmp = tmp + add_iq;                                                                                                                        \
+    tmp = SHR_NEG(tmp,(shift_iq - qp_div));                                                                                                    \
+                                                                                                                                               \
+    res = CLIP_S16(tmp);                                                                                                                       \
+}
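+/*
+ * Reasoning note (editorial): when qp_div > shift_iq, SHR_NEG above acts as a
+ * left shift, so the intermediate grows by (qp_div - shift_iq) bits beyond
+ * the coeff * dequant_coeff product; clipping the coefficient to 10 bits in
+ * that case keeps the computation within WORD32 range. Otherwise the full
+ * 16-bit coefficient is safe, matching option 1 above.
+ */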
+
+#endif /* IHEVC_TRANS_MACROS_H_ */
diff --git a/common/ihevc_trans_tables.c b/common/ihevc_trans_tables.c
new file mode 100644
index 0000000..139699a
--- /dev/null
+++ b/common/ihevc_trans_tables.c
@@ -0,0 +1,926 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_trans_tables.c
+*
+* @brief
+*  Contains tables used for forward and inverse  transform
+*
+* @author
+*  100470
+*
+* @par List of Tables:
+* g_ihevc_iquant_scales
+* g_ihevc_iquant_intr_scales
+* g_ihevc_quant_scales
+* g_ai4_ihevc_trans_4_ttype1
+* g_ai4_ihevc_trans_4_ttype0
+* g_ai2_ihevc_trans_dst_4
+* g_ai4_ihevc_trans_dst_intr_4
+* g_ai2_ihevc_trans_4
+* g_ai2_ihevc_trans_4_transpose
+* g_ai4_ihevc_trans_4_intr
+* g_ai2_ihevc_trans_4_intr
+* g_ai2_ihevc_trans_8
+* g_ai2_ihevc_trans_8_transpose
+* g_ai4_ihevc_trans_8_intr
+* g_ai2_ihevc_trans_8_intr
+* g_ai4_ihevc_trans_intr_even_8
+* g_ai4_ihevc_trans_intr_odd_8
+* g_ai2_ihevc_trans_16
+* g_ai2_ihevc_trans_16_transpose
+* g_ai2_ihevc_trans_32_intr_8
+* g_ai4_ihevc_trans_16_even
+* g_ai4_ihevc_trans_16_odd
+* g_ai2_ihevc_trans_32_transpose
+* g_ai2_ihevc_trans_32
+* g_ai2_ihevc_trans_32_intr_16
+* g_ai2_ihevc_trans_16_intr_odd
+* g_ai2_ihevc_trans_16_intr_even
+* g_ai2_ihevc_trans_32_intr_even
+* g_ai2_ihevc_trans_32_intr_odd
+* g_ai2_ihevc_trans_16_even_packed
+* g_ai2_ihevc_trans_32_intr_packed
+* g_ai2_ihevc_trans_32_intr_odd_packed
+* g_ai2_ihevc_trans_16_even
+* g_ai2_ihevc_trans_16_odd
+* g_ai2_ihevc_trans_intr_even_8
+* g_ai2_ihevc_trans_intr_odd_8
+* g_ai2_ihevc_trans_intr_4
+* IHEVCE_CHROMA_SHUFFLEMASK_HBD
+* g_ai4_ihevc_trans_8_intr_avx2
+* g_ai2_ihevc_trans_8_intr_avx2
+* g_ai2_ihevc_trans_32_intr_8_avx2
+* g_ai2_ihevc_trans_32_intr_16_avx2
+* g_ai2_ihevc_trans_16_intr_odd_avx2
+* g_ai2_ihevc_trans_16_intr_even_avx2
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_macros.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_defs.h"
+
+const WORD32 g_ihevc_iquant_scales[6] =
+{
+    40, 45, 51, 57, 64, 72
+};
+
+const WORD16 g_ihevc_iquant_intr_scales[6][8] =
+{
+    { 40, 40, 40, 40, 40, 40, 40, 40 },
+    { 45, 45, 45, 45, 45, 45, 45, 45 },
+    { 51, 51, 51, 51, 51, 51, 51, 51 },
+    { 57, 57, 57, 57, 57, 57, 57, 57 },
+    { 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 72, 72, 72, 72, 72, 72, 72, 72 }
+};
+
+const WORD32 g_ihevc_quant_scales[6] =
+{
+    26214, 23302, 20560, 18396, 16384, 14564
+};
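+/*
+ * Note: the forward and inverse scales are near-reciprocals; for every index
+ * i, g_ihevc_quant_scales[i] * g_ihevc_iquant_scales[i] is approximately
+ * 2^20 (exactly 2^20 at i = 4, where 16384 * 64 = 1 << 20).
+ */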
+
+//DST coeffs
+const WORD32 g_ai4_ihevc_trans_4_ttype1[3][4] =
+{
+    { 55, 55, 55, 55 },
+    { 29, 29, 29, 29 },
+    { 74, 74, 74, 74 }
+};
+
+//DCT coeffs
+const WORD32 g_ai4_ihevc_trans_4_ttype0[3][4] =
+{
+    { 36, 36, 36, 36 },
+    { 64, 64, 64, 64 },
+    { 83, 83, 83, 83 }
+};
+
+const WORD16 g_ai2_ihevc_trans_dst_4[4][4] =
+{
+    { 29, 55, 74, 84 },
+    { 74, 74, 0, -74 },
+    { 84, -29, -74, 55 },
+    { 55, -84, 74, -29 }
+};
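+/* The rows above form the 4-point integer DST that HEVC applies to 4x4 intra
+   luma residual blocks. */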
+
+const WORD32 g_ai4_ihevc_trans_dst_intr_4[3][4] =
+{ /* 4*32 = 128 bit */
+    { 29, 29, 29, 29 },
+    { 55, 55, 55, 55 },
+    { 74, 74, 74, 74 }
+};
+
+const WORD16 g_ai2_ihevc_trans_4[4][4] =
+{
+    { 64,  64,  64,  64 },
+    { 83,  36, -36, -83 },
+    { 64, -64, -64,  64 },
+    { 36, -83,  83, -36 }
+};
+
+const WORD16 g_ai2_ihevc_trans_4_transpose[4][4] =
+{
+    { 64,  83,  64,  36 },
+    { 64,  36, -64, -83 },
+    { 64, -36, -64,  83 },
+    { 64, -83,  64, -36 }
+};
+
+const WORD32 g_ai4_ihevc_trans_4_intr[3][4] =
+{ /* 4*32 = 128 bit */
+    { 64, 64, 64, 64 },
+    { 83, 83, 83, 83 },
+    { 36, 36, 36, 36 }
+};
+
+const WORD16 g_ai2_ihevc_trans_4_intr[8] = { 64, 64, 83, 36, 64, -64, 36, -83 };
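+/* Layout note (editorial): this packs the first two entries of each row of
+   g_ai2_ihevc_trans_4 ({64,64}, {83,36}, {64,-64}, {36,-83}), which appears
+   to suit SIMD pairwise multiply-accumulate. */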
+
+
+const WORD16 g_ai2_ihevc_trans_8[8][8] =
+{
+    { 64,  64,  64,  64,  64,  64,  64,  64 },
+    { 89,  75,  50,  18, -18, -50, -75, -89 },
+    { 83,  36, -36, -83, -83, -36,  36,  83 },
+    { 75, -18, -89, -50,  50,  89,  18, -75 },
+    { 64, -64, -64,  64,  64, -64, -64,  64 },
+    { 50, -89,  18,  75, -75, -18,  89, -50 },
+    { 36, -83,  83, -36, -36,  83, -83,  36 },
+    { 18, -50,  75, -89,  89, -75,  50, -18 }
+};
+
+/* Used by itrans_recon_8x8 */
+const WORD16 g_ai2_ihevc_trans_8_transpose[8][8] =
+{
+    { 64,  89,  83,  75,  64,  50,  36,  18 },
+    { 64,  75,  36, -18, -64, -89, -83, -50 },
+    { 64,  50, -36, -89, -64,  18,  83,  75 },
+    { 64,  18, -83, -50,  64,  75, -36, -89 },
+    { 64, -18, -83,  50,  64, -75, -36,  89 },
+    { 64, -50, -36,  89, -64, -18,  83, -75 },
+    { 64, -75,  36,  18, -64,  89, -83,  50 },
+    { 64, -89,  83, -75,  64, -50,  36, -18 }
+};
+
+const WORD32 g_ai4_ihevc_trans_8_intr[7][4] =
+{ /* 4*32 = 128 bit */
+    { 64, 64, 64, 64 },
+    { 83, 83, 83, 83 },
+    { 36, 36, 36, 36 },
+    { 75, 75, 75, 75 },
+    { 18, 18, 18, 18 },
+    { 89, 89, 89, 89 },
+    { 50, 50, 50, 50 },
+};
+
+
+const WORD16 g_ai2_ihevc_trans_8_intr[8][8] =
+{ /* 8*16 = 128 bit */
+    { 64,  64,  64,  64,  64,  64,  64,  64 },
+    { 89,  75,  18,  50,  89,  75,  18,  50 },
+    { 83,  36,  83,  36,  83,  36,  83,  36 },
+    { 75, -18, -50, -89,  75, -18, -50, -89 },
+    { 64, -64,  64, -64,  64, -64,  64, -64 },
+    { 50, -89,  75,  18,  50, -89,  75,  18 },
+    { 36, -83,  36, -83,  36, -83,  36, -83 },
+    { 18, -50, -89,  75,  18, -50, -89,  75 }
+};
+
+
+const WORD32 g_ai4_ihevc_trans_intr_even_8[3][4] =
+{
+    { 64, 64, 64, 64 },
+    { 83, 83, 83, 83 },
+    { 36, 36, 36, 36 },
+};
+
+const WORD32 g_ai4_ihevc_trans_intr_odd_8[4][4] =
+{
+    { 89, 89, 89, 89 },
+    { 75, 75, 75, 75 },
+    { 50, 50, 50, 50 },
+    { 18, 18, 18, 18 }
+};
+
+const WORD16 g_ai2_ihevc_trans_16[16][16] =
+{
+    { 64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64 },
+    { 90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90 },
+    { 89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89 },
+    { 87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87 },
+    { 83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83 },
+    { 80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80 },
+    { 75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75 },
+    { 70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70 },
+    { 64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64 },
+    { 57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57 },
+    { 50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50 },
+    { 43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43 },
+    { 36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36 },
+    { 25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25 },
+    { 18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18 },
+    {  9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9 }
+};
+
+const WORD16 g_ai2_ihevc_trans_16_transpose[1][16] =
+{
+    { 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9 }
+};
+
+const WORD32 g_ai2_ihevc_trans_32_intr_8[8][4] =
+{ /* 4*32 = 128 bit */
+    { 90, 90, 90, 90 },
+    { 87, 87, 87, 87 },
+    { 80, 80, 80, 80 },
+    { 70, 70, 70, 70 },
+    { 57, 57, 57, 57 },
+    { 43, 43, 43, 43 },
+    { 25, 25, 25, 25 },
+    {  9,  9,  9,  9 }
+};
+
+const WORD32 g_ai4_ihevc_trans_16_even[7][4] =
+{
+    { 64, 64, 64, 64 },
+    { 89, 89, 89, 89 },
+    { 75, 75, 75, 75 },
+    { 83, 83, 83, 83 },
+    { 36, 36, 36, 36 },
+    { 18, 18, 18, 18 },
+    { 50, 50, 50, 50 },
+};
+
+const WORD32 g_ai4_ihevc_trans_16_odd[8][4] =
+{
+    { 90, 90, 90, 90 },
+    { 87, 87, 87, 87 },
+    { 80, 80, 80, 80 },
+    { 70, 70, 70, 70 },
+    { 57, 57, 57, 57 },
+    { 43, 43, 43, 43 },
+    { 25, 25, 25, 25 },
+    { 9,  9,  9,  9  }
+};
+
+const WORD16 g_ai2_ihevc_trans_32_transpose[1][32] =
+{
+    { 64, 90, 90, 90, 89, 88, 87, 85, 83, 82, 80, 78, 75, 73, 70, 67, 64, 61, 57, 54, 50, 46, 43, 38, 36, 31, 25, 22, 18, 13, 9, 4 }
+};
+const WORD16 g_ai2_ihevc_trans_32[32][32] =
+{
+    { 64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64 },
+    { 90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4,  -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
+    { 90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90, -90, -87, -80, -70, -57, -43, -25,  -9,   9,  25,  43,  57,  70,  80,  87,  90 },
+    { 90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,  13,  38,  61,  78,  88,  90,  85,  73,  54,  31,   4, -22, -46, -67, -82, -90 },
+    { 89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89,  89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89 },
+    { 88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22, -22, -61, -85, -90, -73, -38,   4,  46,  78,  90,  82,  54,  13, -31, -67, -88 },
+    { 87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87, -87, -57,  -9,  43,  80,  90,  70,  25, -25, -70, -90, -80, -43,   9,  57,  87 },
+    { 85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31,  31,  78,  90,  61,   4, -54, -88, -82, -38,  22,  73,  90,  67,  13, -46, -85 },
+    { 83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83 },
+    { 82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38, -38, -88, -73,  -4,  67,  90,  46, -31, -85, -78, -13,  61,  90,  54, -22, -82 },
+    { 80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80, -80,  -9,  70,  87,  25, -57, -90, -43,  43,  90,  57, -25, -87, -70,   9,  80 },
+    { 78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46,  46,  90,  38, -54, -90, -31,  61,  88,  22, -67, -85, -13,  73,  82,   4, -78 },
+    { 75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75,  75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75 },
+    { 73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54, -54, -85,   4,  88,  46, -61, -82,  13,  90,  38, -67, -78,  22,  90,  31, -73 },
+    { 70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70, -70,  43,  87,  -9, -90, -25,  80,  57, -57, -80,  25,  90,   9, -87, -43,  70 },
+    { 67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61,  61,  73, -46, -82,  31,  88, -13, -90,  -4,  90,  22, -85, -38,  78,  54, -67 },
+    { 64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64 },
+    { 61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67, -67, -54,  78,  38, -85, -22,  90,   4, -90,  13,  88, -31, -82,  46,  73, -61 },
+    { 57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57, -57,  80,  25, -90,   9,  87, -43, -70,  70,  43, -87,  -9,  90, -25, -80,  57 },
+    { 54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73,  73,  31, -90,  22,  78, -67, -38,  90, -13, -82,  61,  46, -88,   4,  85, -54 },
+    { 50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50,  50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50 },
+    { 46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78, -78,  -4,  82, -73, -13,  85, -67, -22,  88, -61, -31,  90, -54, -38,  90, -46 },
+    { 43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43, -43,  90, -57, -25,  87, -70,  -9,  80, -80,   9,  70, -87,  25,  57, -90,  43 },
+    { 38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82,  82, -22, -54,  90, -61, -13,  78, -85,  31,  46, -90,  67,   4, -73,  88, -38 },
+    { 36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36 },
+    { 31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85, -85,  46,  13, -67,  90, -73,  22,  38, -82,  88, -54,  -4,  61, -90,  78, -31 },
+    { 25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25, -25,  70, -90,  80, -43,  -9,  57, -87,  87, -57,   9,  43, -80,  90, -70,  25 },
+    { 22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88,  88, -67,  31,  13, -54,  82, -90,  78, -46,   4,  38, -73,  90, -85,  61, -22 },
+    { 18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18,  18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18 },
+    { 13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90, -90,  82, -67,  46, -22,  -4,  31, -54,  73, -85,  90, -88,  78, -61,  38, -13 },
+    {  9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9,  -9,  25, -43,  57, -70,  80, -87,  90, -90,  87, -80,  70, -57,  43, -25,   9 },
+    {  4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90,  90, -90,  88, -85,  82, -78,  73, -67,  61, -54,  46, -38,  31, -22,  13,  -4 }
+};
+
+
+
+const WORD32 g_ai2_ihevc_trans_32_intr_16[15][4] =
+{ /* 4*32 = 128 bit */
+    { 90, 90, 90, 90 },
+    { 88, 88, 88, 88 },
+    { 85, 85, 85, 85 },
+    { 82, 82, 82, 82 },
+    { 78, 78, 78, 78 },
+    { 73, 73, 73, 73 },
+    { 67, 67, 67, 67 },
+    { 61, 61, 61, 61 },
+    { 54, 54, 54, 54 },
+    { 46, 46, 46, 46 },
+    { 38, 38, 38, 38 },
+    { 31, 31, 31, 31 },
+    { 22, 22, 22, 22 },
+    { 13, 13, 13, 13 },
+    { 4,  4,  4,  4  }
+};
+
+const WORD16 g_ai2_ihevc_trans_16_intr_odd[32][8] =
+{
+    {  90,  87,  90,  87,  90,  87,  90,  87 },
+    {  70,  80,  70,  80,  70,  80,  70,  80 },
+    {  57,  43,  57,  43,  57,  43,  57,  43 },
+    {   9,  25,   9,  25,   9,  25,   9,  25 },
+    {  87,  57,  87,  57,  87,  57,  87,  57 },
+    { -43,   9, -43,   9, -43,   9, -43,   9 },
+    { -80, -90, -80, -90, -80, -90, -80, -90 },
+    { -25, -70, -25, -70, -25, -70, -25, -70 },
+    {  80,   9,  80,   9,  80,   9,  80,   9 },
+    { -87, -70, -87, -70, -87, -70, -87, -70 },
+    { -25,  57, -25,  57, -25,  57, -25,  57 },
+    {  43,  90,  43,  90,  43,  90,  43,  90 },
+    {  70, -43,  70, -43,  70, -43,  70, -43 },
+    {   9, -87,   9, -87,   9, -87,   9, -87 },
+    {  90,  25,  90,  25,  90,  25,  90,  25 },
+    { -57, -80, -57, -80, -57, -80, -57, -80 },
+    {  57, -80,  57, -80,  57, -80,  57, -80 },
+    {  90, -25,  90, -25,  90, -25,  90, -25 },
+    {  -9, -87,  -9, -87,  -9, -87,  -9, -87 },
+    {  70,  43,  70,  43,  70,  43,  70,  43 },
+    {  43, -90,  43, -90,  43, -90,  43, -90 },
+    {  25,  57,  25,  57,  25,  57,  25,  57 },
+    { -87,  70, -87,  70, -87,  70, -87,  70 },
+    { -80,   9, -80,   9, -80,   9, -80,   9 },
+    {  25, -70,  25, -70,  25, -70,  25, -70 },
+    { -80,  90, -80,  90, -80,  90, -80,  90 },
+    {  43,   9,  43,   9,  43,   9,  43,   9 },
+    {  87, -57,  87, -57,  87, -57,  87, -57 },
+    {   9, -25,   9, -25,   9, -25,   9, -25 },
+    { -57,  43, -57,  43, -57,  43, -57,  43 },
+    {  70, -80,  70, -80,  70, -80,  70, -80 },
+    { -90,  87, -90,  87, -90,  87, -90,  87 }
+};
+
+const WORD16 g_ai2_ihevc_trans_16_intr_even[12][8] =
+{
+    {  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  89,  75,  89,  75,  89,  75,  89,  75 },
+    {  18,  50,  18,  50,  18,  50,  18,  50 },
+    {  83,  36,  83,  36,  83,  36,  83,  36 },
+    {  75, -18,  75, -18,  75, -18,  75, -18 },
+    { -50, -89, -50, -89, -50, -89, -50, -89 },
+    {  64, -64,  64, -64,  64, -64,  64, -64 },
+    {  50, -89,  50, -89,  50, -89,  50, -89 },
+    {  75,  18,  75,  18,  75,  18,  75,  18 },
+    {  36, -83,  36, -83,  36, -83,  36, -83 },
+    {  18, -50,  18, -50,  18, -50,  18, -50 },
+    { -89,  75, -89,  75, -89,  75, -89,  75 }
+};
+
+
+const WORD16 g_ai2_ihevc_trans_32_intr_even[22][8] =
+{
+    {  64,  64,  64,  64,  83,  36,  83,  36 },
+    {  64, -64,  64, -64,  36, -83,  36, -83 },
+    {  89,  18,  89,  18,  75,  50,  75,  50 },
+    {  75, -50,  75, -50, -18, -89, -18, -89 },
+    {  50,  75,  50,  75, -89,  18, -89,  18 },
+    {  18, -89,  18, -89, -50,  75, -50,  75 },
+
+    {  90,  70,  90,  70,  87,  80,  87,  80 },
+    {   9,  57,   9,  57,  25,  43,  25,  43 },
+    {  87, -43,  87, -43,  57,   9,  57,   9 },
+    { -25, -80, -25, -80, -70, -90, -70, -90 },
+    {  80, -87,  80, -87,   9, -70,   9, -70 },
+    {  43, -25,  43, -25,  90,  57,  90,  57 },
+    {  70,   9,  70,   9, -43, -87, -43, -87 },
+    { -57,  90, -57,  90, -80,  25, -80,  25 },
+    {  57,  90,  57,  90, -80, -25, -80, -25 },
+    {  70,  -9,  70,  -9,  43, -87,  43, -87 },
+    {  43,  25,  43,  25, -90,  57, -90,  57 },
+    { -80, -87, -80, -87,   9,  70,   9,  70 },
+    {  25, -80,  25, -80, -70,  90, -70,  90 },
+    {  87,  43,  87,  43, -57,   9, -57,   9 },
+    {   9, -57,   9, -57, -25,  43, -25,  43 },
+    { -90,  70, -90,  70,  87, -80,  87, -80 }
+};
+
+
+const WORD16 g_ai2_ihevc_trans_32_intr_odd[32][16] =
+{
+    {  90,  85,  90,  85,  90,  88,  90,  88,  61,  82,  61,  82, -73, -46, -73, -46 },
+    {  67,  82,  67,  82,  73,  78,  73,  78,  90,  31,  90,  31, -13, -88, -13, -88 },
+    {  61,  38,  61,  38,  54,  46,  54,  46,  -4,  85,  -4,  85, -90,  22, -90,  22 },
+    {   4,  31,   4,  31,  13,  22,  13,  22,  67, -38,  67, -38,  54, -78,  54, -78 },
+
+    {  90,  46,  90,  46,  82,  67,  82,  67,  54,  88,  54,  88, -85,  -4, -85,  -4 },
+    { -54,  22, -54,  22, -31,  -4, -31,  -4,  13, -46,  13, -46,  82, -61,  82, -61 },
+    { -73, -88, -73, -88, -85, -90, -85, -90, -90, -78, -90, -78,  38,  67,  38,  67 },
+    { -13, -78, -13, -78, -38, -61, -38, -61, -73, -22, -73, -22, -31,  90, -31,  90 },
+
+    {  88, -13,  88, -13,  67,  31,  67,  31,  46,  54,  46,  54, -90,  38, -90,  38 },
+    { -78, -54, -78, -54, -90, -82, -90, -82, -88, -90, -88, -90,  61,  31,  61,  31 },
+    { -46,  73, -46,  73,  -4,  38,  -4,  38,  22,  13,  22,  13,  67, -85,  67, -85 },
+    {  22,  90,  22,  90,  61,  85,  61,  85,  78,  73,  78,  73,   4, -82,   4, -82 },
+
+    {  85, -67,  85, -67,  46, -13,  46, -13,  38,  -4,  38,  -4, -88,  73, -88,  73 },
+    {  38, -90,  38, -90, -22, -73, -22, -73, -31, -67, -31, -67, -46,  90, -46,  90 },
+    {  82,  -4,  82,  -4,  88,  54,  88,  54,  85,  61,  85,  61, -78,  13, -78,  13 },
+    { -31, -61, -31, -61, -78, -90, -78, -90, -82, -90, -82, -90,  22,  54,  22,  54 },
+
+    {  82, -90,  82, -90,  22, -54,  22, -54,  31, -61,  31, -61, -78,  90, -78,  90 },
+    {  85, -61,  85, -61,  78,  13,  78,  13,  82,   4,  82,   4, -88,  54, -88,  54 },
+    {  31, -67,  31, -67, -46, -90, -46, -90, -38, -90, -38, -90, -22,  73, -22,  73 },
+    {  38,   4,  38,   4,  88,  73,  88,  73,  85,  67,  85,  67, -46, -13, -46, -13 },
+
+    {  78, -73,  78, -73,  -4, -82,  -4, -82,  22, -90,  22, -90, -61,  85, -61,  85 },
+    { -22,  13, -22,  13,  67,  85,  67,  85,  46,  73,  46,  73,  -4, -38,  -4, -38 },
+    { -88,  90, -88,  90, -61,  31, -61,  31, -78,  54, -78,  54,  90, -82,  90, -82 },
+    { -46,  54, -46,  54, -90, -38, -90, -38, -88, -13, -88, -13,  67, -31,  67, -31 },
+
+    {  73, -22,  73, -22, -31, -90, -31, -90,  13, -78,  13, -78, -38,  61, -38,  61 },
+    { -90,  78, -90,  78, -38,  67, -38,  67, -73,  88, -73,  88,  85, -90,  85, -90 },
+    { -13, -46, -13, -46,  82,  61,  82,  61,  54,  22,  54,  22, -31,   4, -31,   4 },
+    {  54, -88,  54, -88,  85,  -4,  85,  -4,  90, -46,  90, -46, -82,  67, -82,  67 },
+
+    {  67,  38,  67,  38, -54, -78, -54, -78,   4, -31,   4, -31, -13,  22, -13,  22 },
+    {   4,  85,   4,  85, -90, -22, -90, -22, -61,  38, -61,  38,  54, -46,  54, -46 },
+    {  90, -31,  90, -31,  13, -88,  13, -88,  67, -82,  67, -82, -73,  78, -73,  78 },
+    { -61,  82, -61,  82, -73,  46, -73,  46, -90,  85, -90,  85,  90, -88,  90, -88 }
+
+};
+
+
+/* Tables for itrans_recon functions */
+const WORD16 g_ai2_ihevc_trans_16_even_packed[12][8] =
+{
+    {  83,  36,  83,  36,  83,  36,  83,  36 },
+    {  36, -83,  36, -83,  36, -83,  36, -83 },
+    {  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  64, -64,  64, -64,  64, -64,  64, -64 },
+    {  89,  75,  89,  75,  89,  75,  89,  75 },
+    {  50,  18,  50,  18,  50,  18,  50,  18 },
+    {  75, -18,  75, -18,  75, -18,  75, -18 },
+    {  89,  50,  89,  50,  89,  50,  89,  50 },
+    {  50, -89,  50, -89,  50, -89,  50, -89 },
+    {  18,  75,  18,  75,  18,  75,  18,  75 },
+    {  18, -50,  18, -50,  18, -50,  18, -50 },
+    {  75, -89,  75, -89,  75, -89,  75, -89 }
+};
+
+const WORD16 g_ai2_ihevc_trans_32_intr_packed[32][8] =
+{
+    {  90,  87,  90,  87,  90,  87,  90,  87 },
+    {  80,  70,  80,  70,  80,  70,  80,  70 },
+    {  57,  43,  57,  43,  57,  43,  57,  43 },
+    {  25,   9,  25,   9,  25,   9,  25,   9 },
+    {  87,  57,  87,  57,  87,  57,  87,  57 },
+    {   9, -43,   9, -43,   9, -43,   9, -43 },
+    {  80,  90,  80,  90,  80,  90,  80,  90 },
+    {  70,  25,  70,  25,  70,  25,  70,  25 },
+    {  80,   9,  80,   9,  80,   9,  80,   9 },
+    {  70,  87,  70,  87,  70,  87,  70,  87 },
+    { -25,  57, -25,  57, -25,  57, -25,  57 },
+    {  90,  43,  90,  43,  90,  43,  90,  43 },
+    {  70, -43,  70, -43,  70, -43,  70, -43 },
+    { -87,   9, -87,   9, -87,   9, -87,   9 },
+    {  90,  25,  90,  25,  90,  25,  90,  25 },
+    {  80,  57,  80,  57,  80,  57,  80,  57 },
+    {  57, -80,  57, -80,  57, -80,  57, -80 },
+    { -25,  90, -25,  90, -25,  90, -25,  90 },
+    {   9,  87,   9,  87,   9,  87,   9,  87 },
+    {  43,  70,  43,  70,  43,  70,  43,  70 },
+    {  43, -90,  43, -90,  43, -90,  43, -90 },
+    {  57,  25,  57,  25,  57,  25,  57,  25 },
+    { -87,  70, -87,  70, -87,  70, -87,  70 },
+    {   9, -80,   9, -80,   9, -80,   9, -80 },
+    {  25, -70,  25, -70,  25, -70,  25, -70 },
+    {  90, -80,  90, -80,  90, -80,  90, -80 },
+    {  43,   9,  43,   9,  43,   9,  43,   9 },
+    { -57,  87, -57,  87, -57,  87, -57,  87 },
+    {   9, -25,   9, -25,   9, -25,   9, -25 },
+    {  43, -57,  43, -57,  43, -57,  43, -57 },
+    {  70, -80,  70, -80,  70, -80,  70, -80 },
+    {  87, -90,  87, -90,  87, -90,  87, -90 }
+};
+
+const WORD16 g_ai2_ihevc_trans_32_intr_odd_packed[128][8] =
+{
+    /*o0*/
+    {  90,  90,  90,  90,  90,  90,  90,  90 },
+    {  88,  85,  88,  85,  88,  85,  88,  85 },
+    {  82,  78,  82,  78,  82,  78,  82,  78 },
+    {  73,  67,  73,  67,  73,  67,  73,  67 },
+    {  61,  54,  61,  54,  61,  54,  61,  54 },
+    {  46,  38,  46,  38,  46,  38,  46,  38 },
+    {  31,  22,  31,  22,  31,  22,  31,  22 },
+    {  13,   4,  13,   4,  13,   4,  13,   4 },
+
+    /*o1*/
+
+    {  90,  82,  90,  82,  90,  82,  90,  82 },
+    {  67,  46,  67,  46,  67,  46,  67,  46 },
+    { -22,   4, -22,   4, -22,   4, -22,   4 },
+    {  31,  54,  31,  54,  31,  54,  31,  54 },
+    {  73,  85,  73,  85,  73,  85,  73,  85 },
+    {  90,  88,  90,  88,  90,  88,  90,  88 },
+    {  78,  61,  78,  61,  78,  61,  78,  61 },
+    {  38,  13,  38,  13,  38,  13,  38,  13 },
+
+    /*o2*/
+    {  88,  67,  88,  67,  88,  67,  88,  67 },
+    { -31,  13, -31,  13, -31,  13, -31,  13 },
+    {  54,  82,  54,  82,  54,  82,  54,  82 },
+    {  90,  78,  90,  78,  90,  78,  90,  78 },
+    {  46,   4,  46,   4,  46,   4,  46,   4 },
+    {  38,  73,  38,  73,  38,  73,  38,  73 },
+    {  90,  85,  90,  85,  90,  85,  90,  85 },
+    {  61,  22,  61,  22,  61,  22,  61,  22 },
+
+    /*o3*/
+    {  85,  46,  85,  46,  85,  46,  85,  46 },
+    {  13,  67,  13,  67,  13,  67,  13,  67 },
+    {  90,  73,  90,  73,  90,  73,  90,  73 },
+    {  22, -38,  22, -38,  22, -38,  22, -38 },
+    {  82,  88,  82,  88,  82,  88,  82,  88 },
+    { -54,   4, -54,   4, -54,   4, -54,   4 },
+    {  61,  90,  61,  90,  61,  90,  61,  90 },
+    {  78,  31,  78,  31,  78,  31,  78,  31 },
+
+    /*o4*/
+    { -82, -22, -82, -22, -82, -22, -82, -22 },
+    {  54,  90,  54,  90,  54,  90,  54,  90 },
+    {  61, -13,  61, -13,  61, -13,  61, -13 },
+    { -78, -85, -78, -85, -78, -85, -78, -85 },
+    { -31,  46, -31,  46, -31,  46, -31,  46 },
+    {  90,  67,  90,  67,  90,  67,  90,  67 },
+    {  -4, -73,  -4, -73,  -4, -73,  -4, -73 },
+    { -88, -38, -88, -38, -88, -38, -88, -38 },
+
+    /*o5*/
+    { -78,   4, -78,   4, -78,   4, -78,   4 },
+    {  82,  73,  82,  73,  82,  73,  82,  73 },
+    { -13, -85, -13, -85, -13, -85, -13, -85 },
+    { -67,  22, -67,  22, -67,  22, -67,  22 },
+    {  88,  61,  88,  61,  88,  61,  88,  61 },
+    { -31, -90, -31, -90, -31, -90, -31, -90 },
+    { -54,  38, -54,  38, -54,  38, -54,  38 },
+    {  90,  46,  90,  46,  90,  46,  90,  46 },
+
+    /*o6*/
+    { -73,  31, -73,  31, -73,  31, -73,  31 },
+    {  90,  22,  90,  22,  90,  22,  90,  22 },
+    { -78, -67, -78, -67, -78, -67, -78, -67 },
+    {  38,  90,  38,  90,  38,  90,  38,  90 },
+    {  13, -82,  13, -82,  13, -82,  13, -82 },
+    { -61,  46, -61,  46, -61,  46, -61,  46 },
+    {  88,   4,  88,   4,  88,   4,  88,   4 },
+    { -85, -54, -85, -54, -85, -54, -85, -54 },
+
+    /*o7*/
+    { -67,  54, -67,  54, -67,  54, -67,  54 },
+    {  78, -38,  78, -38,  78, -38,  78, -38 },
+    { -85,  22, -85,  22, -85,  22, -85,  22 },
+    {  90,  -4,  90,  -4,  90,  -4,  90,  -4 },
+    { -90, -13, -90, -13, -90, -13, -90, -13 },
+    {  88,  31,  88,  31,  88,  31,  88,  31 },
+    { -82, -46, -82, -46, -82, -46, -82, -46 },
+    {  73,  61,  73,  61,  73,  61,  73,  61 },
+
+    /*o8*/
+    { -61,  73, -61,  73, -61,  73, -61,  73 },
+    {  46, -82,  46, -82,  46, -82,  46, -82 },
+    { -31,  88, -31,  88, -31,  88, -31,  88 },
+    {  13, -90,  13, -90,  13, -90,  13, -90 },
+    {   4,  90,   4,  90,   4,  90,   4,  90 },
+    { -22, -85, -22, -85, -22, -85, -22, -85 },
+    {  38,  78,  38,  78,  38,  78,  38,  78 },
+    { -54, -67, -54, -67, -54, -67, -54, -67 },
+
+    /*o9*/
+    { -54,  85, -54,  85, -54,  85, -54,  85 },
+    {   4, -88,   4, -88,   4, -88,   4, -88 },
+    {  46,  61,  46,  61,  46,  61,  46,  61 },
+    { -82, -13, -82, -13, -82, -13, -82, -13 },
+    {  90, -38,  90, -38,  90, -38,  90, -38 },
+    { -67,  78, -67,  78, -67,  78, -67,  78 },
+    {  22, -90,  22, -90,  22, -90,  22, -90 },
+    {  31,  73,  31,  73,  31,  73,  31,  73 },
+
+    /*o10*/
+    { -46,  90, -46,  90, -46,  90, -46,  90 },
+    { -38, -54, -38, -54, -38, -54, -38, -54 },
+    {  90, -31,  90, -31,  90, -31,  90, -31 },
+    { -61,  88, -61,  88, -61,  88, -61,  88 },
+    { -22, -67, -22, -67, -22, -67, -22, -67 },
+    {  85, -13,  85, -13,  85, -13,  85, -13 },
+    { -73,  82, -73,  82, -73,  82, -73,  82 },
+    {  -4, -78,  -4, -78,  -4, -78,  -4, -78 },
+
+    /*o11*/
+    { -38,  88, -38,  88, -38,  88, -38,  88 },
+    { -73,   4, -73,   4, -73,   4, -73,   4 },
+    {  67, -90,  67, -90,  67, -90,  67, -90 },
+    {  46,  31,  46,  31,  46,  31,  46,  31 },
+    { -85,  78, -85,  78, -85,  78, -85,  78 },
+    { -13, -61, -13, -61, -13, -61, -13, -61 },
+    {  90, -54,  90, -54,  90, -54,  90, -54 },
+    { -22,  82, -22,  82, -22,  82, -22,  82 },
+
+
+    /*o12*/
+    { -31,  78, -31,  78, -31,  78, -31,  78 },
+    { -90,  61, -90,  61, -90,  61, -90,  61 },
+    {  -4, -54,  -4, -54,  -4, -54,  -4, -54 },
+    {  88, -82,  88, -82,  88, -82,  88, -82 },
+    {  38,  22,  38,  22,  38,  22,  38,  22 },
+    { -73,  90, -73,  90, -73,  90, -73,  90 },
+    { -67,  13, -67,  13, -67,  13, -67,  13 },
+    {  46, -85,  46, -85,  46, -85,  46, -85 },
+
+    /*o13*/
+    { -22,  61, -22,  61, -22,  61, -22,  61 },
+    { -85,  90, -85,  90, -85,  90, -85,  90 },
+    { -73,  38, -73,  38, -73,  38, -73,  38 },
+    {   4, -46,   4, -46,   4, -46,   4, -46 },
+    {  78, -90,  78, -90,  78, -90,  78, -90 },
+    {  82, -54,  82, -54,  82, -54,  82, -54 },
+    {  13,  31,  13,  31,  13,  31,  13,  31 },
+    { -67,  88, -67,  88, -67,  88, -67,  88 },
+
+    /*o14*/
+    { -13,  38, -13,  38, -13,  38, -13,  38 },
+    { -61,  78, -61,  78, -61,  78, -61,  78 },
+    { -88,  90, -88,  90, -88,  90, -88,  90 },
+    { -85,  73, -85,  73, -85,  73, -85,  73 },
+    { -54,  31, -54,  31, -54,  31, -54,  31 },
+    {  -4, -22,  -4, -22,  -4, -22,  -4, -22 },
+    {  46, -67,  46, -67,  46, -67,  46, -67 },
+    {  82, -90,  82, -90,  82, -90,  82, -90 },
+
+    /*o15*/
+    {  -4,  13,  -4,  13,  -4,  13,  -4,  13 },
+    { -22,  31, -22,  31, -22,  31, -22,  31 },
+    { -38,  46, -38,  46, -38,  46, -38,  46 },
+    { -54,  61, -54,  61, -54,  61, -54,  61 },
+    { -67,  73, -67,  73, -67,  73, -67,  73 },
+    { -78,  82, -78,  82, -78,  82, -78,  82 },
+    { -85,  88, -85,  88, -85,  88, -85,  88 },
+    { -90,  90, -90,  90, -90,  90, -90,  90 },
+
+};
+const WORD16 g_ai2_ihevc_trans_16_even[12][8] =
+{
+    {  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  64, -64,  64, -64,  64, -64,  64, -64 },
+    {  89,  75,  89,  75,  89,  75,  89,  75 },
+    {  75, -18,  75, -18,  75, -18,  75, -18 },
+    {  50,  18,  50,  18,  50,  18,  50,  18 },
+    {  89,  50,  89,  50,  89,  50,  89,  50 },
+    {  83,  36,  83,  36,  83,  36,  83,  36 },
+    {  36, -83,  36, -83,  36, -83,  36, -83 },
+    {  50, -89,  50, -89,  50, -89,  50, -89 },
+    {  18, -50,  18, -50,  18, -50,  18, -50 },
+    {  18,  75,  18,  75,  18,  75,  18,  75 },
+    {  75, -89,  75, -89,  75, -89,  75, -89 },
+};
+const WORD16 g_ai2_ihevc_trans_16_odd[32][8] =
+{
+    {  90,  87,  90,  87,  90,  87,  90,  87 },
+    {  80,  70,  80,  70,  80,  70,  80,  70 },
+    {  57,  43,  57,  43,  57,  43,  57,  43 },
+    {  25,   9,  25,   9,  25,   9,  25,   9 },
+    {  87,  57,  87,  57,  87,  57,  87,  57 },
+    {   9, -43,   9, -43,   9, -43,   9, -43 },
+    {  80,  90,  80,  90,  80,  90,  80,  90 },
+    {  70,  25,  70,  25,  70,  25,  70,  25 },
+    {  80,   9,  80,   9,  80,   9,  80,   9 },
+    {  70,  87,  70,  87,  70,  87,  70,  87 },
+    {  25, -57,  25, -57,  25, -57,  25, -57 },
+    {  90,  43,  90,  43,  90,  43,  90,  43 },
+    {  70, -43,  70, -43,  70, -43,  70, -43 },
+    {  87,  -9,  87,  -9,  87,  -9,  87,  -9 },
+    {  90,  25,  90,  25,  90,  25,  90,  25 },
+    {  80,  57,  80,  57,  80,  57,  80,  57 },
+    {  57, -80,  57, -80,  57, -80,  57, -80 },
+    {  25, -90,  25, -90,  25, -90,  25, -90 },
+    {   9,  87,   9,  87,   9,  87,   9,  87 },
+    {  43,  70,  43,  70,  43,  70,  43,  70 },
+    {  43, -90,  43, -90,  43, -90,  43, -90 },
+    {  57,  25,  57,  25,  57,  25,  57,  25 },
+    {  87, -70,  87, -70,  87, -70,  87, -70 },
+    {   9, -80,   9, -80,   9, -80,   9, -80 },
+    {  25, -70,  25, -70,  25, -70,  25, -70 },
+    {  90, -80,  90, -80,  90, -80,  90, -80 },
+    {  43,   9,  43,   9,  43,   9,  43,   9 },
+    {  57, -87,  57, -87,  57, -87,  57, -87 },
+    {   9, -25,   9, -25,   9, -25,   9, -25 },
+    {  43, -57,  43, -57,  43, -57,  43, -57 },
+    {  70, -80,  70, -80,  70, -80,  70, -80 },
+    {  87, -90,  87, -90,  87, -90,  87, -90 },
+};
+const WORD16 g_ai2_ihevc_trans_intr_even_8[4][8] =
+{
+    {  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  36, -83,  36, -83,  36, -83,  36, -83 },
+    {  83,  36,  83,  36,  83,  36,  83,  36 },
+    {  64, -64,  64, -64,  64, -64,  64, -64 }
+};
+const WORD16 g_ai2_ihevc_trans_intr_odd_8[8][8] =
+{
+    {  89,  75,  89,  75,  89,  75,  89,  75 },
+    {  50,  18,  50,  18,  50,  18,  50,  18 },
+    {  75, -18,  75, -18,  75, -18,  75, -18 },
+    {  89,  50,  89,  50,  89,  50,  89,  50 },
+    {  50, -89,  50, -89,  50, -89,  50, -89 },
+    {  18,  75,  18,  75,  18,  75,  18,  75 },
+    {  18, -50,  18, -50,  18, -50,  18, -50 },
+    {  75, -89,  75, -89,  75, -89,  75, -89 },
+};
+const WORD16 g_ai2_ihevc_trans_intr_4[4][8] =
+{
+    {  83,  36,  83,  36,  83,  36,  83,  36 },
+    {  36, -83,  36, -83,  36, -83,  36, -83 },
+    {  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  64, -64,  64, -64,  64, -64,  64, -64 }
+};
+
+const UWORD8 IHEVCE_CHROMA_SHUFFLEMASK_HBD[8] = { 0x00, 0x01, 0x04, 0x05,
+    0x08, 0x09, 0x0C, 0x0D };
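+/* Presumably a pshufb-style byte-shuffle mask: bytes {0,1, 4,5, 8,9, 12,13}
+   pick the even-indexed 16-bit words of a 128-bit register, i.e. one chroma
+   component from interleaved high-bit-depth CbCr samples. */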
+#ifndef DISABLE_AVX2
+const WORD32 g_ai4_ihevc_trans_8_intr_avx2[7][8] =
+{ /* 8*32 = 256 bit */
+    {  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  83,  83,  83,  83,  83,  83,  83,  83 },
+    {  36,  36,  36,  36,  36,  36,  36,  36 },
+    {  75,  75,  75,  75,  75,  75,  75,  75 },
+    {  18,  18,  18,  18,  18,  18,  18,  18 },
+    {  89,  89,  89,  89,  89,  89,  89,  89 },
+    {  50,  50,  50,  50,  50,  50,  50,  50 },
+};
+const WORD16 g_ai2_ihevc_trans_8_intr_avx2[8][16] =
+{ /* 16*16 = 256 bit */
+    {  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  89,  75,  18,  50,  89,  75,  18,  50,  89,  75,  18,  50,  89,  75,  18,  50 },
+    {  83,  36,  83,  36,  83,  36,  83,  36,  83,  36,  83,  36,  83,  36,  83,  36 },
+    {  75, -18, -50, -89,  75, -18, -50, -89,  75, -18, -50, -89,  75, -18, -50, -89 },
+    {  64, -64,  64, -64,  64, -64,  64, -64,  64, -64,  64, -64,  64, -64,  64, -64 },
+    {  50, -89,  75,  18,  50, -89,  75,  18,  50, -89,  75,  18,  50, -89,  75,  18 },
+    {  36, -83,  36, -83,  36, -83,  36, -83,  36, -83,  36, -83,  36, -83,  36, -83 },
+    {  18, -50, -89,  75,  18, -50, -89,  75,  18, -50, -89,  75,  18, -50, -89,  75 }
+};
+
+const WORD32 g_ai2_ihevc_trans_32_intr_8_avx2[8][8] =
+{ /* 8*32 = 256 bit */
+    { 90, 90, 90, 90, 90, 90, 90, 90 },
+    { 87, 87, 87, 87, 87, 87, 87, 87 },
+    { 80, 80, 80, 80, 80, 80, 80, 80 },
+    { 70, 70, 70, 70, 70, 70, 70, 70 },
+    { 57, 57, 57, 57, 57, 57, 57, 57 },
+    { 43, 43, 43, 43, 43, 43, 43, 43 },
+    { 25, 25, 25, 25, 25, 25, 25, 25 },
+    {  9,  9,  9,  9,  9,  9,  9,  9 }
+};
+const WORD32 g_ai2_ihevc_trans_32_intr_16_avx2[15][8] =
+{ /* 8*32 = 256 bit */
+    { 90, 90, 90, 90, 90, 90, 90, 90, },
+    { 88, 88, 88, 88, 88, 88, 88, 88, },
+    { 85, 85, 85, 85, 85, 85, 85, 85, },
+    { 82, 82, 82, 82, 82, 82, 82, 82, },
+    { 78, 78, 78, 78, 78, 78, 78, 78, },
+    { 73, 73, 73, 73, 73, 73, 73, 73, },
+    { 67, 67, 67, 67, 67, 67, 67, 67, },
+    { 61, 61, 61, 61, 61, 61, 61, 61, },
+    { 54, 54, 54, 54, 54, 54, 54, 54, },
+    { 46, 46, 46, 46, 46, 46, 46, 46, },
+    { 38, 38, 38, 38, 38, 38, 38, 38, },
+    { 31, 31, 31, 31, 31, 31, 31, 31, },
+    { 22, 22, 22, 22, 22, 22, 22, 22, },
+    { 13, 13, 13, 13, 13, 13, 13, 13, },
+    { 4,  4,  4,  4,  4,  4,  4,  4,  }
+};
+const WORD16 g_ai2_ihevc_trans_16_intr_odd_avx2[32][16] =
+{
+    {  90,  87,  90,  87,  90,  87,  90,  87,  90,  87,  90,  87,  90,  87,  90,  87 },
+    {  70,  80,  70,  80,  70,  80,  70,  80,  70,  80,  70,  80,  70,  80,  70,  80 },
+    {  57,  43,  57,  43,  57,  43,  57,  43,  57,  43,  57,  43,  57,  43,  57,  43 },
+    {   9,  25,   9,  25,   9,  25,   9,  25,   9,  25,   9,  25,   9,  25,   9,  25 },
+    {  87,  57,  87,  57,  87,  57,  87,  57,  87,  57,  87,  57,  87,  57,  87,  57 },
+    { -43,   9, -43,   9, -43,   9, -43,   9, -43,   9, -43,   9, -43,   9, -43,   9 },
+    { -80, -90, -80, -90, -80, -90, -80, -90, -80, -90, -80, -90, -80, -90, -80, -90 },
+    { -25, -70, -25, -70, -25, -70, -25, -70, -25, -70, -25, -70, -25, -70, -25, -70 },
+    {  80,   9,  80,   9,  80,   9,  80,   9,  80,   9,  80,   9,  80,   9,  80,   9 },
+    { -87, -70, -87, -70, -87, -70, -87, -70, -87, -70, -87, -70, -87, -70, -87, -70 },
+    { -25,  57, -25,  57, -25,  57, -25,  57, -25,  57, -25,  57, -25,  57, -25,  57 },
+    {  43,  90,  43,  90,  43,  90,  43,  90,  43,  90,  43,  90,  43,  90,  43,  90 },
+    {  70, -43,  70, -43,  70, -43,  70, -43,  70, -43,  70, -43,  70, -43,  70, -43 },
+    {   9, -87,   9, -87,   9, -87,   9, -87,   9, -87,   9, -87,   9, -87,   9, -87 },
+    {  90,  25,  90,  25,  90,  25,  90,  25,  90,  25,  90,  25,  90,  25,  90,  25 },
+    { -57, -80, -57, -80, -57, -80, -57, -80, -57, -80, -57, -80, -57, -80, -57, -80 },
+    {  57, -80,  57, -80,  57, -80,  57, -80,  57, -80,  57, -80,  57, -80,  57, -80 },
+    {  90, -25,  90, -25,  90, -25,  90, -25,  90, -25,  90, -25,  90, -25,  90, -25 },
+    {  -9, -87,  -9, -87,  -9, -87,  -9, -87,  -9, -87,  -9, -87,  -9, -87,  -9, -87 },
+    {  70,  43,  70,  43,  70,  43,  70,  43,  70,  43,  70,  43,  70,  43,  70,  43 },
+    {  43, -90,  43, -90,  43, -90,  43, -90,  43, -90,  43, -90,  43, -90,  43, -90 },
+    {  25,  57,  25,  57,  25,  57,  25,  57,  25,  57,  25,  57,  25,  57,  25,  57 },
+    { -87,  70, -87,  70, -87,  70, -87,  70, -87,  70, -87,  70, -87,  70, -87,  70 },
+    { -80,   9, -80,   9, -80,   9, -80,   9, -80,   9, -80,   9, -80,   9, -80,   9 },
+    {  25, -70,  25, -70,  25, -70,  25, -70,  25, -70,  25, -70,  25, -70,  25, -70 },
+    { -80,  90, -80,  90, -80,  90, -80,  90, -80,  90, -80,  90, -80,  90, -80,  90 },
+    {  43,   9,  43,   9,  43,   9,  43,   9,  43,   9,  43,   9,  43,   9,  43,   9 },
+    {  87, -57,  87, -57,  87, -57,  87, -57,  87, -57,  87, -57,  87, -57,  87, -57 },
+    {   9, -25,   9, -25,   9, -25,   9, -25,   9, -25,   9, -25,   9, -25,   9, -25 },
+    { -57,  43, -57,  43, -57,  43, -57,  43, -57,  43, -57,  43, -57,  43, -57,  43 },
+    {  70, -80,  70, -80,  70, -80,  70, -80,  70, -80,  70, -80,  70, -80,  70, -80 },
+    { -90,  87, -90,  87, -90,  87, -90,  87, -90,  87, -90,  87, -90,  87, -90,  87 }
+};
+
+const WORD16 g_ai2_ihevc_trans_16_intr_even_avx2[12][16] =
+{
+    {  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  89,  75,  89,  75,  89,  75,  89,  75,  89,  75,  89,  75,  89,  75,  89,  75 },
+    {  18,  50,  18,  50,  18,  50,  18,  50,  18,  50,  18,  50,  18,  50,  18,  50 },
+    {  83,  36,  83,  36,  83,  36,  83,  36,  83,  36,  83,  36,  83,  36,  83,  36 },
+    {  75, -18,  75, -18,  75, -18,  75, -18,  75, -18,  75, -18,  75, -18,  75, -18 },
+    { -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89 },
+    {  64, -64,  64, -64,  64, -64,  64, -64,  64, -64,  64, -64,  64, -64,  64, -64 },
+    {  50, -89,  50, -89,  50, -89,  50, -89,  50, -89,  50, -89,  50, -89,  50, -89 },
+    {  75,  18,  75,  18,  75,  18,  75,  18,  75,  18,  75,  18,  75,  18,  75,  18 },
+    {  36, -83,  36, -83,  36, -83,  36, -83,  36, -83,  36, -83,  36, -83,  36, -83 },
+    {  18, -50,  18, -50,  18, -50,  18, -50,  18, -50,  18, -50,  18, -50,  18, -50 },
+    { -89,  75, -89,  75, -89,  75, -89,  75, -89,  75, -89,  75, -89,  75, -89,  75 }
+};
+
+
+#endif
diff --git a/common/ihevc_trans_tables.h b/common/ihevc_trans_tables.h
new file mode 100644
index 0000000..7295967
--- /dev/null
+++ b/common/ihevc_trans_tables.h
@@ -0,0 +1,116 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_trans_tables.h
+*
+* @brief
+*  Tables for forward and inverse transform
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_TRANS_TABLES_H_
+#define _IHEVC_TRANS_TABLES_H_
+
+
+#include "ihevc_defs.h"
+
+extern const WORD32 g_ihevc_iquant_scales[6];
+
+extern const WORD16 g_ihevc_iquant_intr_scales[6][8];
+
+extern const WORD32 g_ihevc_quant_scales[6];
+
+extern const WORD16 g_ai2_ihevc_trans_dst_4[4][4];
+
+extern const WORD16 g_ai2_ihevc_trans_4[4][4];
+
+extern const WORD16 g_ai2_ihevc_trans_4_transpose[4][4];
+
+extern const WORD16 g_ai2_ihevc_trans_8[8][8];
+
+extern const WORD16 g_ai2_ihevc_trans_16[16][16];
+extern const WORD16 g_ai2_ihevc_trans_16_transpose[1][16];
+extern const WORD16 g_ai2_ihevc_trans_32_transpose[1][32];
+extern const WORD16 g_ai2_ihevc_trans_32[32][32];
+
+
+extern const WORD32 g_ai4_ihevc_trans_dst_intr_4[3][4];
+
+extern const WORD32 g_ai4_ihevc_trans_4_intr[3][4];
+extern const WORD16 g_ai2_ihevc_trans_4_intr[8];
+
+extern const WORD32 g_ai4_ihevc_trans_8_intr[7][4];
+extern const WORD16 g_ai2_ihevc_trans_8_intr[8][8];
+
+
+extern const WORD32 g_ai4_ihevc_trans_4_ttype1[3][4];
+
+extern const WORD32 g_ai4_ihevc_trans_4_ttype0[3][4];
+
+extern const WORD32 g_ai4_ihevc_trans_intr_even_8[3][4];
+
+extern const WORD32 g_ai4_ihevc_trans_intr_odd_8[4][4];
+
+extern const WORD32 g_ai4_ihevc_trans_16_even[7][4];
+
+extern const WORD32 g_ai4_ihevc_trans_16_odd[8][4];
+
+extern const WORD32 g_ai2_ihevc_trans_32_intr_8[8][4];
+extern const WORD32 g_ai2_ihevc_trans_32_intr_16[15][4];
+
+extern const WORD16 g_ai2_ihevc_trans_16_intr_even[12][8];
+
+extern const WORD16 g_ai2_ihevc_trans_16_intr_odd[32][8];
+
+
+extern const WORD16 g_ai2_ihevc_trans_32_intr_odd[32][16];
+
+extern const WORD16 g_ai2_ihevc_trans_32_intr_even[22][8];
+
+#ifndef DISABLE_AVX2
+extern const WORD16 g_ai2_ihevc_trans_8_intr_avx2[8][16];
+extern const WORD32 g_ai4_ihevc_trans_8_intr_avx2[7][8];
+extern const WORD16 g_ai2_ihevc_trans_16_intr_odd_avx2[32][16];
+extern const WORD16 g_ai2_ihevc_trans_16_intr_even_avx2[12][16];
+extern const WORD32 g_ai2_ihevc_trans_32_intr_8_avx2[8][8];
+extern const WORD32 g_ai2_ihevc_trans_32_intr_16_avx2[15][8];
+#endif
+
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_16_even_packed[12][8];
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_32_intr_packed[32][8];
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_32_intr_odd_packed[128][8];
+
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_16_even[12][8];
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_16_odd[32][8];
+
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_intr_even_8[4][8];
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_intr_odd_8[8][8];
+
+extern const WORD16 g_ai2_ihevc_trans_intr_4[4][8];
+
+extern const UWORD8 IHEVCE_CHROMA_SHUFFLEMASK_HBD[8];
+
+#endif /*_IHEVC_TRANS_TABLES_H_*/
diff --git a/common/ihevc_typedefs.h b/common/ihevc_typedefs.h
new file mode 100644
index 0000000..47a7a2f
--- /dev/null
+++ b/common/ihevc_typedefs.h
@@ -0,0 +1,65 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_typedefs.h
+*
+* @brief
+*  Type definitions used in the code
+*
+* @author
+*  Srinivas T
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVC_TYPEDEFS_H_
+#define _IHEVC_TYPEDEFS_H_
+
+
+typedef unsigned char   UWORD8;
+typedef unsigned short  UWORD16;
+typedef unsigned int    UWORD32;
+
+typedef signed char     WORD8;
+typedef signed short    WORD16;
+typedef signed int      WORD32;
+
+typedef char            CHAR;
+
+typedef double          DOUBLE;
+
+
+
+
+#ifndef MSVC
+
+typedef unsigned long long ULWORD64;
+typedef signed long long    LWORD64;
+
+#else
+typedef unsigned __int64    ULWORD64;
+typedef __int64             LWORD64;
+
+
+#endif
+#endif /*   _IHEVC_TYPEDEFS_H_ */
diff --git a/common/ihevc_weighted_pred.c b/common/ihevc_weighted_pred.c
new file mode 100644
index 0000000..a806293
--- /dev/null
+++ b/common/ihevc_weighted_pred.c
@@ -0,0 +1,604 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_weighted_pred.c
+*
+* @brief
+*  Contains function definitions for weighted prediction used in inter
+* prediction
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*   - ihevc_weighted_pred_uni()
+*   - ihevc_weighted_pred_bi()
+*   - ihevc_weighted_pred_bi_default()
+*   - ihevc_weighted_pred_chroma_uni()
+*   - ihevc_weighted_pred_chroma_bi()
+*   - ihevc_weighted_pred_chroma_bi_default()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+
+#include "ihevc_inter_pred.h"
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does uni-weighted prediction on the array pointed to by pi2_src and
+* stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+* offset
+*
+* @param[in] pi2_src
+*  Pointer to the source
+*
+* @param[out] pu1_dst
+*  Pointer to the destination
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight by which the source is multiplied
+*
+* @param[in] off0
+*  offset to be added after rounding and shifting
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_uni(WORD16 *pi2_src,
+                             UWORD8 *pu1_dst,
+                             WORD32 src_strd,
+                             WORD32 dst_strd,
+                             WORD32 wgt0,
+                             WORD32 off0,
+                             WORD32 shift,
+                             WORD32 lvl_shift,
+                             WORD32 ht,
+                             WORD32 wd)
+{
+    WORD32 row, col;
+    WORD32 i4_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            i4_tmp = (pi2_src[col] + lvl_shift) * wgt0;
+            i4_tmp += 1 << (shift - 1);
+            i4_tmp = (i4_tmp >> shift) + off0;
+
+            pu1_dst[col] = CLIP_U8(i4_tmp);
+        }
+
+        pi2_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+//WEIGHTED_PRED_UNI
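+
+/* Illustrative usage (a minimal sketch, compiled out): the block size,
+ * weight, offset and shift below are hypothetical. For 8-bit content with a
+ * weight denominator of 64, shift = log2_weight_denominator + (14 - 8) =
+ * 6 + 6 = 12 and the default weight is 64. */
+#if 0
+static void example_weighted_pred_uni(void)
+{
+    WORD16 ai2_src[4 * 4] = { 0 };  /* 4x4 block of 14-bit predictions */
+    UWORD8 au1_dst[4 * 4];          /* 8-bit output block              */
+
+    ihevc_weighted_pred_uni(ai2_src, au1_dst,
+                            4, 4,   /* src_strd, dst_strd */
+                            64, 0,  /* wgt0, off0         */
+                            12, 0,  /* shift, lvl_shift   */
+                            4, 4);  /* ht, wd             */
+}
+#endif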
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma uni-weighted prediction on the array pointed to by pi2_src and
+* stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+* offset
+*
+* @param[in] pi2_src
+*  Pointer to the source
+*
+* @param[out] pu1_dst
+*  Pointer to the destination
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight by which the source is multiplied
+*
+* @param[in] off0
+*  offset to be added after rounding and shifting
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_uni(WORD16 *pi2_src,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 src_strd,
+                                    WORD32 dst_strd,
+                                    WORD32 wgt0_cb,
+                                    WORD32 wgt0_cr,
+                                    WORD32 off0_cb,
+                                    WORD32 off0_cr,
+                                    WORD32 shift,
+                                    WORD32 lvl_shift,
+                                    WORD32 ht,
+                                    WORD32 wd)
+{
+    WORD32 row, col;
+    WORD32 i4_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col += 2)
+        {
+            i4_tmp = (pi2_src[col] + lvl_shift) * wgt0_cb;
+            i4_tmp += 1 << (shift - 1);
+            i4_tmp = (i4_tmp >> shift) + off0_cb;
+
+            pu1_dst[col] = CLIP_U8(i4_tmp);
+
+            i4_tmp = (pi2_src[col + 1] + lvl_shift) * wgt0_cr;
+            i4_tmp += 1 << (shift - 1);
+            i4_tmp = (i4_tmp >> shift) + off0_cr;
+
+            pu1_dst[col + 1] = CLIP_U8(i4_tmp);
+        }
+
+        pi2_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+//WEIGHTED_PRED_CHROMA_UNI
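+
+/* Illustrative usage (a minimal sketch, compiled out): the chroma variant
+ * expects semi-planar, interleaved CbCr samples, so a block that is wd
+ * pixels wide per component occupies 2 * wd entries per row and the strides
+ * below are in interleaved units. All values are hypothetical. */
+#if 0
+static void example_weighted_pred_chroma_uni(void)
+{
+    WORD16 ai2_src[4 * 8] = { 0 };  /* 4 rows of interleaved CbCr */
+    UWORD8 au1_dst[4 * 8];
+
+    ihevc_weighted_pred_chroma_uni(ai2_src, au1_dst,
+                                   8, 8,    /* src_strd, dst_strd     */
+                                   64, 64,  /* wgt0_cb, wgt0_cr       */
+                                   0, 0,    /* off0_cb, off0_cr       */
+                                   12, 0,   /* shift, lvl_shift       */
+                                   4, 4);   /* ht, wd (per component) */
+}
+#endif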
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
+* ( (off0 + off1 + 1) << (shift - 1) ) ) >> shift
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight by which source 1 is multiplied
+*
+* @param[in] off0
+*  offset 0
+*
+* @param[in] wgt1
+*  weight by which source 2 is multiplied
+*
+* @param[in] off1
+*  offset 1
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_bi(WORD16 *pi2_src1,
+                            WORD16 *pi2_src2,
+                            UWORD8 *pu1_dst,
+                            WORD32 src_strd1,
+                            WORD32 src_strd2,
+                            WORD32 dst_strd,
+                            WORD32 wgt0,
+                            WORD32 off0,
+                            WORD32 wgt1,
+                            WORD32 off1,
+                            WORD32 shift,
+                            WORD32 lvl_shift1,
+                            WORD32 lvl_shift2,
+                            WORD32 ht,
+                            WORD32 wd)
+{
+    WORD32 row, col;
+    WORD32 i4_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0;
+            i4_tmp += (pi2_src2[col] + lvl_shift2) * wgt1;
+            i4_tmp += (off0 + off1 + 1) << (shift - 1);
+
+            pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+        }
+
+        pi2_src1 += src_strd1;
+        pi2_src2 += src_strd2;
+        pu1_dst += dst_strd;
+    }
+}
+//WEIGHTED_PRED_BI
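+
+/* Worked example (a minimal self-check, compiled out; all values are
+ * hypothetical): with wgt0 = wgt1 = 64, off0 = off1 = 0 and shift = 13, two
+ * 14-bit predictions of 8192 (an 8-bit 128 scaled by 64) average back to
+ * 128, matching plain bi-prediction. */
+#if 0
+#include <assert.h>
+static void example_weighted_pred_bi_check(void)
+{
+    WORD32 i4_p0 = 8192, i4_p1 = 8192;
+    WORD32 i4_tmp = i4_p0 * 64 + i4_p1 * 64 + ((0 + 0 + 1) << 12);
+
+    assert((i4_tmp >> 13) == 128);
+}
+#endif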
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1
+* and pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
+* ( (off0 + off1 + 1) << (shift - 1) ) ) >> shift
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight by which source 1 is multiplied
+*
+* @param[in] off0
+*  offset 0
+*
+* @param[in] wgt1
+*  weight by which source 2 is multiplied
+*
+* @param[in] off1
+*  offset 1
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_bi(WORD16 *pi2_src1,
+                                   WORD16 *pi2_src2,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 src_strd1,
+                                   WORD32 src_strd2,
+                                   WORD32 dst_strd,
+                                   WORD32 wgt0_cb,
+                                   WORD32 wgt0_cr,
+                                   WORD32 off0_cb,
+                                   WORD32 off0_cr,
+                                   WORD32 wgt1_cb,
+                                   WORD32 wgt1_cr,
+                                   WORD32 off1_cb,
+                                   WORD32 off1_cr,
+                                   WORD32 shift,
+                                   WORD32 lvl_shift1,
+                                   WORD32 lvl_shift2,
+                                   WORD32 ht,
+                                   WORD32 wd)
+{
+    WORD32 row, col;
+    WORD32 i4_tmp;
+
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col += 2)
+        {
+            i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0_cb;
+            i4_tmp += (pi2_src2[col] + lvl_shift2) * wgt1_cb;
+            i4_tmp += (off0_cb + off1_cb + 1) << (shift - 1);
+
+            pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+
+            i4_tmp = (pi2_src1[col + 1] + lvl_shift1) * wgt0_cr;
+            i4_tmp += (pi2_src2[col + 1] + lvl_shift2) * wgt1_cr;
+            i4_tmp += (off0_cr + off1_cr + 1) << (shift - 1);
+
+            pu1_dst[col + 1] = CLIP_U8(i4_tmp >> shift);
+        }
+
+        pi2_src1 += src_strd1;
+        pi2_src2 += src_strd2;
+        pu1_dst += dst_strd;
+    }
+}
+//WEIGHTED_PRED_CHROMA_BI
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does default bi-weighted prediction on the arrays pointed to by pi2_src1
+* and pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+* >> shift,  where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_bi_default(WORD16 *pi2_src1,
+                                    WORD16 *pi2_src2,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 src_strd1,
+                                    WORD32 src_strd2,
+                                    WORD32 dst_strd,
+                                    WORD32 lvl_shift1,
+                                    WORD32 lvl_shift2,
+                                    WORD32 ht,
+                                    WORD32 wd)
+{
+    WORD32 row, col;
+    WORD32 i4_tmp;
+    WORD32 shift;
+
+    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            i4_tmp = pi2_src1[col] + lvl_shift1;
+            i4_tmp += pi2_src2[col] + lvl_shift2;
+            i4_tmp += 1 << (shift - 1);
+
+            pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+        }
+
+        pi2_src1 += src_strd1;
+        pi2_src2 += src_strd2;
+        pu1_dst += dst_strd;
+    }
+}
+//WEIGHTED_PRED_BI_DEFAULT
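+
+/* For 8-bit content SHIFT_14_MINUS_BIT_DEPTH is 6, so with zero level shifts
+ * the default bi-prediction above reduces to dst = (src1 + src2 + 64) >> 7.
+ * A minimal self-check (compiled out; the values are hypothetical): */
+#if 0
+#include <assert.h>
+static void example_weighted_pred_bi_default_check(void)
+{
+    WORD32 i4_p0 = 128 << 6;  /* 8-bit 128 as a 14-bit prediction */
+    WORD32 i4_p1 = 132 << 6;  /* 8-bit 132 as a 14-bit prediction */
+
+    assert(((i4_p0 + i4_p1 + (1 << 6)) >> 7) == 130);
+}
+#endif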
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does chroma default bi-weighted prediction on the arrays pointed to by
+* pi2_src1 and pi2_src2 and stores the result at the location pointed to by
+* pu1_dst
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+* >> shift,  where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_bi_default(WORD16 *pi2_src1,
+                                           WORD16 *pi2_src2,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd1,
+                                           WORD32 src_strd2,
+                                           WORD32 dst_strd,
+                                           WORD32 lvl_shift1,
+                                           WORD32 lvl_shift2,
+                                           WORD32 ht,
+                                           WORD32 wd)
+{
+    WORD32 row, col;
+    WORD32 i4_tmp;
+    WORD32 shift;
+
+    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col++)
+        {
+            i4_tmp = pi2_src1[col] + lvl_shift1;
+            i4_tmp += pi2_src2[col] + lvl_shift2;
+            i4_tmp += 1 << (shift - 1);
+
+            pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+        }
+
+        pi2_src1 += src_strd1;
+        pi2_src2 += src_strd2;
+        pu1_dst += dst_strd;
+    }
+}
+//WEIGHTED_PRED_CHROMA_BI_DEFAULT
diff --git a/common/ihevc_weighted_pred.h b/common/ihevc_weighted_pred.h
new file mode 100644
index 0000000..aaf9797
--- /dev/null
+++ b/common/ihevc_weighted_pred.h
@@ -0,0 +1,178 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_weighted_pred.h
+*
+* @brief
+*  Function declarations for weighted prediction
+*
+* @author
+*  Srinivas T
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVC_WEIGHTED_PRED_H_
+#define IHEVC_WEIGHTED_PRED_H_
+
+typedef void ihevc_weighted_pred_uni_ft(WORD16 *pi2_src,
+                                        UWORD8 *pu1_dst,
+                                        WORD32 src_strd,
+                                        WORD32 dst_strd,
+                                        WORD32 wgt0,
+                                        WORD32 off0,
+                                        WORD32 shift,
+                                        WORD32 lvl_shift,
+                                        WORD32 ht,
+                                        WORD32 wd);
+
+typedef void ihevc_weighted_pred_chroma_uni_ft(WORD16 *pi2_src,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 src_strd,
+                                               WORD32 dst_strd,
+                                               WORD32 wgt0_cb,
+                                               WORD32 wgt0_cr,
+                                               WORD32 off0_cb,
+                                               WORD32 off0_cr,
+                                               WORD32 shift,
+                                               WORD32 lvl_shift,
+                                               WORD32 ht,
+                                               WORD32 wd);
+
+typedef void ihevc_weighted_pred_bi_ft(WORD16 *pi2_src1,
+                                       WORD16 *pi2_src2,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 src_strd1,
+                                       WORD32 src_strd2,
+                                       WORD32 dst_strd,
+                                       WORD32 wgt0,
+                                       WORD32 off0,
+                                       WORD32 wgt1,
+                                       WORD32 off1,
+                                       WORD32 shift,
+                                       WORD32 lvl_shift1,
+                                       WORD32 lvl_shift2,
+                                       WORD32 ht,
+                                       WORD32 wd);
+
+typedef void ihevc_weighted_pred_chroma_bi_ft(WORD16 *pi2_src1,
+                                              WORD16 *pi2_src2,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 src_strd1,
+                                              WORD32 src_strd2,
+                                              WORD32 dst_strd,
+                                              WORD32 wgt0_cb,
+                                              WORD32 wgt0_cr,
+                                              WORD32 off0_cb,
+                                              WORD32 off0_cr,
+                                              WORD32 wgt1_cb,
+                                              WORD32 wgt1_cr,
+                                              WORD32 off1_cb,
+                                              WORD32 off1_cr,
+                                              WORD32 shift,
+                                              WORD32 lvl_shift1,
+                                              WORD32 lvl_shift2,
+                                              WORD32 ht,
+                                              WORD32 wd);
+
+typedef void ihevc_weighted_pred_bi_default_ft(WORD16 *pi2_src1,
+                                               WORD16 *pi2_src2,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 src_strd1,
+                                               WORD32 src_strd2,
+                                               WORD32 dst_strd,
+                                               WORD32 lvl_shift1,
+                                               WORD32 lvl_shift2,
+                                               WORD32 ht,
+                                               WORD32 wd);
+
+typedef void ihevc_weighted_pred_chroma_bi_default_ft(WORD16 *pi2_src1,
+                                                      WORD16 *pi2_src2,
+                                                      UWORD8 *pu1_dst,
+                                                      WORD32 src_strd1,
+                                                      WORD32 src_strd2,
+                                                      WORD32 dst_strd,
+                                                      WORD32 lvl_shift1,
+                                                      WORD32 lvl_shift2,
+                                                      WORD32 ht,
+                                                      WORD32 wd);
+/* C function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default;
+
+/* A9 Q function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_a9q;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_a9q;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_a9q;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_a9q;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_a9q;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_a9q;
+
+/* A9 A function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_a9a;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_a9a;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_a9a;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_a9a;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_a9a;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_a9a;
+
+/* NEONINTR function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_neonintr;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_neonintr;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_neonintr;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_neonintr;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_neonintr;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_neonintr;
+/* SSSE3 function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_ssse3;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_ssse3;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_ssse3;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_ssse3;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_ssse3;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_ssse3;
+
+/* SSE42 function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_sse42;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_sse42;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_sse42;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_sse42;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_sse42;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_sse42;
+
+/* AVX2 function declarations */
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_avx2;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_avx2;
+
+/* armv8 function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_av8;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_av8;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_av8;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_av8;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_av8;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_av8;
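+
+/* All of the variants above share the *_ft signatures, so an implementation
+ * can be bound at init time through a function pointer. A minimal sketch
+ * (compiled out; the selection logic here is hypothetical, not the
+ * decoder's actual init code): */
+#if 0
+static ihevc_weighted_pred_uni_ft *pf_weighted_pred_uni = ihevc_weighted_pred_uni;
+
+static void example_select_weighted_pred_uni(WORD32 arch_is_armv8)
+{
+    if(arch_is_armv8)
+        pf_weighted_pred_uni = ihevc_weighted_pred_uni_av8;
+}
+#endif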
+
+#endif /* IHEVC_WEIGHTED_PRED_H_ */
diff --git a/common/ithread.c b/common/ithread.c
new file mode 100644
index 0000000..232ecfa
--- /dev/null
+++ b/common/ithread.c
@@ -0,0 +1,454 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/*****************************************************************************/
+/*                                                                           */
+/*  File Name         : ithread.c                                            */
+/*                                                                           */
+/*  Description       : Contains abstractions for threads, mutexes and semaphores */
+/*                                                                           */
+/*  List of Functions :                                                      */
+/*                                                                           */
+/*  Issues / Problems : None                                                 */
+/*                                                                           */
+/*  Revision History  :                                                      */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   Harish          Initial Version                      */
+/*****************************************************************************/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ithread.h"
+#include <sys/types.h>
+
+#ifndef X86_MSVC
+//#define PTHREAD_AFFINITY
+//#define SYSCALL_AFFINITY
+
+#ifdef PTHREAD_AFFINITY
+#define _GNU_SOURCE
+#define __USE_GNU
+#endif
+
+#include <pthread.h>
+#include <sched.h>
+#include <semaphore.h>
+#include <unistd.h>
+
+
+#endif
+
+
+/* <sys/syscall.h> is needed only for the syscall-based affinity path */
+#ifdef SYSCALL_AFFINITY
+#include <sys/syscall.h>
+#endif
+
+
+#ifdef X86_MSVC
+
+#include <windows.h>
+#define SEM_MAX_COUNT       100
+#define SEM_INCREMENT_COUNT 1
+
+UWORD32 ithread_get_handle_size(void)
+{
+    return (sizeof(HANDLE));
+}
+
+UWORD32 ithread_get_mutex_lock_size(void)
+{
+    return (sizeof(HANDLE));
+}
+
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument)
+{
+    HANDLE *ppv_thread_handle;
+    HANDLE thread_handle_value;
+
+    if(0 == thread_handle)
+        return -1;
+
+    ppv_thread_handle = (HANDLE *)thread_handle;
+    thread_handle_value = (void *)CreateThread
+                    (NULL,                              /* Attributes      */
+                     1024 * 128,                        /* Stack size      */
+                     (LPTHREAD_START_ROUTINE)strt,      /* Thread function */
+                     argument,                          /* Parameters      */
+                     0,                                 /* Creation flags  */
+                     NULL);                             /* Thread ID       */
+    *ppv_thread_handle = (HANDLE)thread_handle_value;
+
+    return 0;
+}
+
+WORD32 ithread_join(void *thread_handle, void **val_ptr)
+{
+    HANDLE *ppv_thread_handle;
+    HANDLE thread_handle_value;
+
+    if(0 == thread_handle)
+        return -1;
+
+    ppv_thread_handle = (HANDLE *)thread_handle;
+    thread_handle_value = *ppv_thread_handle;
+
+    if(WAIT_OBJECT_0 == WaitForSingleObject(thread_handle_value, INFINITE))
+    {
+        CloseHandle(thread_handle_value);
+    }
+
+    return 0;
+}
+
+void ithread_exit(void *thread_handle)
+{
+    HANDLE *ppv_thread_handle;
+    HANDLE thread_handle_value;
+    DWORD thread_exit_code;
+
+    if(0 == thread_handle)
+        return;
+
+    ppv_thread_handle = (HANDLE *)thread_handle;
+    thread_handle_value = *ppv_thread_handle;
+    /* Get the thread's exit code; GetExitCodeThread() returns non-zero on success */
+    if(0 != GetExitCodeThread(thread_handle_value, &thread_exit_code))
+    {
+        TerminateThread(thread_handle_value, thread_exit_code);
+    }
+
+    return;
+}
+
+WORD32 ithread_get_mutex_struct_size(void)
+{
+    return (sizeof(HANDLE));
+}
+
+WORD32 ithread_mutex_init(void *mutex)
+{
+    HANDLE *ppv_mutex_handle;
+    HANDLE mutex_handle_value;
+
+    if(0 == mutex)
+        return -1;
+
+    ppv_mutex_handle = (HANDLE *)mutex;
+    mutex_handle_value = CreateSemaphore(NULL, 1, 1, NULL);
+    *ppv_mutex_handle = mutex_handle_value;
+    return 0;
+}
+
+WORD32 ithread_mutex_destroy(void *mutex)
+{
+    HANDLE *ppv_mutex_handle;
+    HANDLE mutex_handle_value;
+
+    if(0 == mutex)
+        return -1;
+
+    ppv_mutex_handle = (HANDLE *)mutex;
+    mutex_handle_value = *ppv_mutex_handle;
+    CloseHandle(mutex_handle_value);
+    return 0;
+}
+
+WORD32 ithread_mutex_lock(void *mutex)
+{
+    HANDLE *ppv_mutex_handle;
+    HANDLE mutex_handle_value;
+    DWORD  result = 0;
+
+    if(0 == mutex)
+        return -1;
+
+    ppv_mutex_handle = (HANDLE *)mutex;
+    mutex_handle_value = *ppv_mutex_handle;
+    result = WaitForSingleObject(mutex_handle_value, INFINITE);
+
+    if(WAIT_OBJECT_0 == result)
+        return 0;
+
+    return 1;
+
+}
+
+WORD32 ithread_mutex_unlock(void *mutex)
+{
+    HANDLE *ppv_mutex_handle;
+    HANDLE mutex_handle_value;
+    DWORD  result = 0;
+
+    if(0 == mutex)
+        return -1;
+
+    ppv_mutex_handle = (HANDLE *)mutex;
+    mutex_handle_value = *ppv_mutex_handle;
+    result = ReleaseSemaphore(mutex_handle_value, 1, NULL);
+
+    if(0 == result)
+        return -1;
+
+    return 0;
+}
+
+void ithread_yield(void) { }
+
+void ithread_usleep(UWORD32 u4_time_us)
+{
+    UWORD32 u4_time_ms = u4_time_us / 1000;
+    Sleep(u4_time_ms);
+}
+
+void ithread_msleep(UWORD32 u4_time_ms)
+{
+    Sleep(u4_time_ms);
+}
+
+
+void ithread_sleep(UWORD32 u4_time)
+{
+    UWORD32 u4_time_ms = u4_time * 1000;
+    Sleep(u4_time_ms);
+}
+
+UWORD32 ithread_get_sem_struct_size(void)
+{
+    return (sizeof(HANDLE));
+}
+
+WORD32 ithread_sem_init(void *sem, WORD32 pshared, UWORD32 value)
+{
+    HANDLE *sem_handle = (HANDLE *)sem;
+    HANDLE sem_handle_value;
+
+    if(0 == sem)
+        return -1;
+
+    sem_handle_value = CreateSemaphore(NULL,  /* Security Attribute*/
+                                       value,  /* Initial count     */
+                                       SEM_MAX_COUNT, /* Max value         */
+                                       NULL);        /* Name, not used    */
+    *sem_handle = sem_handle_value;
+    return 0;
+}
+
+WORD32 ithread_sem_post(void *sem)
+{
+    HANDLE *sem_handle = (HANDLE *)sem;
+    HANDLE sem_handle_value;
+
+    if(0 == sem)
+        return -1;
+
+    sem_handle_value = *sem_handle;
+
+    /* Post on the semaphore by incrementing its count */
+    if(ReleaseSemaphore(sem_handle_value, SEM_INCREMENT_COUNT, NULL))
+        return 0;
+
+    return -1;
+}
+
+WORD32 ithread_sem_wait(void *sem)
+{
+    DWORD          result = 0;
+    HANDLE *sem_handle = (HANDLE *)sem;
+    HANDLE sem_handle_value;
+
+    if(0 == sem)
+        return -1;
+
+    sem_handle_value = *sem_handle;
+
+    /* Wait on the semaphore object indefinitely */
+    result = WaitForSingleObject(sem_handle_value, INFINITE);
+
+    /* If lock on semaphore is acquired, return SUCCESS */
+    if(WAIT_OBJECT_0 == result)
+        return 0;
+
+    /* If the call times out, return FAILURE */
+    if(WAIT_TIMEOUT == result)
+        return -1;
+
+    return 0;
+}
+
+WORD32 ithread_sem_destroy(void *sem)
+{
+    HANDLE *sem_handle = (HANDLE *)sem;
+    HANDLE sem_handle_value;
+
+    if(0 == sem)
+        return -1;
+
+    sem_handle_value = *sem_handle;
+
+    if(FALSE == CloseHandle(sem_handle_value))
+    {
+        return -1;
+    }
+    return 0;
+}
+
+WORD32 ithread_set_affinity(WORD32 core_id)
+{
+    return 1;
+}
+
+#else
+UWORD32 ithread_get_handle_size(void)
+{
+    return sizeof(pthread_t);
+}
+
+UWORD32 ithread_get_mutex_lock_size(void)
+{
+    return sizeof(pthread_mutex_t);
+}
+
+
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument)
+{
+    return pthread_create((pthread_t *)thread_handle, attribute, (void * (*)(void *))strt, argument);
+}
+
+WORD32 ithread_join(void *thread_handle, void **val_ptr)
+{
+    pthread_t *pthread_handle   = (pthread_t *)thread_handle;
+    return pthread_join(*pthread_handle, val_ptr);
+}
+
+void ithread_exit(void *val_ptr)
+{
+    /* pthread_exit() returns void, so its result cannot be returned */
+    pthread_exit(val_ptr);
+}
+
+WORD32 ithread_get_mutex_struct_size(void)
+{
+    return (sizeof(pthread_mutex_t));
+}
+WORD32 ithread_mutex_init(void *mutex)
+{
+    return pthread_mutex_init((pthread_mutex_t *)mutex, NULL);
+}
+
+WORD32 ithread_mutex_destroy(void *mutex)
+{
+    return pthread_mutex_destroy((pthread_mutex_t *)mutex);
+}
+
+WORD32 ithread_mutex_lock(void *mutex)
+{
+    return pthread_mutex_lock((pthread_mutex_t *)mutex);
+}
+
+WORD32 ithread_mutex_unlock(void *mutex)
+{
+    return pthread_mutex_unlock((pthread_mutex_t *)mutex);
+}
+
+void ithread_yield(void)
+{
+    sched_yield();
+}
+
+void ithread_sleep(UWORD32 u4_time)
+{
+    usleep(u4_time * 1000 * 1000);
+}
+
+void ithread_msleep(UWORD32 u4_time_ms)
+{
+    usleep(u4_time_ms * 1000);
+}
+
+void ithread_usleep(UWORD32 u4_time_us)
+{
+    usleep(u4_time_us);
+}
+
+UWORD32 ithread_get_sem_struct_size(void)
+{
+    return (sizeof(sem_t));
+}
+
+
+WORD32 ithread_sem_init(void *sem, WORD32 pshared, UWORD32 value)
+{
+    return sem_init((sem_t *)sem, pshared, value);
+}
+
+WORD32 ithread_sem_post(void *sem)
+{
+    return sem_post((sem_t *)sem);
+}
+
+
+WORD32 ithread_sem_wait(void *sem)
+{
+    return sem_wait((sem_t *)sem);
+}
+
+
+WORD32 ithread_sem_destroy(void *sem)
+{
+    return sem_destroy((sem_t *)sem);
+}
+
+
+WORD32 ithread_set_affinity(WORD32 core_id)
+{
+
+#ifdef PTHREAD_AFFINITY
+    cpu_set_t cpuset;
+    int num_cores = sysconf(_SC_NPROCESSORS_ONLN);
+    pthread_t cur_thread = pthread_self();
+
+    if(core_id >= num_cores)
+        return -1;
+
+    CPU_ZERO(&cpuset);
+    CPU_SET(core_id, &cpuset);
+
+    return pthread_setaffinity_np(cur_thread, sizeof(cpu_set_t), &cpuset);
+
+#elif defined(SYSCALL_AFFINITY)
+    WORD32 i4_sys_res;
+    /* affinity mask with only core_id set */
+    WORD32 i4_mask = 1 << core_id;
+
+    pid_t pid = gettid();
+
+    i4_sys_res = syscall(__NR_sched_setaffinity, pid, sizeof(i4_mask), &i4_mask);
+    if(i4_sys_res)
+    {
+        //WORD32 err;
+        //err = errno;
+        //perror("Error in setaffinity syscall PERROR : ");
+        //LOG_ERROR("Error in the syscall setaffinity: mask=0x%x err=0x%x", i4_mask, i4_sys_res);
+        return -1;
+    }
+#endif
+
+    return core_id;
+
+}
+#endif
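+
+/* Illustrative usage (a minimal sketch, compiled out): callers allocate
+ * opaque handle storage of the size reported by the abstraction layer
+ * instead of including pthread.h or windows.h themselves. The worker
+ * function and malloc-based allocation are hypothetical. */
+#if 0
+#include <stdlib.h>
+
+static void example_worker(void *pv_arg)
+{
+    (void)pv_arg;
+}
+
+static WORD32 example_thread_usage(void)
+{
+    void *pv_handle = malloc(ithread_get_handle_size());
+    WORD32 i4_ret;
+
+    if(NULL == pv_handle)
+        return -1;
+
+    i4_ret = ithread_create(pv_handle, NULL, (void *)example_worker, NULL);
+    if(0 == i4_ret)
+        i4_ret = ithread_join(pv_handle, NULL);
+
+    free(pv_handle);
+    return i4_ret;
+}
+#endif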
diff --git a/common/ithread.h b/common/ithread.h
new file mode 100644
index 0000000..f435e78
--- /dev/null
+++ b/common/ithread.h
@@ -0,0 +1,78 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ithread.h
+*
+* @brief
+*  This file contains all the necessary function declarations needed for
+* the Application Program Interface (API) of the Thread Abstraction Layer
+*
+* @author
+*  Harish
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef __ITHREAD_H__
+#define __ITHREAD_H__
+
+UWORD32 ithread_get_handle_size(void);
+
+UWORD32 ithread_get_mutex_lock_size(void);
+
+WORD32  ithread_create(void *thread_handle, void *attribute, void *strt, void *argument);
+
+void    ithread_exit(void *val_ptr);
+
+WORD32  ithread_join(void *thread_id, void **val_ptr);
+
+WORD32  ithread_get_mutex_struct_size(void);
+
+WORD32  ithread_mutex_init(void *mutex);
+
+WORD32  ithread_mutex_destroy(void *mutex);
+
+WORD32  ithread_mutex_lock(void *mutex);
+
+WORD32  ithread_mutex_unlock(void *mutex);
+
+void    ithread_yield(void);
+
+void    ithread_sleep(UWORD32 u4_time);
+
+void    ithread_msleep(UWORD32 u4_time_ms);
+
+void    ithread_usleep(UWORD32 u4_time_us);
+
+UWORD32 ithread_get_sem_struct_size(void);
+
+WORD32  ithread_sem_init(void *sem, WORD32 pshared, UWORD32 value);
+
+WORD32  ithread_sem_post(void *sem);
+
+WORD32  ithread_sem_wait(void *sem);
+
+WORD32  ithread_sem_destroy(void *sem);
+
+WORD32 ithread_set_affinity(WORD32 core_id);
+#endif /* __ITHREAD_H__ */
diff --git a/common/iv.h b/common/iv.h
new file mode 100644
index 0000000..a60cf47
--- /dev/null
+++ b/common/iv.h
@@ -0,0 +1,418 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  iv.h
+*
+* @brief
+*  This file contains all the necessary structure and enumeration
+* definitions needed for the Application Program Interface (API) of the
+* Ittiam Video and Image codecs
+*
+* @author
+*  100239(RCY)
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IV_H
+#define _IV_H
+
+/*****************************************************************************/
+/* Constant Macros                                                           */
+/*****************************************************************************/
+
+
+/*****************************************************************************/
+/* Typedefs                                                                  */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Enums                                                                     */
+/*****************************************************************************/
+
+
+/* IV_API_CALL_STATUS_T: This is used only to return the FAIL/PASS status   */
+/* to the application for the current API call                              */
+
+typedef enum {
+    IV_STATUS_NA                                = 0x7FFFFFFF,
+    IV_SUCCESS                                  = 0x0,
+    IV_FAIL                                     = 0x1,
+}IV_API_CALL_STATUS_T;
+
+/* IV_MEM_TYPE_T: This enumeration defines the type of memory (internal/    */
+/* external) along with the cacheable/non-cacheable attributes              */
+
+typedef enum {
+    IV_NA_MEM_TYPE                              = 0x7FFFFFFF,
+    IV_INTERNAL_CACHEABLE_PERSISTENT_MEM        = 0x1,
+    IV_INTERNAL_CACHEABLE_SCRATCH_MEM           = 0x2,
+    IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM        = 0x3,
+    IV_EXTERNAL_CACHEABLE_SCRATCH_MEM           = 0x4,
+    IV_INTERNAL_NONCACHEABLE_PERSISTENT_MEM     = 0x5,
+    IV_INTERNAL_NONCACHEABLE_SCRATCH_MEM        = 0x6,
+    IV_EXTERNAL_NONCACHEABLE_PERSISTENT_MEM     = 0x7,
+    IV_EXTERNAL_NONCACHEABLE_SCRATCH_MEM        = 0x8
+}IV_MEM_TYPE_T;
+
+/* IV_COLOR_FORMAT_T: This enumeration lists all the color formats which    */
+/* finds usage in video/image codecs                                        */
+
+typedef enum {
+    IV_CHROMA_NA                            = 0x7FFFFFFF,
+    IV_YUV_420P                             = 0x1,
+    IV_YUV_422P                             = 0x2,
+    IV_420_UV_INTL                          = 0x3,
+    IV_YUV_422IBE                           = 0x4,
+    IV_YUV_422ILE                           = 0x5,
+    IV_YUV_444P                             = 0x6,
+    IV_YUV_411P                             = 0x7,
+    IV_GRAY                                 = 0x8,
+    IV_RGB_565                              = 0x9,
+    IV_RGB_24                               = 0xa,
+    IV_YUV_420SP_UV                         = 0xb,
+    IV_YUV_420SP_VU                         = 0xc,
+    IV_RGBA_8888                            = 0xd
+}IV_COLOR_FORMAT_T;
+
+/* IV_PICTURE_CODING_TYPE_T: VOP/Frame coding type Enumeration              */
+
+typedef enum {
+    IV_NA_FRAME                             = 0x7FFFFFFF,
+    IV_I_FRAME                              = 0x0,
+    IV_P_FRAME                              = 0x1,
+    IV_B_FRAME                              = 0x2,
+    IV_IDR_FRAME                            = 0x3,
+    IV_II_FRAME                             = 0x4,
+    IV_IP_FRAME                             = 0x5,
+    IV_IB_FRAME                             = 0x6,
+    IV_PI_FRAME                             = 0x7,
+    IV_PP_FRAME                             = 0x8,
+    IV_PB_FRAME                             = 0x9,
+    IV_BI_FRAME                             = 0xa,
+    IV_BP_FRAME                             = 0xb,
+    IV_BB_FRAME                             = 0xc,
+    IV_MBAFF_I_FRAME                        = 0xd,
+    IV_MBAFF_P_FRAME                        = 0xe,
+    IV_MBAFF_B_FRAME                        = 0xf,
+    IV_MBAFF_IDR_FRAME                      = 0x10,
+    IV_NOT_CODED_FRAME                      = 0x11,
+    IV_FRAMETYPE_DEFAULT                    = IV_I_FRAME
+}IV_PICTURE_CODING_TYPE_T;
+
+/* IV_FLD_TYPE_T: field type Enumeration                                    */
+
+typedef enum {
+    IV_NA_FLD                               = 0x7FFFFFFF,
+    IV_TOP_FLD                              = 0x0,
+    IV_BOT_FLD                              = 0x1,
+    IV_FLD_TYPE_DEFAULT                     = IV_TOP_FLD
+}IV_FLD_TYPE_T;
+
+/* IV_CONTENT_TYPE_T: Video content type                                     */
+
+typedef enum {
+    IV_CONTENTTYPE_NA                       = 0x7FFFFFFF,
+    IV_PROGRESSIVE                          = 0x0,
+    IV_INTERLACED                           = 0x1,
+    IV_PROGRESSIVE_FRAME                    = 0x2,
+    IV_INTERLACED_FRAME                     = 0x3,
+    IV_INTERLACED_TOPFIELD                  = 0x4,
+    IV_INTERLACED_BOTTOMFIELD               = 0x5,
+    IV_CONTENTTYPE_DEFAULT                  = IV_PROGRESSIVE,
+}IV_CONTENT_TYPE_T;
+
+/* IV_API_COMMAND_TYPE_T:API command type                                   */
+typedef enum {
+    IV_CMD_NA                           = 0x7FFFFFFF,
+    IV_CMD_GET_NUM_MEM_REC              = 0x0,
+    IV_CMD_FILL_NUM_MEM_REC             = 0x1,
+    IV_CMD_RETRIEVE_MEMREC              = 0x2,
+    IV_CMD_INIT                         = 0x3,
+    IV_CMD_DUMMY_ELEMENT                = 0x4,
+}IV_API_COMMAND_TYPE_T;
+
+/*****************************************************************************/
+/* Structure                                                                 */
+/*****************************************************************************/
+
+/* IV_OBJ_T: This structure defines the handle for the codec instance        */
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * Pointer to the API function pointer table of the codec
+     */
+    void                                        *pv_fxns;
+
+    /**
+     * Pointer to the handle of the codec
+     */
+    void                                        *pv_codec_handle;
+}iv_obj_t;
+
+/* iv_mem_rec_t: This structure defines the memory record holder which will  */
+/* be used by the codec to communicate its memory requirements to the        */
+/* application through appropriate API functions                             */
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * Pointer to the memory allocated by the application
+     */
+    void                                        *pv_base;
+
+    /**
+     * u4_size of the memory to be allocated
+     */
+    UWORD32                                     u4_mem_size;
+
+    /**
+     * Alignment of the memory pointer
+     */
+    UWORD32                                     u4_mem_alignment;
+    /**
+     * Nature of the memory to be allocated
+     */
+    IV_MEM_TYPE_T                               e_mem_type;
+}iv_mem_rec_t;
+
+/* IV_YUV_BUF_T: This structure defines attributes for the yuv buffer        */
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * Pointer to Luma (Y) Buffer
+     */
+
+    void                                        *pv_y_buf;
+    /**
+     * Pointer to Chroma (Cb) Buffer
+     */
+    void                                        *pv_u_buf;
+
+    /**
+     * Pointer to Chroma (Cr) Buffer
+     */
+    void                                        *pv_v_buf;
+
+    /**
+     * Width of the Luma (Y) Buffer
+     */
+    UWORD32                                     u4_y_wd;
+
+    /**
+     * Height of the Luma (Y) Buffer
+     */
+    UWORD32                                     u4_y_ht;
+
+    /**
+     * Stride/Pitch of the Luma (Y) Buffer
+     */
+    UWORD32                                     u4_y_strd;
+
+    /**
+     * Width of the Chroma (Cb) Buffer
+     */
+    UWORD32                                     u4_u_wd;
+
+    /**
+     * Height of the Chroma (Cb) Buffer
+     */
+    UWORD32                                     u4_u_ht;
+
+    /**
+     * Stride/Pitch of the Chroma (Cb) Buffer
+     */
+    UWORD32                                     u4_u_strd;
+
+    /**
+     * Width of the Chroma (Cr) Buffer
+     */
+    UWORD32                                     u4_v_wd;
+
+    /**
+     * Height of the Chroma (Cr) Buffer
+     */
+    UWORD32                                     u4_v_ht;
+
+    /**
+     * Stride/Pitch of the Chroma (Cr) Buffer
+     */
+    UWORD32                                     u4_v_strd;
+}iv_yuv_buf_t;
+
+/*****************************************************************************/
+/*  Get Number of Memory Records                                             */
+/*****************************************************************************/
+
+/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_GET_NUM_MEM_REC                     */
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * cmd
+     */
+    IV_API_COMMAND_TYPE_T                       e_cmd;
+}iv_num_mem_rec_ip_t;
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * error code
+     */
+    UWORD32                                     u4_error_code;
+
+    /**
+     * num_mem_rec
+     */
+    UWORD32                                     u4_num_mem_rec;
+}iv_num_mem_rec_op_t;
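+
+/* Illustrative only (compiled out): the structures above travel through the
+ * codec's single API entry point, which is declared elsewhere; the entry
+ * point type below is a hypothetical stand-in for it. */
+#if 0
+typedef IV_API_CALL_STATUS_T iv_api_function_t(iv_obj_t *ps_handle,
+                                               void *pv_api_ip,
+                                               void *pv_api_op);
+
+static UWORD32 example_get_num_mem_rec(iv_api_function_t *pf_api_function)
+{
+    iv_num_mem_rec_ip_t s_ip = { sizeof(s_ip), IV_CMD_GET_NUM_MEM_REC };
+    iv_num_mem_rec_op_t s_op = { sizeof(s_op), 0, 0 };
+
+    if(IV_SUCCESS != pf_api_function(NULL, &s_ip, &s_op))
+        return 0;
+
+    /* number of iv_mem_rec_t entries the application must allocate */
+    return s_op.u4_num_mem_rec;
+}
+#endif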
+
+
+/*****************************************************************************/
+/*  Fill Memory Records                                                      */
+/*****************************************************************************/
+
+/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_FILL_NUM_MEM_REC                    */
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * cmd
+     */
+    IV_API_COMMAND_TYPE_T                       e_cmd;
+
+    /**
+     * Pointer to an array of memory record structures, to be filled by the
+     * codec with the details of its memory requirements
+     */
+    iv_mem_rec_t                                *pv_mem_rec_location;
+
+    /**
+     * maximum width for which codec should request memory requirements
+     */
+    UWORD32                                     u4_max_frm_wd;
+
+    /**
+     * maximum height for which codec should request memory requirements
+     */
+    UWORD32                                     u4_max_frm_ht;
+}iv_fill_mem_rec_ip_t;
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * error_code
+     */
+    UWORD32                                     u4_error_code;
+
+    /**
+     * no of memory record structures which are filled by codec
+     */
+    UWORD32                                     u4_num_mem_rec_filled;
+}iv_fill_mem_rec_op_t;
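+
+/* A rough sketch of the memory handshake these two calls implement. The
+ * entry point api_function() is a hypothetical stand-in for the codec's
+ * actual API function, declared elsewhere, and aligned_alloc() is used
+ * loosely for brevity (a real application must honor its size/alignment
+ * contract).
+ *
+ *     iv_num_mem_rec_ip_t s_num_ip = {0};
+ *     iv_num_mem_rec_op_t s_num_op = {0};
+ *     s_num_ip.u4_size = sizeof(s_num_ip);
+ *     s_num_op.u4_size = sizeof(s_num_op);
+ *     s_num_ip.e_cmd   = IV_CMD_GET_NUM_MEM_REC;
+ *     api_function(NULL, &s_num_ip, &s_num_op);
+ *
+ *     iv_mem_rec_t *ps_mem = calloc(s_num_op.u4_num_mem_rec, sizeof(*ps_mem));
+ *     for(i = 0; i < s_num_op.u4_num_mem_rec; i++)
+ *         ps_mem[i].u4_size = sizeof(iv_mem_rec_t);
+ *
+ *     iv_fill_mem_rec_ip_t s_fill_ip = {0};
+ *     iv_fill_mem_rec_op_t s_fill_op = {0};
+ *     s_fill_ip.u4_size             = sizeof(s_fill_ip);
+ *     s_fill_op.u4_size             = sizeof(s_fill_op);
+ *     s_fill_ip.e_cmd               = IV_CMD_FILL_NUM_MEM_REC;
+ *     s_fill_ip.pv_mem_rec_location = ps_mem;
+ *     s_fill_ip.u4_max_frm_wd       = 1920;
+ *     s_fill_ip.u4_max_frm_ht       = 1080;
+ *     api_function(NULL, &s_fill_ip, &s_fill_op);
+ *
+ *     for(i = 0; i < s_fill_op.u4_num_mem_rec_filled; i++)
+ *         ps_mem[i].pv_base = aligned_alloc(ps_mem[i].u4_mem_alignment,
+ *                                           ps_mem[i].u4_mem_size);
+ */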
+
+
+/*****************************************************************************/
+/*  Retrieve Memory Records                                                  */
+/*****************************************************************************/
+
+/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_RETRIEVE_MEMREC                     */
+
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * cmd
+     */
+    IV_API_COMMAND_TYPE_T                       e_cmd;
+
+    /**
+     * Pointer to an array of memory record structures through which the codec returns all the memory resources it holds
+     */
+    iv_mem_rec_t                                *pv_mem_rec_location;
+}iv_retrieve_mem_rec_ip_t;
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * error_code
+     */
+    UWORD32                                     u4_error_code;
+
+    /**
+     * no of memory records filled by codec
+     */
+    UWORD32                                     u4_num_mem_rec_filled;
+}iv_retrieve_mem_rec_op_t;
+
+
+
+#endif /* _IV_H */
+
diff --git a/common/ivd.h b/common/ivd.h
new file mode 100644
index 0000000..812da18
--- /dev/null
+++ b/common/ivd.h
@@ -0,0 +1,946 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ivd.h
+*
+* @brief
+*  This file contains all the necessary structure and  enumeration
+* definitions needed for the Application  Program Interface(API) of the
+* Ittiam Video Decoders
+*
+* @author
+*  100239(RCY)
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IVD_H
+#define _IVD_H
+
+/*****************************************************************************/
+/* Constant Macros                                                           */
+/*****************************************************************************/
+#define IVD_VIDDEC_MAX_IO_BUFFERS 64
+/*****************************************************************************/
+/* Typedefs                                                                  */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Enums                                                                     */
+/*****************************************************************************/
+
+/* IVD_ARCH_T: Architecture Enumeration                               */
+typedef enum
+{
+    ARCH_NA                 =   0x7FFFFFFF,
+    ARCH_ARM_NONEON         =   0x0,
+    ARCH_ARM_A9Q,
+    ARCH_ARM_A9A,
+    ARCH_ARM_A9,
+    ARCH_ARM_A7,
+    ARCH_ARM_A5,
+    ARCH_ARM_A15,
+    ARCH_ARM_NEONINTR,
+    ARCH_ARMV8_GENERIC,
+    ARCH_X86_GENERIC        =   0x100,
+    ARCH_X86_SSSE3,
+    ARCH_X86_SSE42,
+    ARCH_X86_AVX2,
+    ARCH_MIPS_GENERIC       =   0x200,
+    ARCH_MIPS_32
+}IVD_ARCH_T;
+
+/* IVD_SOC_T: SOC Enumeration                               */
+typedef enum
+{
+    SOC_NA                  = 0x7FFFFFFF,
+    SOC_GENERIC             = 0x0,
+    SOC_HISI_37X            = 0x100,
+}IVD_SOC_T;
+
+/* IVD_FRAME_SKIP_MODE_T:Skip mode Enumeration                               */
+
+typedef enum {
+    IVD_SKIP_NONE                               = 0x7FFFFFFF,
+    IVD_SKIP_P                                  = 0x1,
+    IVD_SKIP_B                                  = 0x2,
+    IVD_SKIP_I                                  = 0x3,
+    IVD_SKIP_IP                                 = 0x4,
+    IVD_SKIP_IB                                 = 0x5,
+    IVD_SKIP_PB                                 = 0x6,
+    IVD_SKIP_IPB                                = 0x7,
+    IVD_SKIP_IDR                                = 0x8,
+    IVD_SKIP_DEFAULT                            = IVD_SKIP_NONE,
+}IVD_FRAME_SKIP_MODE_T;
+
+/* IVD_VIDEO_DECODE_MODE_T: Set decoder to decode either a frame's worth of  */
+/* data or only a header's worth of data                                     */
+
+typedef enum {
+    IVD_DECODE_MODE_NA                          = 0x7FFFFFFF,
+
+    /* This enables the codec to process all decodable units */
+    IVD_DECODE_FRAME                            = 0x0,
+
+    /* This enables the codec to decode header only */
+    IVD_DECODE_HEADER                           = 0x1,
+
+
+
+}IVD_VIDEO_DECODE_MODE_T;
+
+
+/* IVD_DISPLAY_FRAME_OUT_MODE_T: Video Display Frame Output Mode             */
+
+typedef enum {
+
+    IVD_DISPLAY_ORDER_NA                        = 0x7FFFFFFF,
+    /* To set codec to fill output buffers in display order */
+    IVD_DISPLAY_FRAME_OUT                       = 0x0,
+
+    /* To set codec to fill output buffers in decode order */
+    IVD_DECODE_FRAME_OUT                        = 0x1,
+}IVD_DISPLAY_FRAME_OUT_MODE_T;
+
+
+/* IVD_API_COMMAND_TYPE_T:API command type                                   */
+typedef enum {
+    IVD_CMD_VIDEO_NA                          = 0x7FFFFFFF,
+    IVD_CMD_VIDEO_CTL                         = IV_CMD_DUMMY_ELEMENT + 1,
+    IVD_CMD_VIDEO_DECODE,
+    IVD_CMD_GET_DISPLAY_FRAME,
+    IVD_CMD_REL_DISPLAY_FRAME,
+    IVD_CMD_SET_DISPLAY_FRAME
+}IVD_API_COMMAND_TYPE_T;
+
+/* IVD_CONTROL_API_COMMAND_TYPE_T: Video Control API command type            */
+
+typedef enum {
+    IVD_CMD_NA                          = 0x7FFFFFFF,
+    IVD_CMD_CTL_GETPARAMS               = 0x0,
+    IVD_CMD_CTL_SETPARAMS               = 0x1,
+    IVD_CMD_CTL_RESET                   = 0x2,
+    IVD_CMD_CTL_SETDEFAULT              = 0x3,
+    IVD_CMD_CTL_FLUSH                   = 0x4,
+    IVD_CMD_CTL_GETBUFINFO              = 0x5,
+    IVD_CMD_CTL_GETVERSION              = 0x6,
+    IVD_CMD_CTL_CODEC_SUBCMD_START         = 0x7
+}IVD_CONTROL_API_COMMAND_TYPE_T;
+
+
+/* IVD_ERROR_BITS_T: A UWORD32 container will be used for reporting the error*/
+/* code to the application. The first 8 bits starting from LSB have been     */
+/* reserved for the codec to report internal error details. The rest of the  */
+/* bits will be generic for all video decoders and each bit has an associated*/
+/* meaning as mentioned below. The unused bit fields are reserved for future */
+/* extensions and will be zero in the current implementation                 */
+
+typedef enum {
+    /* Bit 8  - Applied concealment.                                         */
+    IVD_APPLIEDCONCEALMENT                      = 0x8,
+    /* Bit 9 - Insufficient input data.                                     */
+    IVD_INSUFFICIENTDATA                        = 0x9,
+    /* Bit 10 - Data problem/corruption.                                     */
+    IVD_CORRUPTEDDATA                           = 0xa,
+    /* Bit 11 - Header problem/corruption.                                   */
+    IVD_CORRUPTEDHEADER                         = 0xb,
+    /* Bit 12 - Unsupported feature/parameter in input.                      */
+    IVD_UNSUPPORTEDINPUT                        = 0xc,
+    /* Bit 13 - Unsupported input parameter or configuration.                */
+    IVD_UNSUPPORTEDPARAM                        = 0xd,
+    /* Bit 14 - Fatal error (stop the codec). If there is an                 */
+    /* error and this bit is not set, the error is a recoverable one.        */
+    IVD_FATALERROR                              = 0xe,
+    /* Bit 15 - Invalid bitstream. Applies when Bitstream/YUV frame          */
+    /* buffer for encode/decode call is made with non-valid or zero u4_size  */
+    /* data                                                                  */
+    IVD_INVALID_BITSTREAM                       = 0xf,
+    /* Bit 16 - Incomplete bitstream                                         */
+    IVD_INCOMPLETE_BITSTREAM                    = 0x10,
+    IVD_ERROR_BITS_T_DUMMY_ELEMENT              = 0x7FFFFFFF
+}IVD_ERROR_BITS_T;
+
+
+/* IVD_ERROR_CODES_T: Error codes returned by the decoder                    */
+typedef enum {
+    IVD_ERROR_NONE                              = 0x0,
+    IVD_NUM_MEM_REC_FAILED                      = 0x1,
+    IVD_NUM_REC_NOT_SUFFICIENT                  = 0x2,
+    IVD_FILL_MEM_REC_FAILED                     = 0x3,
+    IVD_REQUESTED_WIDTH_NOT_SUPPPORTED          = 0x4,
+    IVD_REQUESTED_HEIGHT_NOT_SUPPPORTED         = 0x5,
+    IVD_INIT_DEC_FAILED                         = 0x6,
+    IVD_INIT_DEC_NOT_SUFFICIENT                 = 0x7,
+    IVD_INIT_DEC_WIDTH_NOT_SUPPPORTED           = 0x8,
+    IVD_INIT_DEC_HEIGHT_NOT_SUPPPORTED          = 0x9,
+    IVD_INIT_DEC_MEM_NOT_ALIGNED                = 0xa,
+    IVD_INIT_DEC_COL_FMT_NOT_SUPPORTED          = 0xb,
+    IVD_INIT_DEC_MEM_REC_NOT_SUFFICIENT         = 0xc,
+    IVD_GET_VERSION_DATABUFFER_SZ_INSUFFICIENT  = 0xd,
+    IVD_BUFFER_SIZE_SET_TO_ZERO                 = 0xe,
+    IVD_UNEXPECTED_END_OF_STREAM                = 0xf,
+    IVD_SEQUENCE_HEADER_NOT_DECODED             = 0x10,
+    IVD_STREAM_WIDTH_HEIGHT_NOT_SUPPORTED       = 0x11,
+    IVD_MAX_FRAME_LIMIT_REACHED                 = 0x12,
+    IVD_IP_API_STRUCT_SIZE_INCORRECT            = 0x13,
+    IVD_OP_API_STRUCT_SIZE_INCORRECT            = 0x14,
+    IVD_HANDLE_NULL                             = 0x15,
+    IVD_HANDLE_STRUCT_SIZE_INCORRECT            = 0x16,
+    IVD_INVALID_HANDLE_NULL                     = 0x17,
+    IVD_INVALID_API_CMD                         = 0x18,
+    IVD_UNSUPPORTED_API_CMD                     = 0x19,
+    IVD_MEM_REC_STRUCT_SIZE_INCORRECT           = 0x1a,
+    IVD_DISP_FRM_ZERO_OP_BUFS                   = 0x1b,
+    IVD_DISP_FRM_OP_BUF_NULL                    = 0x1c,
+    IVD_DISP_FRM_ZERO_OP_BUF_SIZE               = 0x1d,
+    IVD_DEC_FRM_BS_BUF_NULL                     = 0x1e,
+    IVD_SET_CONFG_INVALID_DEC_MODE              = 0x1f,
+    IVD_SET_CONFG_UNSUPPORTED_DISP_WIDTH        = 0x20,
+    IVD_RESET_FAILED                            = 0x21,
+    IVD_INIT_DEC_MEM_REC_OVERLAP_ERR            = 0x22,
+    IVD_INIT_DEC_MEM_REC_BASE_NULL              = 0x23,
+    IVD_INIT_DEC_MEM_REC_ALIGNMENT_ERR          = 0x24,
+    IVD_INIT_DEC_MEM_REC_INSUFFICIENT_SIZE      = 0x25,
+    IVD_INIT_DEC_MEM_REC_INCORRECT_TYPE         = 0x26,
+    IVD_DEC_NUMBYTES_INV                        = 0x27,
+    IVD_DEC_REF_BUF_NULL                        = 0x28,
+    IVD_DEC_FRM_SKIPPED                         = 0x29,
+    IVD_RES_CHANGED                             = 0x2a,
+    IVD_DUMMY_ELEMENT_FOR_CODEC_EXTENSIONS      = 0x300,
+}IVD_ERROR_CODES_T;
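+
+/* Per the bit layout described above IVD_ERROR_BITS_T, an error word splits
+ * into a codec-specific code in the low 8 bits and generic flags in the
+ * higher bits; a sketch of pulling it apart:
+ *
+ *     UWORD32 u4_err       = s_dec_op.u4_error_code;
+ *     UWORD32 u4_codec_err = u4_err & 0xFF;
+ *     if(u4_err & (1 << IVD_FATALERROR))
+ *         ; // unrecoverable: stop feeding the codec
+ *     else if(u4_err & (1 << IVD_INSUFFICIENTDATA))
+ *         ; // recoverable: supply more input
+ */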
+
+
+/*****************************************************************************/
+/* Structure                                                                 */
+/*****************************************************************************/
+/* structure for passing output buffers to codec during get display buffer   */
+/* call                                                                      */
+typedef struct {
+
+    /**
+     * number of output buffers
+     */
+    UWORD32             u4_num_bufs;
+
+    /**
+     * list of pointers to output buffers
+     */
+    UWORD8              *pu1_bufs[IVD_VIDDEC_MAX_IO_BUFFERS];
+
+    /**
+     * sizes of each output buffer
+     */
+    UWORD32             u4_min_out_buf_size[IVD_VIDDEC_MAX_IO_BUFFERS];
+
+}ivd_out_bufdesc_t;
+
+/*****************************************************************************/
+/*   Initialize decoder                                                      */
+/*****************************************************************************/
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_INIT                              */
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     *  e_cmd
+     */
+    IVD_API_COMMAND_TYPE_T                  e_cmd;
+
+    /**
+     * number of memory records allocated on the codec's request through fill mem records
+     */
+    UWORD32                                 u4_num_mem_rec;
+    /**
+     * maximum width for which codec should be initialized
+     */
+    UWORD32                                 u4_frm_max_wd;
+    /**
+     * maximum height for which codec should be initialized
+     */
+    UWORD32                                 u4_frm_max_ht;
+    /**
+     * format in which codec has to give out frame data for display
+     */
+    IV_COLOR_FORMAT_T                       e_output_format;
+    /**
+     * pointer to memrecord array, which contains allocated resources
+     */
+    iv_mem_rec_t                            *pv_mem_rec_location;
+}ivd_init_ip_t;
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * u4_error_code
+     */
+    UWORD32                                 u4_error_code;
+}ivd_init_op_t;
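+
+/* Initialization sketch, continuing the fill-mem-rec sequence from iv.h.
+ * api_function() is again a hypothetical stand-in; IVD_CMD_INIT is taken
+ * from the section comment above (its definition lives with the iv.h
+ * commands), and IV_YUV_420P is assumed to be an IV_COLOR_FORMAT_T value.
+ *
+ *     ivd_init_ip_t s_init_ip = {0};
+ *     ivd_init_op_t s_init_op = {0};
+ *     s_init_ip.u4_size             = sizeof(s_init_ip);
+ *     s_init_op.u4_size             = sizeof(s_init_op);
+ *     s_init_ip.e_cmd               = IVD_CMD_INIT;
+ *     s_init_ip.u4_num_mem_rec      = s_fill_op.u4_num_mem_rec_filled;
+ *     s_init_ip.u4_frm_max_wd       = 1920;
+ *     s_init_ip.u4_frm_max_ht       = 1080;
+ *     s_init_ip.e_output_format     = IV_YUV_420P;
+ *     s_init_ip.pv_mem_rec_location = ps_mem;
+ *     api_function(NULL, &s_init_ip, &s_init_op);
+ */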
+
+
+/*****************************************************************************/
+/*   Video Decode                                                            */
+/*****************************************************************************/
+
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_VIDEO_DECODE                      */
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * e_cmd
+     */
+    IVD_API_COMMAND_TYPE_T                  e_cmd;
+
+    /**
+     * u4_ts
+     */
+    UWORD32                                 u4_ts;
+
+    /**
+     * u4_num_Bytes
+     */
+    UWORD32                                 u4_num_Bytes;
+
+    /**
+     * pv_stream_buffer
+     */
+    void                                    *pv_stream_buffer;
+
+    /**
+     * output buffer desc
+     */
+    ivd_out_bufdesc_t                       s_out_buffer;
+
+}ivd_video_decode_ip_t;
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * u4_error_code
+     */
+    UWORD32                                 u4_error_code;
+
+    /**
+     * num_bytes_consumed
+     */
+    UWORD32                                 u4_num_bytes_consumed;
+
+    /**
+     * pic_wd
+     */
+    UWORD32                                 u4_pic_wd;
+
+    /**
+     * pic_ht
+     */
+    UWORD32                                 u4_pic_ht;
+
+    /**
+     * pic_type
+     */
+    IV_PICTURE_CODING_TYPE_T                e_pic_type;
+
+    /**
+     * frame_decoded_flag
+     */
+    UWORD32                                 u4_frame_decoded_flag;
+
+    /**
+     * new_seq
+     */
+    UWORD32                                 u4_new_seq;
+
+    /**
+     * output_present
+     */
+    UWORD32                                 u4_output_present;
+
+    /**
+     * progressive_frame_flag
+     */
+    UWORD32                                 u4_progressive_frame_flag;
+
+    /**
+     * is_ref_flag
+     */
+    UWORD32                                 u4_is_ref_flag;
+
+    /**
+     * output_format
+     */
+    IV_COLOR_FORMAT_T                       e_output_format;
+
+    /**
+     * disp_frm_buf
+     */
+    iv_yuv_buf_t                            s_disp_frm_buf;
+
+    /**
+     * fld_type
+     */
+    IV_FLD_TYPE_T                           e4_fld_type;
+
+    /**
+     * ts
+     */
+    UWORD32                                 u4_ts;
+
+    /**
+     * disp_buf_id
+     */
+    UWORD32                                 u4_disp_buf_id;
+}ivd_video_decode_op_t;
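+
+/* Decode-call sketch (ps_handle, api_function(), and the bitstream cursor
+ * variables are hypothetical application names, not part of this header):
+ *
+ *     ivd_video_decode_ip_t s_dec_ip = {0};
+ *     ivd_video_decode_op_t s_dec_op = {0};
+ *     s_dec_ip.u4_size          = sizeof(s_dec_ip);
+ *     s_dec_op.u4_size          = sizeof(s_dec_op);
+ *     s_dec_ip.e_cmd            = IVD_CMD_VIDEO_DECODE;
+ *     s_dec_ip.u4_ts            = u4_ts;
+ *     s_dec_ip.pv_stream_buffer = pu1_bitstream + u4_offset;
+ *     s_dec_ip.u4_num_Bytes     = u4_bytes_left;
+ *     api_function(ps_handle, &s_dec_ip, &s_dec_op);
+ *
+ *     u4_offset     += s_dec_op.u4_num_bytes_consumed;
+ *     u4_bytes_left -= s_dec_op.u4_num_bytes_consumed;
+ *     if(s_dec_op.u4_output_present)
+ *         ; // s_dec_op.s_disp_frm_buf now describes a displayable frame
+ */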
+
+
+/*****************************************************************************/
+/*   Get Display Frame                                                       */
+/*****************************************************************************/
+
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_GET_DISPLAY_FRAME                 */
+
+typedef struct
+{
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * e_cmd
+     */
+    IVD_API_COMMAND_TYPE_T                  e_cmd;
+
+    /**
+     * output buffer desc
+     */
+    ivd_out_bufdesc_t                       s_out_buffer;
+
+}ivd_get_display_frame_ip_t;
+
+
+typedef struct
+{
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * error_code
+     */
+    UWORD32                                 u4_error_code;
+
+    /**
+     * progressive_frame_flag
+     */
+    UWORD32                                 u4_progressive_frame_flag;
+
+    /**
+     * pic_type
+     */
+    IV_PICTURE_CODING_TYPE_T                e_pic_type;
+
+    /**
+     * is_ref_flag
+     */
+    UWORD32                                 u4_is_ref_flag;
+
+    /**
+     * output_format
+     */
+    IV_COLOR_FORMAT_T                       e_output_format;
+
+    /**
+     * disp_frm_buf
+     */
+    iv_yuv_buf_t                            s_disp_frm_buf;
+
+    /**
+     * fld_type
+     */
+    IV_FLD_TYPE_T                           e4_fld_type;
+
+    /**
+     * ts
+     */
+    UWORD32                                 u4_ts;
+
+    /**
+     * disp_buf_id
+     */
+    UWORD32                                 u4_disp_buf_id;
+}ivd_get_display_frame_op_t;
+
+/*****************************************************************************/
+/*   Set Display Frame                                                       */
+/*****************************************************************************/
+
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_SET_DISPLAY_FRAME                 */
+
+typedef struct
+{
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * cmd
+     */
+    IVD_API_COMMAND_TYPE_T                  e_cmd;
+
+    /**
+     * num_disp_bufs
+     */
+    UWORD32                                 num_disp_bufs;
+
+    /**
+     * output buffer desc
+     */
+    ivd_out_bufdesc_t                       s_disp_buffer[IVD_VIDDEC_MAX_IO_BUFFERS];
+
+}ivd_set_display_frame_ip_t;
+
+
+typedef struct
+{
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * error code
+     */
+    UWORD32                                 u4_error_code;
+}ivd_set_display_frame_op_t;
+
+
+/*****************************************************************************/
+/*   Release Display Frame                                                   */
+/*****************************************************************************/
+
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_REL_DISPLAY_FRAME                 */
+
+typedef struct
+{
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * e_cmd
+     */
+    IVD_API_COMMAND_TYPE_T                  e_cmd;
+
+    /**
+     * disp_buf_id
+     */
+    UWORD32                                 u4_disp_buf_id;
+}ivd_rel_display_frame_ip_t;
+
+
+typedef struct
+{
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * error code
+     */
+    UWORD32                                 u4_error_code;
+}ivd_rel_display_frame_op_t;
+
+/*****************************************************************************/
+/*   Video control  Flush                                                    */
+/*****************************************************************************/
+/* IVD_API_COMMAND_TYPE_T::e_cmd            = IVD_CMD_VIDEO_CTL              */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd    = IVD_CMD_CTL_FLUSH          */
+
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * cmd
+     */
+    IVD_API_COMMAND_TYPE_T                  e_cmd;
+
+    /**
+     * sub_cmd
+     */
+    IVD_CONTROL_API_COMMAND_TYPE_T          e_sub_cmd;
+}ivd_ctl_flush_ip_t;
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * error code
+     */
+    UWORD32                                 u4_error_code;
+}ivd_ctl_flush_op_t;
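+
+/* Flush is typically issued at end of stream; once it succeeds, decode calls
+ * with zero new input drain the frames still held inside the decoder.
+ * A sketch (api_function() and ps_handle as in the earlier examples):
+ *
+ *     ivd_ctl_flush_ip_t s_flush_ip = {0};
+ *     ivd_ctl_flush_op_t s_flush_op = {0};
+ *     s_flush_ip.u4_size   = sizeof(s_flush_ip);
+ *     s_flush_op.u4_size   = sizeof(s_flush_op);
+ *     s_flush_ip.e_cmd     = IVD_CMD_VIDEO_CTL;
+ *     s_flush_ip.e_sub_cmd = IVD_CMD_CTL_FLUSH;
+ *     api_function(ps_handle, &s_flush_ip, &s_flush_op);
+ *
+ *     do {
+ *         // IVD_CMD_VIDEO_DECODE with s_dec_ip.u4_num_Bytes = 0
+ *     } while(s_dec_op.u4_output_present);
+ */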
+
+/*****************************************************************************/
+/*   Video control reset                                                     */
+/*****************************************************************************/
+/* IVD_API_COMMAND_TYPE_T::e_cmd            = IVD_CMD_VIDEO_CTL              */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd    = IVD_CMD_CTL_RESET          */
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * cmd
+     */
+    IVD_API_COMMAND_TYPE_T                  e_cmd;
+
+    /**
+     * sub_cmd
+     */
+
+    IVD_CONTROL_API_COMMAND_TYPE_T          e_sub_cmd;
+}ivd_ctl_reset_ip_t;
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                 u4_size;
+
+    /**
+     * error code
+     */
+    UWORD32                                 u4_error_code;
+}ivd_ctl_reset_op_t;
+
+
+/*****************************************************************************/
+/*   Video control  Set Params                                               */
+/*****************************************************************************/
+/* IVD_API_COMMAND_TYPE_T::e_cmd        = IVD_CMD_VIDEO_CTL                  */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd=IVD_CMD_CTL_SETPARAMS           */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd=IVD_CMD_CTL_SETDEFAULT          */
+
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * cmd
+     */
+    IVD_API_COMMAND_TYPE_T                      e_cmd;
+
+    /**
+     * sub_cmd
+     */
+    IVD_CONTROL_API_COMMAND_TYPE_T              e_sub_cmd;
+
+    /**
+     * vid_dec_mode
+     */
+    IVD_VIDEO_DECODE_MODE_T                     e_vid_dec_mode;
+
+    /**
+     * disp_wd
+     */
+    UWORD32                                     u4_disp_wd;
+
+    /**
+     * frm_skip_mode
+     */
+    IVD_FRAME_SKIP_MODE_T                       e_frm_skip_mode;
+
+    /**
+     * frm_out_mode
+     */
+    IVD_DISPLAY_FRAME_OUT_MODE_T                e_frm_out_mode;
+}ivd_ctl_set_config_ip_t;
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * u4_error_code
+     */
+    UWORD32                                     u4_error_code;
+}ivd_ctl_set_config_op_t;
+
+/*****************************************************************************/
+/*   Video control:Get Buf Info                                              */
+/*****************************************************************************/
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd         = IVD_CMD_VIDEO_CTL                 */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd=IVD_CMD_CTL_GETBUFINFO          */
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     *  e_cmd
+     */
+    IVD_API_COMMAND_TYPE_T                      e_cmd;
+
+    /**
+     * sub_cmd
+     */
+    IVD_CONTROL_API_COMMAND_TYPE_T              e_sub_cmd;
+}ivd_ctl_getbufinfo_ip_t;
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * error code
+     */
+    UWORD32                                     u4_error_code;
+
+    /**
+     * no of display buffer sets required by codec
+     */
+    UWORD32                                     u4_num_disp_bufs;
+
+    /**
+     * no of input buffers required for codec
+     */
+    UWORD32                                     u4_min_num_in_bufs;
+
+    /**
+     * no of output buffers required for codec
+     */
+    UWORD32                                     u4_min_num_out_bufs;
+
+    /**
+     * sizes of each input buffer required
+     */
+    UWORD32                                     u4_min_in_buf_size[IVD_VIDDEC_MAX_IO_BUFFERS];
+
+    /**
+     * sizes of each output buffer required
+     */
+    UWORD32                                     u4_min_out_buf_size[IVD_VIDDEC_MAX_IO_BUFFERS];
+}ivd_ctl_getbufinfo_op_t;
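+
+/* The minimum counts and sizes reported here drive the application's buffer
+ * allocation; a sketch of building an ivd_out_bufdesc_t from them, where
+ * s_bufinfo_op is the ivd_ctl_getbufinfo_op_t result of the control call and
+ * plain malloc() is used for brevity:
+ *
+ *     ivd_out_bufdesc_t s_out = {0};
+ *     s_out.u4_num_bufs = s_bufinfo_op.u4_min_num_out_bufs;
+ *     for(i = 0; i < s_out.u4_num_bufs; i++)
+ *     {
+ *         s_out.u4_min_out_buf_size[i] = s_bufinfo_op.u4_min_out_buf_size[i];
+ *         s_out.pu1_bufs[i]            = malloc(s_bufinfo_op.u4_min_out_buf_size[i]);
+ *     }
+ */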
+
+
+/*****************************************************************************/
+/*   Video control:Getstatus Call                                            */
+/*****************************************************************************/
+
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd        = IVD_CMD_VIDEO_CTL                  */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd=IVD_CMD_CTL_GETPARAMS           */
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * cmd
+     */
+    IVD_API_COMMAND_TYPE_T                      e_cmd;
+
+    /**
+     * sub_cmd
+     */
+    IVD_CONTROL_API_COMMAND_TYPE_T              e_sub_cmd;
+}ivd_ctl_getstatus_ip_t;
+
+
+typedef struct {
+
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                  u4_size;
+
+    /**
+      * error code
+      */
+    UWORD32                  u4_error_code;
+
+    /**
+     * no of display buffer sets required by codec
+     */
+    UWORD32                  u4_num_disp_bufs;
+
+    /**
+     * u4_pic_ht
+     */
+    UWORD32                  u4_pic_ht;
+
+    /**
+     * u4_pic_wd
+     */
+    UWORD32                  u4_pic_wd;
+
+    /**
+     * frame_rate
+     */
+    UWORD32                  u4_frame_rate;
+
+    /**
+     * u4_bit_rate
+     */
+    UWORD32                  u4_bit_rate;
+
+    /**
+     * content_type
+     */
+    IV_CONTENT_TYPE_T        e_content_type;
+
+    /**
+     * output_chroma_format
+     */
+    IV_COLOR_FORMAT_T        e_output_chroma_format;
+
+    /**
+     * no of input buffers required for codec
+     */
+    UWORD32                  u4_min_num_in_bufs;
+
+    /**
+     * no of output buffers required for codec
+     */
+    UWORD32                  u4_min_num_out_bufs;
+
+    /**
+     * sizes of each input buffer required
+     */
+    UWORD32                  u4_min_in_buf_size[IVD_VIDDEC_MAX_IO_BUFFERS];
+
+    /**
+     * sizes of each output buffer required
+     */
+    UWORD32                  u4_min_out_buf_size[IVD_VIDDEC_MAX_IO_BUFFERS];
+}ivd_ctl_getstatus_op_t;
+
+
+/*****************************************************************************/
+/*   Video control:Get Version Info                                          */
+/*****************************************************************************/
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd        = IVD_CMD_VIDEO_CTL                  */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd=IVD_CMD_CTL_GETVERSION          */
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * cmd
+     */
+    IVD_API_COMMAND_TYPE_T                      e_cmd;
+
+    /**
+     * sub_cmd
+     */
+    IVD_CONTROL_API_COMMAND_TYPE_T              e_sub_cmd;
+
+    /**
+     * pv_version_buffer
+     */
+    void                                        *pv_version_buffer;
+
+    /**
+     * version_buffer_size
+     */
+    UWORD32                                     u4_version_buffer_size;
+}ivd_ctl_getversioninfo_ip_t;
+
+
+typedef struct {
+    /**
+     * u4_size of the structure
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * error code
+     */
+    UWORD32                                     u4_error_code;
+}ivd_ctl_getversioninfo_op_t;
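+
+/* Version-query sketch; the application owns the string buffer, and the
+ * error code list above suggests the call fails with
+ * IVD_GET_VERSION_DATABUFFER_SZ_INSUFFICIENT when it is too small:
+ *
+ *     UWORD8 au1_version[512];
+ *     ivd_ctl_getversioninfo_ip_t s_ver_ip = {0};
+ *     ivd_ctl_getversioninfo_op_t s_ver_op = {0};
+ *     s_ver_ip.u4_size                = sizeof(s_ver_ip);
+ *     s_ver_op.u4_size                = sizeof(s_ver_op);
+ *     s_ver_ip.e_cmd                  = IVD_CMD_VIDEO_CTL;
+ *     s_ver_ip.e_sub_cmd              = IVD_CMD_CTL_GETVERSION;
+ *     s_ver_ip.pv_version_buffer      = au1_version;
+ *     s_ver_ip.u4_version_buffer_size = sizeof(au1_version);
+ *     api_function(ps_handle, &s_ver_ip, &s_ver_op);
+ */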
+
+#endif /* _IVD_H */
+
diff --git a/common/mips/ihevc_func_selector.h b/common/mips/ihevc_func_selector.h
new file mode 100644
index 0000000..8188178
--- /dev/null
+++ b/common/mips/ihevc_func_selector.h
@@ -0,0 +1,227 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_func_selector.h
+*
+* @brief
+*  For each function, decide whether to use the C implementation, Neon
+* intrinsics, Cortex A8 intrinsics, Neon assembly or Cortex A8 assembly
+*
+* @author
+*  Harish
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef __IHEVC_FUNC_SELECTOR_H__
+#define __IHEVC_FUNC_SELECTOR_H__
+
+#include "ihevc_func_types.h"
+
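+/* Each macro below selects the implementation flavour for one function; the
+ * candidate values (C and the architecture-specific variants) come from
+ * ihevc_func_types.h. One plausible way a build consumes such a selector is
+ * compile-time dispatch of the following shape (a sketch only; the suffixed
+ * names are illustrative, not necessarily how this codebase wires it up):
+ *
+ *     #if (DEBLK_CHROMA_HORZ == C)
+ *     #define ihevc_deblk_chroma_horz ihevc_deblk_chroma_horz_c
+ *     #elif (DEBLK_CHROMA_HORZ == A9Q)
+ *     #define ihevc_deblk_chroma_horz ihevc_deblk_chroma_horz_a9q
+ *     #endif
+ */
+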
+#define    INTER_PRED_LUMA_COPY                     C
+#define    INTER_PRED_LUMA_HORZ                     C
+#define    INTER_PRED_LUMA_VERT                     C
+#define    INTER_PRED_LUMA_COPY_W16OUT              C
+#define    INTER_PRED_LUMA_HORZ_W16OUT              C
+
+#define    INTER_PRED_LUMA_VERT_W16OUT              C
+#define    INTER_PRED_LUMA_VERT_W16INP              C
+#define    INTER_PRED_LUMA_VERT_W16INP_W16OUT       C
+
+#define    INTER_PRED_CHROMA_COPY                   C
+#define    INTER_PRED_CHROMA_HORZ                   C
+#define    INTER_PRED_CHROMA_VERT                   C
+#define    INTER_PRED_CHROMA_COPY_W16OUT            C
+#define    INTER_PRED_CHROMA_HORZ_W16OUT            C
+#define    INTER_PRED_CHROMA_VERT_W16OUT            C
+#define    INTER_PRED_CHROMA_VERT_W16INP            C
+#define    INTER_PRED_CHROMA_VERT_W16INP_W16OUT     C
+
+#define    WEIGHTED_PRED_UNI                        C
+#define    WEIGHTED_PRED_BI                         C
+#define    WEIGHTED_PRED_BI_DEFAULT                 C
+#define    WEIGHTED_PRED_CHROMA_UNI                 C
+#define    WEIGHTED_PRED_CHROMA_BI                  C
+#define    WEIGHTED_PRED_CHROMA_BI_DEFAULT          C
+
+#define    PAD_VERT                                 C
+#define    PAD_HORZ                                 C
+#define    PAD_LEFT_LUMA                            C
+#define    PAD_LEFT_CHROMA                          C
+#define    PAD_RIGHT_LUMA                           C
+#define    PAD_RIGHT_CHROMA                         C
+
+#define     DEBLOCKING_ASM                          C
+#define     DEBLK_LUMA_HORZ                         C
+#define     DEBLK_LUMA_VERT                         C
+#define     DEBLK_CHROMA_HORZ                       C
+#define     DEBLK_CHROMA_VERT                       C
+
+#define     SAO_BAND_OFFSET_LUMA                    C
+#define     SAO_BAND_OFFSET_CHROMA                  C
+#define     SAO_EDGE_OFFSET_CLASS0_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS1_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS2_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS3_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS0_CHROMA           C
+#define     SAO_EDGE_OFFSET_CLASS1_CHROMA           C
+#define     SAO_EDGE_OFFSET_CLASS2_CHROMA           C
+#define     SAO_EDGE_OFFSET_CLASS3_CHROMA           C
+
+#define     INTRA_PRED_LUMA_REF_SUBSTITUTION        C
+#define     INTRA_PRED_REF_FILTERING                C
+#define     INTRA_PRED_LUMA_PLANAR                  C
+#define     INTRA_PRED_LUMA_DC                      C
+#define     INTRA_PRED_LUMA_HORZ                    C
+#define     INTRA_PRED_LUMA_VER                     C
+#define     INTRA_PRED_LUMA_MODE_2                  C
+#define     INTRA_PRED_LUMA_MODE_18_34              C
+#define     INTRA_PRED_LUMA_MODE_3_T0_9             C
+#define     INTRA_PRED_LUMA_MODE_11_T0_17           C
+#define     INTRA_PRED_LUMA_MODE_19_T0_25           C
+#define     INTRA_PRED_LUMA_MODE_27_T0_33           C
+
+#define     INTRA_PRED_CHROMA_PLANAR                C
+#define     INTRA_PRED_CHROMA_DC                    C
+#define     INTRA_PRED_CHROMA_HOR                   C
+#define     INTRA_PRED_CHROMA_VER                   C
+#define     INTRA_PRED_CHROMA_MODE_2                C
+#define     INTRA_PRED_CHROMA_18_34                 C
+#define     INTRA_PRED_CHROMA_3_T0_9                C
+#define     INTRA_PRED_CHROMA_11_T0_17              C
+#define     INTRA_PRED_CHROMA_19_T0_25              C
+#define     INTRA_PRED_CHROMA_27_T0_33              C
+#define     INTRA_PRED_CHROMA_REF_SUBSTITUTION      C
+
+/* Forward transform functions */
+/* Luma */
+#define RESI_TRANS_QUANT_4X4_TTYPE1                 C
+#define RESI_TRANS_QUANT_4X4                        C
+#define RESI_TRANS_QUANT_8X8                        C
+#define RESI_TRANS_QUANT_16X16                      C
+#define RESI_TRANS_QUANT_32X32                      C
+
+#define RESI_QUANT_4X4_TTYPE1                       C
+#define RESI_QUANT_4X4                              C
+#define RESI_QUANT_8X8                              C
+#define RESI_QUANT_16X16                            C
+#define RESI_QUANT_32X32                            C
+
+#define RESI_TRANS_4X4_TTYPE1                       C
+#define RESI_TRANS_4X4                              C
+#define RESI_TRANS_8X8                              C
+#define RESI_TRANS_16X16                            C
+#define RESI_TRANS_32X32                            C
+
+#define RESI_4X4_TTYPE1                             C
+#define RESI_4X4                                    C
+#define RESI_8X8                                    C
+#define RESI_16X16                                  C
+#define RESI_32X32                                  C
+
+#define TRANS_4X4_TTYPE1                            C
+#define TRANS_4X4                                   C
+#define TRANS_8X8                                   C
+#define TRANS_16X16                                 C
+#define TRANS_32X32                                 C
+
+#define QUANT_4X4_TTYPE1                            C
+#define QUANT_4X4                                   C
+#define QUANT_8X8                                   C
+#define QUANT_16X16                                 C
+#define QUANT_32X32                                 C
+
+/* Chroma interleaved*/
+#define CHROMA_RESI_TRANS_QUANT_4X4                        C
+#define CHROMA_RESI_TRANS_QUANT_8X8                        C
+#define CHROMA_RESI_TRANS_QUANT_16X16                      C
+
+#define CHROMA_RESI_QUANT_4X4                              C
+#define CHROMA_RESI_QUANT_8X8                              C
+#define CHROMA_RESI_QUANT_16X16                            C
+
+#define CHROMA_RESI_TRANS_4X4                              C
+#define CHROMA_RESI_TRANS_8X8                              C
+#define CHROMA_RESI_TRANS_16X16                            C
+
+#define CHROMA_RESI_4X4                                    C
+#define CHROMA_RESI_8X8                                    C
+#define CHROMA_RESI_16X16                                  C
+
+/* Inverse transform functions */
+/* Luma */
+#define IQUANT_ITRANS_RECON_4X4_TTYPE1              C
+#define IQUANT_ITRANS_RECON_4X4                     C
+#define IQUANT_ITRANS_RECON_8X8                     C
+#define IQUANT_ITRANS_RECON_16X16                   C
+#define IQUANT_ITRANS_RECON_32X32                   C
+
+#define IQUANT_RECON_4X4_TTYPE1                     C
+#define IQUANT_RECON_4X4                            C
+#define IQUANT_RECON_8X8                            C
+#define IQUANT_RECON_16X16                          C
+#define IQUANT_RECON_32X32                          C
+
+#define ITRANS_RECON_4X4_TTYPE1                     C
+#define ITRANS_RECON_4X4                            C
+#define ITRANS_RECON_8X8                            C
+#define ITRANS_RECON_16X16                          C
+#define ITRANS_RECON_32X32                          C
+
+#define RECON_4X4_TTYPE1                            C
+#define RECON_4X4                                   C
+#define RECON_8X8                                   C
+#define RECON_16X16                                 C
+#define RECON_32X32                                 C
+
+#define ITRANS_4X4_TTYPE1                           C
+#define ITRANS_4X4                                  C
+#define ITRANS_8X8                                  C
+#define ITRANS_16X16                                C
+#define ITRANS_32X32                                C
+
+/* Chroma interleaved */
+#define CHROMA_IQUANT_ITRANS_RECON_4X4                     C
+#define CHROMA_IQUANT_ITRANS_RECON_8X8                     C
+#define CHROMA_IQUANT_ITRANS_RECON_16X16                   C
+
+#define CHROMA_IQUANT_RECON_4X4                            C
+#define CHROMA_IQUANT_RECON_8X8                            C
+#define CHROMA_IQUANT_RECON_16X16                          C
+
+#define CHROMA_ITRANS_RECON_4X4                            C
+#define CHROMA_ITRANS_RECON_8X8                            C
+#define CHROMA_ITRANS_RECON_16X16                          C
+
+#define CHROMA_RECON_4X4                                   C
+#define CHROMA_RECON_8X8                                   C
+#define CHROMA_RECON_16X16                                 C
+
+#define IHEVC_MEMCPY                                C
+#define IHEVC_MEMSET                                C
+#define IHEVC_MEMSET_16BIT                          C
+#define IHEVC_MEMCPY_MUL_8                          C
+#define IHEVC_MEMSET_MUL_8                          C
+#define IHEVC_MEMSET_16BIT_MUL_8                    C
+
+#endif  /* __IHEVC_FUNC_SELECTOR_H__ */
diff --git a/common/mips/ihevc_platform_macros.h b/common/mips/ihevc_platform_macros.h
new file mode 100644
index 0000000..4973239
--- /dev/null
+++ b/common/mips/ihevc_platform_macros.h
@@ -0,0 +1,88 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_platform_macros.h
+*
+* @brief
+*  Platform specific Macro definitions used in the codec
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_PLATFORM_MACROS_H_
+#define _IHEVC_PLATFORM_MACROS_H_
+
+
+#define CLIP_U8(x) CLIP3((x), 0,     255)
+#define CLIP_S8(x) CLIP3((x), -128,  127)
+
+/* No trailing semicolons: these macros must expand cleanly inside expressions */
+#define CLIP_U10(x) CLIP3((x), 0,     1023)
+#define CLIP_S10(x) CLIP3((x), -512,  511)
+
+#define CLIP_U12(x) CLIP3((x), 0,     4095)
+#define CLIP_S12(x) CLIP3((x), -2048,  2047)
+
+#define CLIP_U16(x) CLIP3((x), 0,        65535)
+#define CLIP_S16(x) CLIP3((x), -32768,   32767)
+
+/* Byte-swap a 32-bit word; fully parenthesized so it is safe in expressions */
+#define ITT_BIG_ENDIAN(x)   ((((UWORD32)(x)) << 24)    |   \
+                            (((x) & 0x0000ff00) << 8)  |   \
+                            (((x) & 0x00ff0000) >> 8)  |   \
+                            (((UWORD32)(x)) >> 24))
+
+#define SHL(x,y) ((x) << (y))
+#define SHR(x,y) ((x) >> (y))
+
+#define SHR_NEG(val,shift)  ((shift>0)?(val>>shift):(val<<(-shift)))
+#define SHL_NEG(val,shift)  ((shift<0)?(val>>(-shift)):(val<<shift))
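+
+/* The signed-shift helpers reverse direction for negative counts, e.g.
+ * SHR_NEG(x, 3) == (x >> 3) while SHR_NEG(x, -3) == (x << 3), so one
+ * expression can handle shift amounts of either sign */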
+
+
+static inline UWORD32 CLZ(UWORD32 u4_word)
+{
+    /* The ARM "clz" inline assembly used in the ARM headers will not      */
+    /* assemble for MIPS; use the GCC builtin instead. __builtin_clz() is  */
+    /* undefined for 0, so that case is handled explicitly (ARM clz of 0   */
+    /* yields 32)                                                          */
+    if(0 == u4_word)
+        return 32;
+    return (UWORD32)__builtin_clz(u4_word);
+}
+
+/* Count trailing zeros; note that an input of 0 returns 31 by convention */
+static inline UWORD32 CTZ(UWORD32 u4_word)
+{
+    if(0 == u4_word)
+        return 31;
+    else
+    {
+        unsigned int index;
+        index = __builtin_ctz(u4_word);
+        return (UWORD32)index;
+    }
+}
+
+#define NOP(nop_cnt)    {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);}
+
+#define INLINE
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* _IHEVC_PLATFORM_MACROS_H_ */
diff --git a/common/x86/ihevc_16x16_itrans_recon_sse42_intr.c b/common/x86/ihevc_16x16_itrans_recon_sse42_intr.c
new file mode 100644
index 0000000..afdca10
--- /dev/null
+++ b/common/x86/ihevc_16x16_itrans_recon_sse42_intr.c
@@ -0,0 +1,3337 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_16x16_itrans_recon_sse42_intr.c
+ *
+ * @brief
+ *  Contains function definitions for inverse
+ * transform and reconstruction for 16x16.
+ *
+ * @author
+ *  100470
+ *  100592 (edited by)
+ *
+ * @par List of Functions:
+ *  - ihevc_itrans_recon_16x16_sse42()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization, inverse  transform and
+ * reconstruction for 16x16 input block
+ *
+ * @par Description:
+ *  Performs inverse quantization, inverse transform and adds the
+ * prediction data, clipping the output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 16x16 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 16x16 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 16x16 block
+ *
+ * @param[out] pu1_dst
+ *  Output 16x16 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_16x16_sse42(WORD16 *pi2_src,
+                                    WORD16 *pi2_tmp,
+                                    UWORD8 *pu1_pred,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 src_strd,
+                                    WORD32 pred_strd,
+                                    WORD32 dst_strd,
+                                    WORD32 zero_cols,
+                                    WORD32 zero_rows)
+{
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_20;
+    __m128i m_temp_reg_21;
+    __m128i m_temp_reg_22;
+    __m128i m_temp_reg_23;
+    __m128i m_temp_reg_24;
+    __m128i m_temp_reg_25;
+    __m128i m_temp_reg_26;
+    __m128i m_temp_reg_27;
+    __m128i m_temp_reg_30;
+    __m128i m_temp_reg_31;
+    __m128i m_temp_reg_32;
+    __m128i m_temp_reg_33;
+    __m128i m_temp_reg_34;
+    __m128i m_temp_reg_35;
+    __m128i m_temp_reg_36;
+    __m128i m_temp_reg_37;
+    __m128i m_temp_reg_40;
+    __m128i m_temp_reg_41;
+    __m128i m_temp_reg_42;
+    __m128i m_temp_reg_43;
+    __m128i m_temp_reg_44;
+    __m128i m_temp_reg_45;
+    __m128i m_temp_reg_46;
+    __m128i m_temp_reg_47;
+
+    __m128i m_temp_reg_70;
+    __m128i m_temp_reg_71;
+    __m128i m_temp_reg_72;
+    __m128i m_temp_reg_73;
+    __m128i m_temp_reg_74;
+    __m128i m_temp_reg_75;
+    __m128i m_temp_reg_76;
+    __m128i m_temp_reg_77;
+    __m128i m_rdng_factor;
+    __m128i m_count;
+    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+    __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
+
+    WORD32 i;
+
+    WORD32  zero_last8_cols_stg1;
+    WORD32  zero_last8_rows_stg1;
+    WORD32  zero_last12_rows_stg1;
+    WORD32  zero_last12_rows_stg2;
+    WORD32  zero_last8_rows_stg2;
+
+    WORD32  loop = 0;
+
+    WORD32 i4_shift = IT_SHIFT_STAGE_1;
+    WORD32 trans_size = TRANS_SIZE_16;
+
+    /* Flags computed from zero_cols/zero_rows: a set bit marks a column or */
+    /* row of the input that is entirely zero, letting the corresponding    */
+    /* part of the transform be skipped                                     */
+
+    /* Last 8 cols of the 16x16 block are skipped based on the flag below */
+
+    zero_last8_cols_stg1  = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0;
+    zero_last8_rows_stg1  = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0;
+    zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0;
+
+    zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0;
+    zero_last8_rows_stg2 = zero_last8_cols_stg1;
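+
+    /* Illustration: zero_cols = 0xFFFE means only column 0 of pi2_src has  */
+    /* nonzero coefficients, so both the "last 8" and "last 12" stage-2     */
+    /* shortcuts above apply                                                */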
+
+    if(zero_last8_cols_stg1)
+    {
+        loop = 1;
+    }
+    else
+    {
+        loop = 2;
+    }
+
+    /* i = 0 => lower 8 samples */
+    /* i = 1 => higher 8 samples */
+    for(i = 0; i < loop; i++)
+    {
+        {
+            WORD32 sample_half_index = i << 3;
+            WORD16 *pi2_tmp_src = pi2_src + sample_half_index;
+            WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+
+            m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+
+
+
+
+            /* If last 12 rows are zero : Rishab */
+            if(zero_last12_rows_stg1)
+            {
+
+                /* eee */
+                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+                {
+                    /* Loading coeff and src for use in next block */
+
+                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign
+                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+                    m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+
+                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+
+                    m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+
+                    m_temp_reg_26 = m_temp_reg_24;
+                    m_temp_reg_27 = m_temp_reg_25;
+                }
+
+                /* eo */
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+
+                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+
+                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
+
+                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
+
+                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+            }
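+
+            /* Illustrative note (an assumption drawn from the code, not from the
+             * original comments): each branch of this if/else chain evaluates the
+             * same even-part butterfly of the 16-point inverse transform per
+             * column, dropping only the terms whose input rows are known to be
+             * zero. A scalar sketch, assuming g_ai2_ihevc_trans_16_even packs the
+             * coefficient pairs shown in the load comments:
+             *   eee[0] = 64*s[0] + 64*s[8];    eee[1] = 64*s[0] - 64*s[8];
+             *   eeo[0] = 83*s[4] + 36*s[12];   eeo[1] = 36*s[4] - 83*s[12];
+             *   ee[0]  = eee[0] + eeo[0];      ee[3]  = eee[0] - eeo[0];
+             *   ee[1]  = eee[1] + eeo[1];      ee[2]  = eee[1] - eeo[1];
+             *   eo[0]  = 89*s[2] + 75*s[6] + 50*s[10] + 18*s[14];
+             *   eo[1]  = 75*s[2] - 18*s[6] - 89*s[10] - 50*s[14];
+             *   eo[2]  = 50*s[2] - 89*s[6] + 18*s[10] + 75*s[14];
+             *   eo[3]  = 18*s[2] - 50*s[6] + 75*s[10] - 89*s[14];
+             *   e[k]   = ee[k] + eo[k];        e[7-k] = ee[k] - eo[k];   k = 0..3
+             */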
+            /* If last 8 rows are zero : Rishab */
+            else if(zero_last8_rows_stg1)
+            {
+                /* eeo */
+                /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+                /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+                {
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSBs
+                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSBs
+
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+                }
+
+                /* eee */
+                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+                {
+                    /* Loading coeff and src for use in next block */
+                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //rows 8-15 are zero, so m_temp_reg_77 holds 0: (0 > row 0) gives the sign mask of row 0
+                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+                    m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); //eee[0] = 64 * src[0], since row 8 is zero
+
+                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+
+                    m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+                    m_temp_reg_26 = m_temp_reg_24;
+                    m_temp_reg_27 = m_temp_reg_25;
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+                }
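+
+                /* With rows 8-15 zero, s[8] drops out and eee[0] = eee[1]
+                 * = 64 * s[0]; the block above builds that product with a
+                 * sign-extend plus left shift by 6 and simply copies it into
+                 * m_temp_reg_26/27. */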
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+            }
+            /* If all the rows are non-zero : Rishab */
+            else
+            {
+                /* eeo */
+                /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+                /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+
+                {
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSBs
+                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSBs
+
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+                }
+
+                /* eee */
+                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+                {
+                    /* Loading coeff and src for use in next block */
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
+
+                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSBs
+                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSBs
+
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
+
+                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+
+                }
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                    m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+
+                }
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+            }
+        }
+
+        {
+            WORD32 sample_half_index = i << 3;
+            WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd;
+
+            m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+        }
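+
+        /* m_temp_reg_70..77 now hold the odd input rows 1,3,...,15 of the
+         * current 8-column half (the loads start at pi2_src + src_strd and
+         * step by 2 * src_strd); the even rows were consumed by the even-part
+         * butterfly above. */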
+
+        /* o & stage 1 out */
+        {
+            WORD32 j;
+            WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; /* even-part results for this column half */
+            WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; /* stage-1 outputs overwrite the same buffer */
+            WORD32 out_stride = trans_size << 1;
+            WORD32 in_stride = trans_size << 1;
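+
+            /* Scratch layout, from the even-part stores above: each 16-word
+             * pi2_tmp row holds four 32-bit e[k] values in its low half and
+             * four e[7-k] values in its high half. The pointers therefore walk
+             * forward through e[0]..e[3], step into the mirrored halves, and
+             * walk back through e[4]..e[7]. Each o-block reads 32-bit e values
+             * and overwrites the same slot with the packed 16-bit stage-1
+             * outputs, which is why pi2_src_scratch and pi2_dst_scratch track
+             * the same buffer. */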
+
+            if(zero_last12_rows_stg1)
+            {
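+                /* Only rows 1 and 3 are non-zero here, so each odd term o[k]
+                 * needs a single madd of one coefficient pair against
+                 * m_temp_reg_10. */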
+                for(j = 0; j < 2; j++)
+                {
+                    if(j) //H8B = higher 8 bytes, L8B = lower 8 bytes
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                    }
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); //rounding offset for the stage-1 down shift
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); //broadcast the offset to all four lanes
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
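+
+                    /* The combine repeated for o0..o7 is, per column (a sketch;
+                     * sat16 denotes the saturation done by _mm_packs_epi32):
+                     *   tmp[k]    = sat16((e[k] + o[k] + (1 << (i4_shift - 1))) >> i4_shift)
+                     *   tmp[15-k] = sat16((e[k] - o[k] + (1 << (i4_shift - 1))) >> i4_shift)
+                     */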
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+                }
+            }
+            else if(zero_last8_rows_stg1)
+            {
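+                /* Only odd rows 1,3,5,7 contribute here, so each o[k] is built
+                 * from two madds (m_temp_reg_10 with rows 1,3 and m_temp_reg_11
+                 * with rows 5,7) instead of four. */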
+                for(j = 0; j < 2; j++)
+                {
+                    if(j)
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+                    }
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+
+                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+                }
+
+            }
+            else
+            {
+
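+                /* All sixteen rows contribute: each o[k] accumulates four madds
+                 * across the odd rows. Illustrative scalar sketch for k = 0,
+                 * assuming g_ai2_ihevc_trans_16_odd packs the pairs shown in
+                 * the load comments:
+                 *   o[0] = 90*s[1] + 87*s[3]  + 80*s[5]  + 70*s[7]
+                 *        + 57*s[9] + 43*s[11] + 25*s[13] +  9*s[15]
+                 */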
+                for(j = 0; j < 2; j++)
+                {
+                    if(j) //H8B = higher 8 bytes, L8B = lower 8 bytes
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+                        m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
+                        m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+                        m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
+                        m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
+                    }
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
+
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
+                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); //rounding offset for the stage-1 down shift
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); //broadcast the offset
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); //to all four lanes
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
+                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
+                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
+                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
+
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
+                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
+                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
+                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
+
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+                }
+            }
+        }
+
+        /* Transpose */
+        {
+            WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+            WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+            WORD32 out_stride = (trans_size << 1);
+            WORD32 in_stride = (trans_size << 1);
+            WORD32 j;
+
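+            /* 8x8 transpose of 16-bit values via three rounds of interleaves
+             * (unpack at 16-, 32- and 64-bit granularity); each j iteration
+             * transposes one 8x8 half of the block, and the zig-zag pointer
+             * updates walk the interleaved layout produced by the odd stage. */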
+            for(j = 0; j < 2; j++)
+            {
+                m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
+                pi2_src_scratch += in_stride;
+                m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
+                pi2_src_scratch += in_stride;
+                m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
+                pi2_src_scratch += in_stride;
+                m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
+                pi2_src_scratch += 8;
+                m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
+                pi2_src_scratch += 8;
+
+                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
+                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
+
+                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
+                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
+
+                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
+                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
+
+                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
+                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
+
+
+                m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
+                m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
+
+                m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
+                m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
+
+                m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
+                m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
+
+                m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
+                m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
+
+
+                m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
+                m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
+
+                m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
+                m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
+
+                m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
+                m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
+
+                m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
+                m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
+
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+                pi2_dst_scratch += out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44);
+                pi2_dst_scratch += out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41);
+                pi2_dst_scratch += out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45);
+                pi2_dst_scratch += 8;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42);
+                pi2_dst_scratch -= out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46);
+                pi2_dst_scratch -= out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43);
+                pi2_dst_scratch -= out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47);
+                pi2_dst_scratch += 8;
+            }
+        }
+    }
+
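+    /* When the last 8 columns of the stage 1 input are zero, the right half of
+     * the intermediate buffer is zero as well, so it is simply cleared. */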
+    if(zero_last8_cols_stg1)
+    {
+        WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size);
+        WORD32 out_stride = (trans_size << 1);
+        WORD32 j;
+
+        m_temp_reg_40 = _mm_setzero_si128();
+        for(j = 0; j < 2; j++)
+        {
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch += out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch += out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch += out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch += 8;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch -= out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch -= out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch -= out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch += 8;
+        }
+    }
+
+    /* Stage 2 */
+    for(i = 0; i < 2; i++)
+    {
+        WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : pi2_tmp;
+        WORD32 stride = (trans_size);
+        WORD16 temp_array[256];
+
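+        /* Second-pass normalization shift of the inverse transform; per the
+         * HEVC spec this is 20 - bit_depth, i.e. 12 for 8-bit content. */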
+        i4_shift = IT_SHIFT_STAGE_2;
+
+        if(zero_last12_rows_stg2)
+        {
+            /* eeo: rows 4 and 12 are zero in this path, so eeo[0] and eeo[1] are zero */
+            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+            {
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+
+                if(!i)
+                {
+                    pi2_src_temp += (stride * 6 + 8);
+                }
+                else
+                {
+                    pi2_src_temp += (stride * 2 + 8);
+                }
+
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+                m_temp_reg_20 = _mm_setzero_si128();
+                m_temp_reg_22 = _mm_setzero_si128();
+
+                m_temp_reg_21 = _mm_setzero_si128();
+                m_temp_reg_23 = _mm_setzero_si128();
+            }
+
+            /* eee */
+            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+            {
+                /* Loading coeff and src for use in next block */
+
+                /* sign mask of row 0 (m_temp_reg_20 holds zero here) */
+                m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0, sign-extended to 32 bit
+
+                m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); //eee[0] = 64 * src[0]
+
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+                m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+
+                m_temp_reg_26 = m_temp_reg_24;
+                m_temp_reg_27 = m_temp_reg_25;
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20);
+            }
+
+            /* eo */
+            {
+                WORD16 *pi2_scratch = temp_array;
+                WORD32 out_stride = 8;
+
+
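+                /* Rows 6, 10 and 14 are zero here, so each eo[k] is a single
+                 * madd of row 2 interleaved with zero; results go to temp_array
+                 * in 16-byte groups of four 32-bit partial sums (the pu1_dst[n]
+                 * comments refer to the n-th group). */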
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* eeo is zero here, so ee[0] = ee[3] = eee[0] (m_temp_reg_24-25) */
+
+                    /* e[0][0-3] stored in pu1_dst[0] */
+                    /* e[7][0-3] stored in pu1_dst[1] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    /* eeo is zero here, so ee[0] = ee[3] = eee[0] (m_temp_reg_24-25) */
+
+                    /* e[0][4-7] stored in pu1_dst[2] */
+                    /* e[7][4-7] stored in pu1_dst[3] */
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* eeo is zero here, so ee[1] = ee[2] = eee[1] (m_temp_reg_26-27) */
+
+                    /* e[1][0-3] stored in pu1_dst[4] */
+                    /* e[6][0-3] stored in pu1_dst[5] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    /* eeo is zero here, so ee[1] = ee[2] = eee[1] (m_temp_reg_26-27) */
+
+                    /* e[1][4-7] stored in pu1_dst[6]*/
+                    /* e[6][4-7] stored in pu1_dst[7] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* e[2][0-3] stored in pu1_dst[8]*/
+                    /* e[5][0-3] stored in pu1_dst[9] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    /* e[2][4-7] stored in pu1_dst[10]*/
+                    /* e[5][4-7] stored in pu1_dst[11] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* e[3][0-3] stored in pu1_dst[12]*/
+                    /* e[4][0-3] stored in pu1_dst[13] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    /* e[3][4-7] stored in pu1_dst[14]*/
+                    /* e[4][4-7] stored in pu1_dst[15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+            }
+        }
+        else if(zero_last8_rows_stg2)
+        {
+            /* eeo */
+            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+            {
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+                pi2_src_temp += (stride);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
+                if(!i)
+                {
+                    pi2_src_temp += (stride * 6 + 8);
+                }
+                else
+                {
+                    pi2_src_temp += (stride * 2 + 8);
+                }
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
+                pi2_src_temp -= (stride);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+
+                m_temp_reg_76 = _mm_setzero_si128();
+
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+            }
+
+            /* eee */
+            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+            {
+                /* Loading coeff and src for use in next block */
+
+
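+                /* Row 8 is zero here, so eee[0] = eee[1] = 64 * src[0]; the
+                 * cmpgt against zero (m_temp_reg_76) builds the sign mask of
+                 * row 0, and the unpack + left shift below sign-extends to
+                 * 32 bit and scales by 64. */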
+                m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+                m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+                m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+
+                m_temp_reg_26 = m_temp_reg_24;
+                m_temp_reg_27 = m_temp_reg_25;
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+            }
+
+            /* eo */
+            {
+                WORD16 *pi2_scratch = temp_array;
+                WORD32 out_stride = 8;
+
+
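+                /* Rows 10 and 14 are zero here, so each eo[k] is a single madd
+                 * of row 2 interleaved with row 6; results go to temp_array in
+                 * 16-byte groups of four 32-bit partial sums. */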
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+                    /* e[0][0-3] stored in pu1_dst[0] */
+                    /* e[7][0-3] stored in pu1_dst[1] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+                    /* e[0][4-7] stored in pu1_dst[2] */
+                    /* e[7][4-7] stored in pu1_dst[3] */
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+                    /* e[1][0-3] stored in pu1_dst[4] */
+                    /* e[6][0-3] stored in pu1_dst[5] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+                    /* e[1][4-7] stored in pu1_dst[6]*/
+                    /* e[6][4-7] stored in pu1_dst[7] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* e[2][0-3] stored in pu1_dst[8]*/
+                    /* e[5][0-3] stored in pu1_dst[9] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    /* e[2][4-7] stored in pu1_dst[10]*/
+                    /* e[5][4-7] stored in pu1_dst[11] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* e[3][0-3] stored in pu1_dst[12]*/
+                    /* e[4][0-3] stored in pu1_dst[13] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    /* e[3][4-7] stored in pu1_dst[14]*/
+                    /* e[4][4-7] stored in pu1_dst[15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+            }
+        }
+        else
+        {
+            /* eeo */
+            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+            {
+
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+                pi2_src_temp += (stride);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
+                pi2_src_temp += (stride * 7);
+                m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8
+                pi2_src_temp += (stride);
+                m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12
+                if(!i)
+                {
+                    pi2_src_temp += (stride * 6 + 8);
+                }
+                else
+                {
+                    pi2_src_temp += (stride * 2 + 8);
+                }
+                m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14
+                pi2_src_temp -= (stride);
+                m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10
+                pi2_src_temp -= (stride * 7);
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
+                pi2_src_temp -= (stride);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
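+
+                /* eeo[0] = 83 * src[4] + 36 * src[12], eeo[1] = 36 * src[4] - 83 * src[12] */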
+
+
+            }
+
+            /* eee */
+            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+            {
+                /* Loading coeff and src for use in next block */
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
+
+                m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
+
+                m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+                m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
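+
+                /* eee[0] = 64 * (src[0] + src[8]), eee[1] = 64 * (src[0] - src[8]) */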
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+
+            }
+
+            /* eo */
+            {
+                WORD16 *pi2_scratch = temp_array;
+                WORD32 out_stride = 8;
+
+
+
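+                /* Full even path: eo[k] combines rows 2 and 6 (m_temp_reg_10/11)
+                 * with rows 10 and 14 (m_temp_reg_12/13); ee[k] = eee[k] +/- eeo[k],
+                 * then e[k] = ee[k] + eo[k] and e[7 - k] = ee[k] - eo[k]. */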
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                    m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+
+                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+
+                }
+
+                /* eo0[4-7] */
+                {
+
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+            }
+        }
+
+        if(zero_last12_rows_stg2)
+        {
+            /* o & stage 2 pre-transposed out */
+            {
+                WORD32 j;
+                WORD16 *pi2_src_scratch = temp_array;
+                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+                WORD32 out_stride = (trans_size);
+                WORD32 in_stride = (8) * 4;
+
+                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+
+                if(0 == i)
+                {
+                    pi2_src_temp -= (stride * 2 - 8);
+                }
+                else
+                {
+                    pi2_src_temp -= (stride * 6 - 8);
+                }
+
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
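+                /* Only rows 1 and 3 are non-zero here, so each o[k] is a single
+                 * madd of row 1 interleaved with row 3; o[k] is folded into the
+                 * even part e[k] read back from temp_array, and the (i)/(!i)
+                 * terms in the pointer updates interleave the two halves of the
+                 * 16-wide block in the pre-transposed output. */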
+                for(j = 0; j < 2; j++)
+                {
+                    if(j)
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                    }
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
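+                        /* broadcast the rounding constant (1 << (shift - 1)) to
+                         * all four lanes; m_count feeds _mm_sra_epi32 */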
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+
+
+                }
+            }
+        }
+        else if(zero_last8_rows_stg2)
+        {
+            /* o & stage 2 pre-transposed out */
+            {
+                WORD32 j;
+                WORD16 *pi2_src_scratch = temp_array;
+                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+                WORD32 out_stride = (trans_size);
+                WORD32 in_stride = (8) * 4;
+
+                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+                pi2_src_temp += (stride);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
+                if(0 == i)
+                {
+                    pi2_src_temp -= (stride * 2 - 8);
+                }
+                else
+                {
+                    pi2_src_temp -= (stride * 6 - 8);
+                }
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
+                pi2_src_temp -= (stride);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
+                for(j = 0; j < 2; j++)
+                {
+                    if(j)
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+                    }
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+
+                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+                }
+            }
+        }
+        else
+        {
+            /* o & stage 2 pre-transposed out */
+            {
+                WORD32 j;
+                WORD16 *pi2_src_scratch = temp_array;
+                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+                WORD32 out_stride = (trans_size);
+                WORD32 in_stride = (8) * 4;
+
+                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+                pi2_src_temp += (stride);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
+                pi2_src_temp += (stride * 7);
+                m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9
+                pi2_src_temp += (stride);
+                m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13
+                if(0 == i)
+                {
+                    pi2_src_temp -= (stride * 2 - 8);
+                }
+                else
+                {
+                    pi2_src_temp -= (stride * 6 - 8);
+                }
+                m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15
+                pi2_src_temp -= (stride);
+                m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11
+                pi2_src_temp -= (stride * 7);
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
+                pi2_src_temp -= (stride);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
+                for(j = 0; j < 2; j++)
+                {
+
+                    if(j) // H8B = higher 8 bytes, L8B = lower 8 bytes
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+                        m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
+                        m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+                        m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
+                        m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
+                    }
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
+
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
+                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
+                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
+                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
+                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
+
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
+                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
+                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
+                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
+
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+
+                }
+            }
+        }
+    }
+
+    /* Transpose */
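+    /*
+     * Editorial sketch of the idiom used below (assuming the standard SSE2
+     * transpose pattern): eight 8x1 result vectors are transposed by three
+     * rounds of unpacking at 16-, 32- and 64-bit granularity, after which
+     * each register holds one destination row. Each prediction row is then
+     * widened from 8 to 16 bit (_mm_unpacklo/hi_epi8 against a zero register),
+     * added, and packed back with _mm_packus_epi16, whose unsigned saturation
+     * performs the clip to [0, 255].
+     */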
+    {
+        WORD16 *pi2_src_scratch;
+        UWORD8 *pu1_pred_temp = pu1_pred;
+        WORD32 out_stride = dst_strd;
+        WORD32 in_stride = trans_size;
+        WORD32 j;
+        m_temp_reg_1 = _mm_setzero_si128();
+        for(i = 0; i < 2; i++)
+        {
+            pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp;
+
+            for(j = 0; j < 2; j++)
+            {
+                m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
+                pi2_src_scratch += in_stride;
+                m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
+                pi2_src_scratch += ((!i) * in_stride + 8);
+                m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
+                pi2_src_scratch += (in_stride);
+                m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
+                pi2_src_scratch += (i * in_stride + 8);
+                m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
+                pi2_src_scratch += in_stride;
+                m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
+                pi2_src_scratch += ((!i) * in_stride + 8);
+                m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
+                pi2_src_scratch += in_stride;
+                m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
+                pi2_src_scratch += (i * in_stride + 8);
+
+                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
+                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
+
+                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
+                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
+
+                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
+                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
+
+                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
+                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
+
+
+                m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
+                m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
+
+                m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
+                m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
+
+                m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
+                m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
+
+                m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
+                m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
+
+
+                m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
+                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+                m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
+                m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0);
+                m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12);
+
+                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+                pu1_dst += out_stride;
+                pu1_pred_temp += pred_strd;
+
+                m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
+                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+                m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
+                m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0);
+                m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12);
+
+                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45);
+                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+                pu1_dst += out_stride;
+                pu1_pred_temp += pred_strd;
+
+                m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
+                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+                m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
+                m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0);
+                m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12);
+
+                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46);
+                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+                pu1_dst += out_stride;
+                pu1_pred_temp += pred_strd;
+
+                m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
+                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+                m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
+                m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0);
+                m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12);
+
+                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47);
+                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+                pu1_dst += out_stride;
+                pu1_pred_temp += pred_strd;
+            }
+        }
+    }
+}
diff --git a/common/x86/ihevc_32x32_itrans_recon_sse42_intr.c b/common/x86/ihevc_32x32_itrans_recon_sse42_intr.c
new file mode 100644
index 0000000..ec8c5c1
--- /dev/null
+++ b/common/x86/ihevc_32x32_itrans_recon_sse42_intr.c
@@ -0,0 +1,6636 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_32x32_itrans_recon_sse42_intr.c
+ *
+ * @brief
+ *  Contains function definitions for inverse quantization, inverse
+ *  transform and reconstruction
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_itrans_recon_32x32_sse42()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization, inverse transform and
+ *  reconstruction for a 32x32 input block
+ *
+ * @par Description:
+ *  Performs inverse quantization and inverse transform, adds the
+ *  prediction data and clips the output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 32x32 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 32x32 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 32x32 block
+ *
+ * @param[out] pu1_dst
+ *  Output 32x32 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
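+/*
+ * Editorial sketch (reference only, not part of the decoder build): every
+ * butterfly in this file reduces to the scalar pattern below -- combine the
+ * even/odd partial sums, add the rounding factor, shift right, and saturate,
+ * exactly as the madd/add/sub, _mm_sra_epi32 and _mm_packs_epi32 sequences do.
+ */
+#if 0
+static WORD16 itrans_butterfly_ref(WORD32 e, WORD32 o, WORD32 shift, WORD32 add)
+{
+    WORD32 val;
+
+    /* add != 0: dst = (e + o + rnd) >> shift, else dst = (e - o + rnd) >> shift */
+    val = add ? (e + o) : (e - o);
+    val = (val + (1 << (shift - 1))) >> shift;
+
+    /* saturate to 16 bit, as _mm_packs_epi32 does */
+    if(val > 32767)  val = 32767;
+    if(val < -32768) val = -32768;
+
+    return (WORD16)val;
+}
+#endif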
+
+void ihevc_itrans_recon_32x32_sse42(WORD16 *pi2_src,
+                                    WORD16 *pi2_tmp,
+                                    UWORD8 *pu1_pred,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 src_strd,
+                                    WORD32 pred_strd,
+                                    WORD32 dst_strd,
+                                    WORD32 zero_cols,
+                                    WORD32 zero_rows)
+{
+    /* Inverse Transform */
+
+    WORD32 j;
+
+
+    WORD16 *pi2_tmp_orig;
+
+
+    WORD16 *o_temp_ptr;
+    WORD16 *temp_ptr;
+
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_4;
+    __m128i m_temp_reg_5;
+    __m128i m_temp_reg_6;
+    __m128i m_temp_reg_7;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_15;
+    __m128i m_temp_reg_16;
+    __m128i m_temp_reg_17;
+    __m128i m_temp_reg_18;
+    __m128i m_temp_reg_19;
+    __m128i m_temp_reg_20;
+    __m128i m_temp_reg_21;
+    __m128i m_temp_reg_22;
+    __m128i m_temp_reg_23;
+    __m128i m_temp_reg_30;
+    __m128i m_temp_reg_31;
+    __m128i m_temp_reg_32;
+    __m128i m_temp_reg_33;
+    __m128i m_temp_reg_34;
+    __m128i m_temp_reg_35;
+    __m128i m_temp_reg_36;
+    __m128i m_temp_reg_37;
+    __m128i m_temp_reg_40;
+    __m128i m_temp_reg_41;
+    __m128i m_temp_reg_42;
+    __m128i m_temp_reg_43;
+    __m128i m_temp_reg_44;
+    __m128i m_temp_reg_45;
+    __m128i m_temp_reg_46;
+    __m128i m_temp_reg_47;
+
+    __m128i m_temp_reg_70;
+    __m128i m_temp_reg_71;
+    __m128i m_temp_reg_72;
+    __m128i m_temp_reg_73;
+    __m128i m_temp_reg_74;
+    __m128i m_temp_reg_75;
+    __m128i m_temp_reg_76;
+    __m128i m_temp_reg_77;
+
+    __m128i m_temp_reg_80;
+    __m128i m_temp_reg_81;
+    __m128i m_temp_reg_82;
+    __m128i m_temp_reg_83;
+    __m128i m_temp_reg_84;
+    __m128i m_temp_reg_85;
+    __m128i m_temp_reg_86;
+    __m128i m_temp_reg_87;
+
+    __m128i m_temp_reg_90;
+    __m128i m_temp_reg_91;
+    __m128i m_temp_reg_92;
+    __m128i m_temp_reg_93;
+    __m128i m_temp_reg_94;
+    __m128i m_temp_reg_95;
+    __m128i m_temp_reg_96;
+    __m128i m_temp_reg_97;
+
+    __m128i m_rdng_factor;
+    __m128i m_count;
+    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+    __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
+
+    __m128i temp1, temp2, temp3, temp4;
+    __m128i temp5, temp6, temp7, temp8;
+
+    __m128i all_zero_reg;
+    WORD32 i;
+
+    WORD32  zero_last24_cols_stg1;
+    WORD32  zero_last24_rows_stg1;
+    WORD32  zero_last28_rows_stg1;
+
+    WORD32  zero_last28_rows_stg2;
+    WORD32  zero_last24_rows_stg2;
+
+    WORD32  trans_size_stg1;
+
+    WORD32 i4_shift = IT_SHIFT_STAGE_1;
+    WORD32 trans_size = TRANS_SIZE_32;
+
+
+    /* Flags marking all-zero regions of the 32x32 block, used to skip work below */
+    zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
+    zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
+    zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
+
+    zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
+    zero_last24_rows_stg2 = zero_last24_cols_stg1;
+
+    if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
+    {
+        trans_size_stg1 = 8;
+
+    }
+    else
+    {
+        trans_size_stg1 = 32;
+    }
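+
+    /*
+     * Editorial note: bit k of zero_cols / zero_rows being set marks column /
+     * row k of pi2_src as entirely zero. For example, (zero_cols & 0xFFFFFF00)
+     * == 0xFFFFFF00 means columns 8..31 carry no coefficients, so stage 1 only
+     * needs to produce outputs for the first 8 columns (trans_size_stg1 = 8).
+     */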
+
+    all_zero_reg = _mm_setzero_si128();
+
+    o_temp_ptr  = pi2_tmp;
+    temp_ptr = (pi2_tmp + 1024);
+
+    pi2_tmp += 2048;
+    pi2_tmp_orig = pi2_tmp;
+
+    for(i = 0; i < trans_size_stg1; i += 8)
+    {
+
+        {
+            WORD16 *pi2_tmp_src = pi2_src;
+
+            m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+
+            m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+        }
+
+        if(zero_last28_rows_stg1)
+        {
+            /* eeo */
+            /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+            /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+            {
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* eeeo[0] = m_temp_reg_20 */
+                /* eeeo[1] = m_temp_reg_21 */
+                /* eeee[0] = m_temp_reg_22 */
+                /* eeee[1] = m_temp_reg_23 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_40 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_43 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
+
+                /* eee[1] = eeee[1] + eeeo[1];*/
+                m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
+
+                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* eeeo[0] = m_temp_reg_20 */
+                /* eeeo[1] = m_temp_reg_21 */
+                /* eeee[0] = m_temp_reg_22 */
+                /* eeee[1] = m_temp_reg_23 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_44 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_47 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
+
+                /* eee[1] = eeee[1] + eeeo[1];*/
+                m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
+
+
+            }
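+
+            /*
+             * Editorial note: with rows 4..31 known zero, the eeeo terms
+             * (built from rows 8 and 24) vanish and all four eee[] values
+             * collapse to 64 * row0, which is why a single madd against the
+             * 64-coefficient fills m_temp_reg_40..47 above.
+             */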
+            /* eo */
+            {
+                WORD16 *pi2_scratch = o_temp_ptr;
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
+
+                m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /**************************************************************************/
+
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo4[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+                /* eo4[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /***********************************************************************/
+
+                /* eo5[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo5[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo6[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo6[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo7[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo7[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+            }
+        }
+        else if(zero_last24_rows_stg1)
+        {
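+            /*
+             * zero_last24_rows_stg1: rows 8..31 of this stage's input are all
+             * zero, so only even rows 0, 2, 4 and 6 (m_temp_reg_70..73) carry
+             * data: eee reduces to the row-0 term, eeo to the row-4 term and
+             * eo to rows 2 and 6.
+             */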
+            {
+                /* eee */
+                /* rows 8, 16 and 24 are zero in this path, so the eeeo      */
+                /* madds vanish and eeee reduces to the single (64, 64) madd */
+                /* on row 0 below                                            */
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* with rows 8, 16 and 24 zero: eeeo[0] = eeeo[1] = 0 and   */
+                /* eeee[0] = eeee[1] = 64 * src row 0, so all four eee      */
+                /* terms are the same value (m_temp_reg_14)                 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_40 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_43 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_42 = m_temp_reg_14;
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_41 = m_temp_reg_14;
+
+                /* for row 4 to 7 */
+
+                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* same collapse for columns 4-7 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_44 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_47 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_46 = m_temp_reg_14;
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_45 = m_temp_reg_14;
+
+
+                // eeo[]
+                /* for(k = 0; k < 4; k++) */
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
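+
+                /* m_temp_reg_10/14 interleave the row-4 samples with zeros,  */
+                /* so only the first coefficient of each packed pair          */
+                /* (89, 75, 50, 18) actually contributes to the madds below   */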
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
+
+                m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+
+                m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
+
+                m_temp_reg_33 = _mm_setzero_si128();
+
+                /* eeo */
+                {
+                    /* eeo0[0-3] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                        m_temp_reg_90 = m_temp_reg_34;
+                        m_temp_reg_97 = m_temp_reg_35;
+                    }
+                    /* eeo0[4-7] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                        m_temp_reg_91 = m_temp_reg_34;
+                        m_temp_reg_96 = m_temp_reg_35;
+
+                    }
+
+                    /* eeo1[0-3] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+                        /* ee[1][0-3] kept in m_temp_reg_92 */
+                        /* ee[6][0-3] kept in m_temp_reg_95 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+                        m_temp_reg_92 = m_temp_reg_34;
+                        m_temp_reg_95 = m_temp_reg_35;
+
+                    }
+
+                    /* eeo1[4-7] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
+
+                        /* ee[1][4-7] kept in m_temp_reg_93 */
+                        /* ee[6][4-7] kept in m_temp_reg_94 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+                        m_temp_reg_93 = m_temp_reg_34;
+                        m_temp_reg_94 = m_temp_reg_35;
+
+
+                    }
+
+                    /* eeo2[0-3] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+                        /* ee[2][0-3] kept in temp1 */
+                        /* ee[5][0-3] kept in temp7 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                        temp1 = m_temp_reg_34;
+                        temp7 = m_temp_reg_35;
+
+                    }
+
+                    /* eeo2[4-7] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
+
+                        /* ee[2][4-7] kept in temp2 */
+                        /* ee[5][4-7] kept in temp6 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                        temp2 = m_temp_reg_34;
+                        temp6 = m_temp_reg_35;
+
+                    }
+
+                    /* eeo3[0-3] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                        /* ee[3][0-3] kept in temp3 */
+                        /* ee[4][0-3] kept in temp5 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+                        temp3 = m_temp_reg_34;
+                        temp5 = m_temp_reg_35;
+
+                    }
+
+
+                    /* eeo3[4-7] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                        /* ee[3][4-7] kept in temp4 */
+                        /* ee[4][4-7] kept in temp8 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+                        temp4 = m_temp_reg_34;
+                        temp8 = m_temp_reg_35;
+
+
+                    }
+                    /* all ee[] values are now held in m_temp_reg_90..97 and temp1..temp8 */
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+
+                    m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+                    m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+
+                }
+            }
+            /* eo */
+            {
+
+                WORD16 *pi2_scratch = o_temp_ptr;
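+
+                /* eo[k] here needs only rows 2 and 6; each block below      */
+                /* stores the 32-bit sums e[k] and differences e[15-k]       */
+                /* back to back into the scratch buffer                      */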
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
+
+                /* eo2[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo2[4-7] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /**************************************************************************/
+
+
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
+
+                /* eo3[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo3[4-7] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
+
+                /* eo4[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+                /* eo4[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /***********************************************************************/
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
+
+                /* eo5[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo5[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
+
+                /* eo6[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo6[4-7] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
+
+                /* eo7[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo7[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+            }
+
+        }
+        else
+        {
+
+            {
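+                /* generic path: all 16 even input rows (m_temp_reg_70..77   */
+                /* and m_temp_reg_80..87) contribute to the even-part        */
+                /* butterflies below                                         */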
+                /* eee */
+                /* eeeo[0]/eeeo[1] go to m_temp_reg_20/m_temp_reg_22, */
+                /* eeee[0]/eeee[1] to m_temp_reg_21/m_temp_reg_23     */
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
+                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
+                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
+
+                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
+                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
+
+
+                /* eeeo[0] = m_temp_reg_20  */
+                /* eeeo[1] = m_temp_reg_22  */
+                /* eeee[0] = m_temp_reg_21  */
+                /* eeee[1] = m_temp_reg_23  */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);
+
+                /* for row 4 to 7 */
+
+                m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
+                m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
+
+                /* Interleaving row 8 and row 24*/
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+                m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
+                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
+                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
+
+                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
+                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
+
+
+                /* eeeo[0] = m_temp_reg_20  */
+                /* eeeo[1] = m_temp_reg_22  */
+                /* eeee[0] = m_temp_reg_21  */
+                /* eeee[1] = m_temp_reg_23  */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);
+
+
+                // eeo[]
+                /* for(k = 0; k < 4; k++) */
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
+
+                /* eeo */
+                {
+                    /* eeo0[0-3] */
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                        m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    }
+
+                    m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+                    m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
+                    m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
+                    m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
+
+                    /* eeo0[4-7] */
+                    {
+                        m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+                        m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                        m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                    }
+
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
+
+                    /* eeo1[0-3] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+                        m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
+                        m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
+
+                    }
+
+                    /* eeo1[4-7] */
+                    {
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+                        m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
+                        m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
+
+
+                    }
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
+
+                    /* eeo2[0-3] */
+                    {
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                        /* ee[2][0-3] kept in temp1 */
+                        /* ee[5][0-3] kept in temp7 */
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                        temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+                        temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+                    }
+
+                    /* eeo2[4-7] */
+                    {
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+                        /* ee[2][4-7] kept in temp2 */
+                        /* ee[5][4-7] kept in temp6 */
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                        temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+                        temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+                    }
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
+
+                    /* eeo3[0-3] */
+                    {
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                        /* ee[3][0-3] kept in temp3 */
+                        /* ee[4][0-3] kept in temp5 */
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+                        temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+                        temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+
+                    }
+
+                    /* eeo3[4-7] */
+                    {
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+                        /* ee[3][4-7] kept in temp4 */
+                        /* ee[4][4-7] kept in temp8 */
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+                        temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+                        temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+                    }
+
+
+                    /* all ee[] values are now held in m_temp_reg_90..97 and temp1..temp8 */
+
+                    /* for(k = 0; k < 8; k++) */
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+                }
+            }
+            /* eo */
+            {
+
+                WORD16 *pi2_scratch = o_temp_ptr;
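+
+                /* full eo[k]: four packed madds over the row pairs (2,6),   */
+                /* (10,14), (18,22) and (26,30), combined before the ee[k]   */
+                /* butterfly and the store to scratch                        */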
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
+
+                m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+                m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+                m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
+                m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
+
+                m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
+                m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
+                m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
+                m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                    m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
+                    m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
+
+                /* eo1[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo2[4-7] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+                /**************************************************************************/
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
+
+                /* eo4[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo4[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /***********************************************************************/
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
+
+                /* eo5[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo5[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
+
+                /* eo6[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo6[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
+
+                /* eo7[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo7[4-7] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+            }
+
+        }
+        /*  All e[] are done */
+        /****************************/
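+
+        /*
+         * Reference sketch of the even-part butterfly the SIMD code above
+         * implements (HM-style partial butterfly, illustration only):
+         *
+         *   for(k = 0; k < 4; k++) { ee[k] = eee[k] + eeo[k]; ee[7-k]  = eee[k] - eeo[k]; }
+         *   for(k = 0; k < 8; k++) { e[k]  = ee[k]  + eo[k];  e[15-k]  = ee[k]  - eo[k];  }
+         *
+         * The odd terms o[k] are folded in below, with rounding, a right
+         * shift and 16-bit saturation:
+         *
+         *   dst[k]    = (e[k] + o[k] + rnd) >> shift;   k = 0..15
+         *   dst[31-k] = (e[k] - o[k] + rnd) >> shift;
+         */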
+
+        {
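+
+            /* reload the odd input rows 1, 3, ..., 31 (stride of two rows): */
+            /* m_temp_reg_70..77 get rows 1..15, m_temp_reg_80..87 rows 17..31 */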
+
+            WORD16 *pi2_tmp_src = pi2_src + src_strd;
+
+            m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+
+            m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+        }
+
+        if(zero_last28_rows_stg1)
+        {
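+            /* rows 4..31 are zero, so of the odd rows only rows 1 and 3     */
+            /* (m_temp_reg_70/71) remain and each o[k] is a single packed madd */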
+            /* o & stage 1 out */
+            {
+                WORD32 j;
+                WORD16 *pi2_src_scratch = o_temp_ptr;
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = (trans_size << 1);
+                WORD32 in_stride = trans_size;
+
+                for(j = 0; j < 2; j++)
+                {
+                    if(j)
+                    {
+                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+                    }
+
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
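+                        /* round and shift: (x + (1 << (shift-1))) >> shift,  */
+                        /* with the rounding constant broadcast via the two   */
+                        /* unpacks, then a saturating pack of the two 32-bit  */
+                        /* halves down to 16-bit                              */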
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+
+                    /* o1[0-3] */
+                    {
+
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
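+                    /* From o4 onward the butterfly outputs swap roles
+                     * relative to o0..o3: m_temp_reg_31 takes e + o and
+                     * m_temp_reg_30 takes e - o. The packed coefficient table
+                     * presumably carries the compensating sign; noted here
+                     * because the asymmetry is easy to misread as a bug.
+                     */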
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+
+                    }
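+                    /* Scratch-pointer walk: o0..o6 advance by in_stride /
+                     * out_stride, the o7 block above hops forward by 8
+                     * 16-bit elements (one 128-bit vector), and o8..o15 walk
+                     * back down (-= stride), revisiting the even-part lines
+                     * in mirrored order, which lines up with the
+                     * e_k / e_(15-k) pairing of the 32-point even part.
+                     */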
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+
+                    /* o8[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+
+                    /* o9[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+
+                    /* o10[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+
+                    /* o11[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+
+                    /* o12[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+
+                    /* o13[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+
+                    /* o14[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+
+                    /* o15[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                }
+            }
+        }
+        else if(zero_last24_rows_stg1)
+        {
+            /* o & stage 1 out */
+            {
+                WORD32 j;
+
+                WORD16 *pi2_src_scratch = o_temp_ptr;
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = (trans_size << 1);
+
+                WORD32 in_stride = trans_size;
+
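+                /* Here only the first eight stage-1 rows are non-zero, so
+                 * four odd rows (1, 3, 5 and 7) contribute: each output term
+                 * is the sum of two madd products, one against the row-1/3
+                 * pair and one against the row-5/7 pair.
+                 */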
+                for(j = 0; j < 2; j++)
+                {
+                    if(j)
+                    {
+                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+                        m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+                        m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+                    }
+
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+
+                    /* o8[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+
+                    /* o9[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+
+                    /* o10[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+
+                    /* o11[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+
+                    /* o12[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+
+                    /* o13[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+
+                    /* o14[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+
+                    /* o15[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                }
+            }
+        }
+        else
+        {
+            /* o & stage 1 out */
+            {
+                WORD32 j;
+
+                WORD16 *pi2_src_scratch = o_temp_ptr;
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = (trans_size << 1);
+
+                WORD32 in_stride = trans_size;
+
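+                /* General case: all sixteen odd rows (1, 3, ..., 31) may be
+                 * non-zero, so each output term reduces eight madd products.
+                 * On the second pass (j == 1) every source register is
+                 * shifted down by 8 bytes so that the upper four 16-bit
+                 * lanes feed the same interleave/madd pipeline.
+                 */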
+                for(j = 0; j < 2; j++)
+                {
+                    if(j)
+                    {
+                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+                        m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+                        m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+                        m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
+                        m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
+                        m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
+                        m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
+
+                        m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
+                        m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
+                        m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
+                        m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
+                        m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
+                        m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
+                        m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
+                        m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
+
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
+                    m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
+                    temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
+                    temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
+                    temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
+                    temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
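+                    /* The eight 32-bit partial products above are combined
+                     * with a three-level pairwise adder tree (20+21, 22+23,
+                     * 40+41, 42+43; then 20+22, 40+42; then 20+40) before
+                     * the shared butterfly / round / shift / pack tail.
+                     */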
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
+
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
+
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+
+                    }
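+                    /*
+                     * Scratch addressing is a zig-zag: o0..o7 above walked
+                     * forward (+= stride), the += 8 above hops to the adjacent
+                     * 128-bit group, and o8..o15 below walk back (-= stride),
+                     * revisiting the same rows in reverse without any extra
+                     * index bookkeeping.
+                     */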
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
+
+
+                    /* o8[0-3] */
+                    {
+
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
+
+
+                    /* o9[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
+
+                    /* o10[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
+
+                    /* o11[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
+
+
+                    /* o12[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
+
+
+                    /* o13[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
+
+
+                    /* o14[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
+                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
+                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
+                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
+                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
+
+                    /* o15[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                }
+            }
+        }
+        /* Transpose */
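+        /*
+         * Transpose the 8-column x 32-row strip into 8 rows of 32 with the
+         * usual three-stage SSE shuffle: interleave 16-bit pairs, then 32-bit
+         * pairs, then 64-bit pairs. Each pass of the loop below turns 16
+         * scratch rows into 4 transposed rows.
+         */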
+        {
+            WORD16 *pi2_src_scratch = temp_ptr;
+            WORD16 *pi2_dst_scratch = pi2_tmp;
+            WORD32 in_stride = (trans_size << 1);
+
+            for(j = 0; j < 2; j++)
+            {
+                m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += 8;
+
+                m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += 8;
+
+
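+                /* Note the swapped operands in each _mm_unpackhi_epi16: the
+                   high halves appear to hold the mirrored (e - o) butterfly
+                   outputs, stored in reverse row order, so interleaving them
+                   with the rows swapped restores ascending order in the
+                   later stages. */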
+                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
+                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
+
+                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
+                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
+
+                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
+                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
+
+                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
+                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
+
+                m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+                m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
+
+                m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+                m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
+
+                m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
+                m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
+
+                m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
+                m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
+
+                /* second interleave stage: 32-bit pairs */
+
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
+                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
+
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
+
+                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
+                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
+
+                m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
+                m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
+
+                m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
+                m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
+
+                m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
+                m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
+
+                m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
+                m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
+
+                /* third interleave stage: 64-bit pairs */
+
+                m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
+                m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
+
+                m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
+                m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
+
+                m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
+                m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
+
+                m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
+                m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
+
+                m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
+                m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
+
+                m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
+                m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
+
+                m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
+                m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
+
+                m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
+                m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
+
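+                /* Emit each transposed row as four 8-column groups; the
+                   register-to-offset mapping undoes the zig-zag order in
+                   which the butterfly halves were stored, so the columns
+                   land in natural order. */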
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
+
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
+
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
+
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
+                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
+
+                pi2_dst_scratch += 4 * trans_size;
+            }
+        }
+        pi2_src += 8;
+//      pi2_dequant_coeff +=8;
+        pi2_tmp += 8 * trans_size;
+        zero_cols = zero_cols >> 1;
+    }
+
+    if(trans_size_stg1 != TRANS_SIZE_32)
+    {
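+        /* Rows at and beyond trans_size_stg1 carry no stage-1 output (their
+           input rows were all zero); clear the remaining scratch rows so
+           stage 2 can always read a full 32-row block unconditionally. */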
+        m_temp_reg_10 = _mm_setzero_si128();
+
+        for(i = trans_size_stg1; i < 32; i += 8)
+        {
+            WORD16 *pi2_dst_scratch = pi2_tmp;
+
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
+
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
+
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
+
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
+
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
+
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
+
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
+
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
+            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
+
+            pi2_tmp += 8 * trans_size;
+        }
+    }
+
+    pi2_tmp = pi2_tmp_orig;
+
+    /* Inverse Transform 2nd stage */
+
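+    /*
+     * Second-stage butterfly (standard HEVC inverse transform structure):
+     *   out[k]      = (e[k] + o[k] + rnd) >> shift
+     *   out[31 - k] = (e[k] - o[k] + rnd) >> shift,   k = 0..15
+     * where e[] is derived from the even input rows (split further into
+     * ee[]/eo[]) and o[] from the odd rows. Each branch below specializes
+     * the butterfly for the number of rows left non-zero after stage 1.
+     */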
+
+    for(j = 0; j < trans_size; j += 4)
+    {
+        i4_shift = IT_SHIFT_STAGE_2;
+
+        /* Exploit the symmetry of the transform matrix to minimize the number of multiplications */
+        if(zero_last28_rows_stg2)
+        {
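+            /* Only input rows 0..3 can be non-zero in this branch, so each
+               eo[k] needs a single madd against row 2, ee[k] is just the
+               64 * row0 DC term, and o[k] uses rows 1 and 3. */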
+            {
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
+
+                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+                }
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+                }
+                /* eo4[0-3] */
+                {
+                    m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                }
+
+                /* eo5[0-3] */
+                {
+                    m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
+                }
+
+                /* eo6[0-3] */
+                {
+                    m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
+                }
+                /* eo7[0-3] */
+                {
+                    m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
+                }
+            }
+
+            m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+
+            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+            m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+            m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* same 64 * row0 value as m_temp_reg_14 */
+
+            /* e[k] = ee[k] + eo[k],  e[15 - k] = ee[k] - eo[k] */
+
+            temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90);  /* e[0] */
+            temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90);  /* e[15] */
+
+            temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91);  /* e[1] */
+            temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91);  /* e[14] */
+
+            temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92);  /* e[2] */
+            temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92);  /* e[13] */
+
+            temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93);  /* e[3] */
+            temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93);  /* e[12] */
+
+            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94);  /* e[4] */
+            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94);  /* e[11] */
+
+            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95);  /* e[5] */
+            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95);  /* e[10] */
+
+            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96);  /* e[6] */
+            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96);  /* e[9] */
+
+            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97);  /* e[7] */
+            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97);  /* e[8] */
+
+            /* o[k] */
+            {
+
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = 8;
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+
+
+                /* o0[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
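+                    /* Rounding: result = (x + (1 << (shift - 1))) >> shift,
+                       then a saturating pack of the 32-bit lanes to 16 bits.
+                       The same sequence repeats for every o[k] block below. */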
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+
+                /* o1[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+
+                /* o2[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+
+                /* o3[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+
+                /* o4[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+
+                /* o5[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+
+                /* o6[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+
+                /* o7[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+
+                /* o8[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+
+                /* o9[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+
+                /* o10[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+
+                /* o11[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+
+                /* o12[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+
+                /* o13[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+
+                /* o14[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+
+                /* o15[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+            }
+
+        }
+        else if(zero_last24_rows_stg2)
+        {
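+            /* Only input rows 0..7 can be non-zero in this branch: eo[k]
+               uses rows 2 and 6, eeo[k] only row 4, ee[k] the 64 * row0 DC
+               term combined with eeo[k], and o[k] rows 1, 3, 5 and 7. */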
+            /* eo */
+            {
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+
+                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+                m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
+
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
+
+                /* eo3[0-3] */
+                {
+
+                    m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
+
+                /* eo4[0-3] */
+                {
+                    m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
+
+                /* eo5[0-3] */
+                {
+                    m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
+                /* eo6[0-3] */
+                {
+                    m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
+                /* eo7[0-3] */
+                {
+                    m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+            }
+
+            /* eeo */
+            {
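+                /* eeo[] terms: row 4 is the only non-zero contributor here,
+                   so each eeo[k] is a single madd. */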
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
+
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
+
+                /* eeo0[0-3] */
+                {
+                    temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+                /* eeo1[0-3] */
+                {
+                    temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+                }
+
+                /* eeo2[0-3] */
+                {
+                    temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+                }
+
+
+                /* eeo3[0-3] */
+                {
+                    temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                }
+
+            }
+
+            m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
+            m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36
+            m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+
+            //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70);
+            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+            m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+            m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* same 64 * row0 value as m_temp_reg_14 */
+
+            m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1);  /* ee[0] */
+            m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1);  /* ee[7] */
+
+            m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2);  /* ee[1] */
+            m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2);  /* ee[6] */
+
+            m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3);  /* ee[2] */
+            m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3);  /* ee[5] */
+
+            m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4);  /* ee[3] */
+            m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4);  /* ee[4] */
+
+            /* e[k] = ee[k] + eo[k],  e[15 - k] = ee[k] - eo[k] */
+
+            temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[0] */
+            temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[15] */
+
+            temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[1] */
+            temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[14] */
+
+            temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[2] */
+            temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[13] */
+
+            temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[3] */
+            temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[12] */
+
+            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[4] */
+            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[11] */
+
+            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[5] */
+            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[10] */
+
+            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[6] */
+            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[9] */
+
+            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[7] */
+            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[8] */
+
+            /* o[k] */
+            {
+
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = 8;
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+
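+                /* Each o[k] below combines two madds: rows 1/3 interleaved
+                   in m_temp_reg_10 and rows 5/7 in m_temp_reg_11. */
+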
+                /* o0[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+
+                /* o1[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+
+                /* o2[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+
+                /* o3[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+
+                /* o4[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+
+                /* o5[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+
+                /* o6[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+
+                /* o7[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+
+                /* o8[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+
+                /* o9[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+
+                /* o10[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+
+                /* o11[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+
+                /* o12[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+
+                /* o13[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+
+                /* o14[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+
+                /* o15[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+            }
+        }
+        else
+        {
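+            /*
+             * Full 32-point inverse transform stage via the partial-butterfly
+             * decomposition: the even input rows are split recursively into
+             * ee/eo, eee/eeo and eeee/eeeo terms, the odd rows form o[k], and
+             * each output pair is (e[k] +/- o[k] + rnd) >> i4_shift.
+             */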
+            /* eo */
+            {
+
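+                /*
+                 * eo[0..7]: odd part of the even half, computed from rows
+                 * 2, 6, ..., 30 of pi2_tmp; the eight results accumulate in
+                 * m_temp_reg_90 .. m_temp_reg_97.
+                 */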
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+
+
+                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+                m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
+                m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
+                m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
+                m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
+                m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
+                m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
+                m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9  -43
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
+
+
+                /* eo4[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+                    m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
+
+                /* eo5[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
+
+                /* eo6[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
+
+                /* eo7[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+
+                }
+
+            }
+
+            /* eeo */
+            {
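+                /*
+                 * eeo[0..3]: odd part of the 8-point even-even stage, from
+                 * rows 4, 12, 20 and 28; results land in temp1 .. temp4.
+                 */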
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
+
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
+                m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
+                m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
+                m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
+
+                /* eeo0[0-3] */
+                {
+
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                }
+
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
+
+                /* eeo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                    temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                }
+
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
+
+                /* eeo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                    temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                }
+
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
+
+                /* eeo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                    temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                }
+
+
+            }
+
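+            /*
+             * Final 4-point stage of the even-part recursion:
+             * eeee[0..1] from rows 0 and 16, eeeo[0..1] from rows 8 and 24.
+             */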
+            m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+            m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+            m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+            m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
+
+            m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
+            m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
+
+            m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+            m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
+
+            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
+            m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
+            m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
+
+            m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
+            m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
+
+/* eeeo[0] = m_temp_reg_20  */
+/* eeeo[1] = m_temp_reg_22  */
+/* eeee[0] = m_temp_reg_21  */
+/* eeee[1] = m_temp_reg_23  */
+
+            /* eee[0] = eeee[0] + eeeo[0]; */
+            m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eee[0] */
+
+            /* eee[3] = eeee[0] - eeeo[0]; */
+            m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eee[3] */
+
+            /* eee[2] = eeee[1] - eeeo[1]; */
+            m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eee[2] */
+
+            /* eee[1] = eeee[1] + eeeo[1]; */
+            m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eee[1] */
+
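+            /* ee[k] = eee[k] + eeo[k],  ee[7 - k] = eee[k] - eeo[k],  k = 0..3 */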
+            m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1);  /* ee[0] */
+            m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1);  /* ee[7] */
+
+            m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2);  /* ee[1] */
+            m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2);  /* ee[6] */
+
+            m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3);  /* ee[2] */
+            m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3);  /* ee[5] */
+
+            m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4);  /* ee[3] */
+            m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4);  /* ee[4] */
+
+/* e[k] = ee[k] + eo[k],  e[15 - k] = ee[k] - eo[k],  k = 0..7 */
+
+            temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[0] */
+            temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[15] */
+
+            temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[1] */
+            temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[14] */
+
+            temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[2] */
+            temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[13] */
+
+            temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[3] */
+            temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[12] */
+
+            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[4] */
+            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[11] */
+
+            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[5] */
+            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[10] */
+
+            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[6] */
+            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[9] */
+
+            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[7] */
+            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[8] */
+
+/* o[k] */
+            {
+
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = 8;
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
+
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
+                m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
+                m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
+                m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
+                m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);
+
+                m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
+                m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
+                m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
+                m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
+                m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
+                m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
+                m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
+                m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
+                m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
+                m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
+                m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
+                m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
+
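+                /*
+                 * o[0..15]: odd part from rows 1, 3, ..., 31. Each o[k] sums
+                 * eight packed multiply-accumulates; the rounded, shifted
+                 * pairs (e[k] + o[k], e[k] - o[k]) are saturated to 16 bit
+                 * and written to the scratch buffer, one row per store.
+                 */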
+                /* o0[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
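+                /*
+                 * Scalar sketch of the combine/store above (and of every
+                 * o-block that follows), where lo/hi denote the two halves
+                 * of the packed 128-bit store and sat16() is hypothetical
+                 * shorthand for the signed saturation of _mm_packs_epi32:
+                 *
+                 *   rnd = 1 << (i4_shift - 1);
+                 *   lo  = sat16((e[k] + o[k] + rnd) >> i4_shift);
+                 *   hi  = sat16((e[k] - o[k] + rnd) >> i4_shift);
+                 */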
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
+
+                /* o1[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
+
+                /* o2[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
+
+                /* o3[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
+
+                /* o4[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
+
+                /* o5[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
+
+                /* o6[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
+
+                /* o7[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
+
+                /* o8[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
+
+                /* o9[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
+
+                /* o10[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
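+
+                /* Every o-block in this function ends with the same rounding
+                 * butterfly: the accumulated odd-part sum is added to and
+                 * subtracted from the saved even-part sum, and both results
+                 * are rounded, shifted and packed. A scalar sketch of one
+                 * output pair (illustrative only; 'even' and 'odd' stand for
+                 * the two accumulated 32-bit sums, and the saturating pack
+                 * to WORD16 is omitted):
+                 *
+                 *     WORD32 rnd = 1 << (i4_shift - 1);
+                 *     WORD32 out_fwd = (even + odd + rnd) >> i4_shift;
+                 *     WORD32 out_rev = (even - odd + rnd) >> i4_shift;
+                 */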
+
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
+
+                /* o11[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
+
+                /* o12[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
+
+                /* o13[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
+
+                /* o14[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
+                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
+                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
+                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
+                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
+
+                /* o15[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += 8;
+                }
+
+            }
+        }
+
+        /* Transpose */
+        {
+
+            WORD16 *pi2_src_scratch = temp_ptr;
+            WORD32 out_stride = dst_strd;
+            WORD32 in_stride = 8;
+
+            m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += 8;
+
+            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += 8;
+
+
+            m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
+            m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
+
+            m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
+            m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
+
+            m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
+            m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
+
+            m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
+            m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
+
+            m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+            m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
+
+            m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+            m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
+
+            m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
+            m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
+
+            m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
+            m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
+
+
+            m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
+            m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
+
+            m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
+            m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
+
+            m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
+            m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
+
+            m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
+            m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
+
+            m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
+            m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
+
+            m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
+            m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
+
+            m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
+            m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
+
+            m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
+            m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
+
+
+            m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);       // row0 = 0-7
+            m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);       // row1 = 0-7
+
+            m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);     // row0=24-31
+            m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);     // row1=24-31
+
+            m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);       // row0=8-15
+            m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);       // row1=8-15
+
+            m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);     // row0=16-23
+            m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);     // row1=16-23
+
+            m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);      // row2 =0-7
+            m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);      // row3 =0-7
+
+            m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);    // row2=24-31
+            m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);    // row3=24-31
+
+            m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);      // row2=8-15
+            m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);      // row3=8-15
+
+            m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);    // row2=16-23
+            m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);    // row3=16-23
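+
+            /* The three rounds of interleaving above (16-bit, 32-bit, then
+             * 64-bit unpacks) are the standard SSE2 idiom for transposing
+             * 8x8 tiles of WORD16. The scalar equivalent per tile is simply
+             * (illustrative only):
+             *
+             *     for(r = 0; r < 8; r++)
+             *         for(c = 0; c < 8; c++)
+             *             out[c][r] = in[r][c];
+             */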
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+            pu1_dst += out_stride;
+            pu1_pred += pred_strd;
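+
+            /* The load/add/pack/store unit above, repeated for the three
+             * remaining row pairs below, is the reconstruction step:
+             * zero-extend the 8-bit prediction, add the 16-bit residue and
+             * saturate back to 8 bits. Scalar sketch (illustrative only;
+             * 'resi' stands for the transposed residue row held in the
+             * registers):
+             *
+             *     WORD32 sum = (WORD32)pu1_pred[i] + resi[i];
+             *     pu1_dst[i] = (UWORD8)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+             */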
+
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+            pu1_dst += out_stride;
+            pu1_pred += pred_strd;
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+            pu1_dst += out_stride;
+            pu1_pred += pred_strd;
+
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+            pu1_dst += out_stride;
+            pu1_pred += pred_strd;
+
+        }
+        pi2_tmp += 4;
+    }
+}
+
diff --git a/common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c b/common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c
new file mode 100644
index 0000000..1de4253
--- /dev/null
+++ b/common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c
@@ -0,0 +1,486 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_chroma_intra_pred_filters_sse42_intr.c
+*
+* @brief
+*  Contains function definitions for intra prediction interpolation filters
+*
+*
+* @author
+*  Ittiam
+*
+* @par List of Functions:
+*  ihevc_intra_pred_chroma_planar_sse42()
+*
+*  ihevc_intra_pred_chroma_dc_sse42()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_chroma_intra_pred.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_tables_x86_intr.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+
+/****************************************************************************/
+/* Constant Macros                                                          */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+#define T16C_4NT 64
+#define T8C_4NT 32
+/****************************************************************************/
+/* Function Macros                                                          */
+/****************************************************************************/
+
+/* Evaluates to 1 if bit 'x' of 'y' is set, 0 otherwise */
+#define GET_BIT(y,x) (((y) & (1 << (x))) && (1 << (x)))
+
+/* tables to shuffle 8-bit values */
+
+/*****************************************************************************/
+/* Function Definition                                                      */
+/*****************************************************************************/
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Planar intra prediction using the neighbouring reference samples pointed
+* to by 'pu1_ref', writing to the TU block pointed to by 'pu1_dst'. Refer
+* to section 8.4.4.2.4 of the standard.
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_planar_sse42(UWORD8 *pu1_ref,
+                                          WORD32 src_strd,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 dst_strd,
+                                          WORD32 nt,
+                                          WORD32 mode)
+{
+
+    WORD32 row, col;
+    WORD32 log2nt = 5;
+    WORD32 two_nt, three_nt;
+
+    __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+    __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    switch(nt)
+    {
+        case 16:
+            log2nt = 4;
+            break;
+        case 8:
+            log2nt = 3;
+            break;
+        case 4:
+            log2nt = 2;
+            break;
+        default:
+            break;
+    }
+    two_nt = 2 * nt;
+    three_nt = 3 * nt;
+
+    /* Planar filtering */
+
+/* setting values in registers */
+
+//  pu1_ref[2*(two_nt - 1 - row)]
+//  pu1_ref[2 * (three_nt + 1)]
+//  pu1_ref[2 * (two_nt + 1) + col]
+//  pu1_ref[2 * (nt - 1)]
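+
+//  The scalar equation being vectorized (section 8.4.4.2.4, with indices
+//  doubled for the interleaved UV layout; here c is the chroma pair index
+//  and p is 0 for U, 1 for V -- illustrative only):
+//
+//  pu1_dst[row * dst_strd + 2 * c + p] =
+//      ((nt - 1 - row) * pu1_ref[2 * (two_nt + 1) + 2 * c + p]
+//     + (nt - 1 - c)   * pu1_ref[2 * (two_nt - 1 - row) + p]
+//     + (row + 1)      * pu1_ref[2 * (nt - 1) + p]
+//     + (c + 1)        * pu1_ref[2 * (three_nt + 1) + p]
+//     + nt) >> (log2nt + 1);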
+
+    const_temp_4x32b  = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1],
+                                      pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)],
+                                      pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]);
+
+    const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)],
+                                      pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]);
+
+    const_temp4_4x32b = _mm_set1_epi16(nt - 1);
+    const_temp6_4x32b = _mm_set1_epi16(nt);
+    const_temp7_4x32b = _mm_set1_epi16(4);
+
+    zero_8x16b = _mm_set1_epi32(0);
+
+    if(nt % 4 == 0)
+    {
+        const_temp7_4x32b = _mm_set1_epi16(4);
+
+        for(row = 0; row < nt; row++)
+        {
+            __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
+            __m128i res_temp3_8x16b;
+
+            const_temp2_4x32b  = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1],
+                                               pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)],
+                                               pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]);
+
+            const_temp3_4x32b  = _mm_set1_epi16((row + 1));
+            row_8x16b = _mm_set1_epi16((nt - 1 - row));
+
+            const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0);
+            col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
+
+            const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);
+
+            /*(row + 1) * pu1_ref[nt - 1]*/
+            res_temp_8x16b  = _mm_mullo_epi16(const_temp3_4x32b,  const_temp1_4x32b);
+
+            /*(row + 1) * pu1_ref[nt - 1] + nt)*/
+            res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+
+            for(col = 0; col < 2 * nt; col += 8)
+            {
+                __m128i src_temp_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));
+
+                src_temp_8x16b =  _mm_cvtepu8_epi16(src_temp_8x16b); /* row=0*/
+
+                /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
+                res_temp1_8x16b  = _mm_mullo_epi16(src_temp_8x16b,  row_8x16b);
+
+                /*(col + 1) * pu1_ref[three_nt + 1]*/
+                res_temp2_8x16b  = _mm_mullo_epi16(const_temp_4x32b,  col_8x16b);
+
+                /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
+                res_temp3_8x16b  = _mm_mullo_epi16(const_temp2_4x32b,  const_temp5_4x32b);
+
+                res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
+                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);
+
+                res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1));
+                res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b);
+
+                const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
+                col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
+            } /* inner loop ends here */
+        }
+    }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intra prediction for DC mode using the neighbouring reference samples
+* pointed to by 'pu1_ref', writing to the TU block pointed to by 'pu1_dst'.
+* Refer to section 8.4.4.2.5 of the standard.
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size (Chroma)
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_dc_sse42(UWORD8 *pu1_ref,
+                                      WORD32 src_strd,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 dst_strd,
+                                      WORD32 nt,
+                                      WORD32 mode)
+{
+
+    WORD32 acc_dc_u, acc_dc_v;
+    WORD32 dc_val_u, dc_val_v;
+    WORD32 row;
+    WORD32 log2nt = 5;
+    __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
+    __m128i src_temp7, src_temp8, src_temp9, src_temp10;
+    __m128i m_zero = _mm_set1_epi32(0);
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    switch(nt)
+    {
+        case 32:
+            log2nt = 5;
+            break;
+        case 16:
+            log2nt = 4;
+            break;
+        case 8:
+            log2nt = 3;
+            break;
+        case 4:
+            log2nt = 2;
+            break;
+        default:
+            break;
+    }
+
+    acc_dc_u = 0;
+    acc_dc_v = 0;
+
+    /* Calculate DC value for the transform block */
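+
+    /* Scalar view of the accumulation below (illustrative only): sum the
+     * 2 * nt interleaved left and top reference pairs starting at chroma
+     * pair nt; the two fix-ups after the SIMD branches then swap the corner
+     * pair for the pair just past the top edge:
+     *
+     *     for(i = 0; i < 2 * nt; i++)
+     *     {
+     *         acc_dc_u += pu1_ref[2 * (nt + i)];
+     *         acc_dc_v += pu1_ref[2 * (nt + i) + 1];
+     *     }
+     */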
+
+    m_mask = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);
+
+    if(nt == 16)
+    {
+        __m128i temp_sad;
+
+        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
+        src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
+        src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));
+
+        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
+        src_temp6 =  _mm_cvtepu8_epi16(src_temp4);
+        src_temp9 =  _mm_cvtepu8_epi16(src_temp7);
+        src_temp10 =  _mm_cvtepu8_epi16(src_temp8);
+
+        src_temp3 = _mm_srli_si128(src_temp3, 8);
+        src_temp4 = _mm_srli_si128(src_temp4, 8);
+        src_temp7 = _mm_srli_si128(src_temp7, 8);
+        src_temp8 = _mm_srli_si128(src_temp8, 8);
+
+        src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
+        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
+        src_temp7 =  _mm_cvtepu8_epi16(src_temp7);
+        src_temp8 =  _mm_cvtepu8_epi16(src_temp8);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
+        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
+        src_temp10 = _mm_add_epi16(src_temp9, src_temp10);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+        src_temp8 = _mm_add_epi16(src_temp8, src_temp10);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
+        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
+        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
+        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
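+
+        /* The shuffle mask is assumed to gather the U-plane words into one
+         * half of the register and the V-plane words into the other, so the
+         * two horizontal adds reduce each plane to a single 16-bit total
+         * that the widening step extracts as acc_dc_u and acc_dc_v. */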
+    }
+
+    else if(nt == 8)
+    {
+        __m128i temp_sad;
+        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
+
+        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
+        src_temp6 =  _mm_cvtepu8_epi16(src_temp4);
+
+        src_temp3 = _mm_srli_si128(src_temp3, 8);
+        src_temp4 = _mm_srli_si128(src_temp4, 8);
+
+        src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
+        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
+        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
+        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
+    }
+
+    else if(nt == 4)
+    {
+        __m128i temp_sad;
+        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+
+        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
+        src_temp4 = _mm_srli_si128(src_temp3, 8);
+        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
+
+        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
+        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
+        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
+    }
+
+
+    acc_dc_u += pu1_ref[6 * nt];
+    acc_dc_v += pu1_ref[6 * nt + 1];
+
+    acc_dc_u -= pu1_ref[4 * nt];
+    acc_dc_v -= pu1_ref[4 * nt + 1];
+
+    dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
+    dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);
+
+    dc_val_u = dc_val_u | (dc_val_v << 8);
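+
+    /* dc_val_u now carries the V DC value in its high byte and the U DC
+     * value in its low byte, so a single _mm_set1_epi16() below replicates
+     * the interleaved UV pair across a whole row. */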
+
+    /* Fill the prediction block with the DC value */
+
+    if(nt == 4)
+    {
+        src_temp1 = _mm_set1_epi16(dc_val_u);
+
+        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
+        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+
+    }
+    else if(nt == 8)
+    {
+        src_temp1 = _mm_set1_epi16(dc_val_u);
+
+        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
+        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+
+        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+
+    }
+
+    else /* nt == 16 */
+    {
+
+        src_temp1 = _mm_set1_epi16(dc_val_u);
+
+        for(row = 0; row < nt; row += 8)
+        {
+            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
+            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);
+
+            pu1_dst += 8 * dst_strd;
+        }
+
+
+    }
+
+}
diff --git a/common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c b/common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c
new file mode 100644
index 0000000..6a3883e
--- /dev/null
+++ b/common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c
@@ -0,0 +1,2633 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_chroma_intra_pred_filters_ssse3_intr.c
+*
+* @brief
+*  Contains function definitions for intra prediction interpolation filters
+*
+*
+* @author
+*  Ittiam
+*
+* @par List of Functions:
+*  ihevc_intra_pred_chroma_planar_ssse3()
+*
+*  ihevc_intra_pred_chroma_dc_ssse3()
+*
+*  ihevc_intra_pred_chroma_horz_ssse3()
+*
+*  ihevc_intra_pred_chroma_ver_ssse3()
+*
+*  ihevc_intra_pred_chroma_mode2_ssse3()
+*
+*  ihevc_intra_pred_chroma_mode_18_34_ssse3()
+*
+*  ihevc_intra_pred_chroma_mode_3_to_9_ssse3()
+*
+*  ihevc_intra_pred_chroma_mode_11_to_17_ssse3()
+*
+*  ihevc_intra_pred_chroma_mode_19_to_25_ssse3()
+*
+*  ihevc_intra_pred_chroma_mode_27_to_33_ssse3()
+*
+*
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_intra_pred.h"
+
+#include "ihevc_chroma_intra_pred.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_tables_x86_intr.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include <immintrin.h>
+
+
+/****************************************************************************/
+/* Constant Macros                                                          */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+#define T16C_4NT 64
+#define T8C_4NT 32
+/****************************************************************************/
+/* Function Macros                                                          */
+/****************************************************************************/
+
+/* Evaluates to 1 if bit 'x' of 'y' is set, 0 otherwise */
+#define GET_BIT(y,x) (((y) & (1 << (x))) && (1 << (x)))
+
+/* tables to shuffle 8-bit values */
+
+/*****************************************************************************/
+/* Function Definition                                                      */
+/*****************************************************************************/
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Planar intra prediction using the neighbouring reference samples pointed
+* to by 'pu1_ref', writing to the TU block pointed to by 'pu1_dst'. Refer
+* to section 8.4.4.2.4 of the standard.
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_planar_ssse3(UWORD8 *pu1_ref,
+                                          WORD32 src_strd,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 dst_strd,
+                                          WORD32 nt,
+                                          WORD32 mode)
+{
+
+    WORD32 row, col;
+    WORD32 log2nt = 5;
+    WORD32 two_nt, three_nt;
+
+    __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+    __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b;
+    UNUSED(src_strd);
+    UNUSED(mode);
+    switch(nt)
+    {
+        case 16:
+            log2nt = 4;
+            break;
+        case 8:
+            log2nt = 3;
+            break;
+        case 4:
+            log2nt = 2;
+            break;
+        default:
+            break;
+    }
+    two_nt = 2 * nt;
+    three_nt = 3 * nt;
+
+    /* Planar filtering */
+
+/* setting values in registers */
+
+//  pu1_ref[2*(two_nt - 1 - row)]
+//  pu1_ref[2 * (three_nt + 1)]
+//  pu1_ref[2 * (two_nt + 1) + col]
+//  pu1_ref[2 * (nt - 1)]
+
+    const_temp_4x32b  = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1],
+                                      pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)],
+                                      pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]);
+
+    const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)],
+                                      pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]);
+
+    const_temp4_4x32b = _mm_set1_epi16(nt - 1);
+    const_temp6_4x32b = _mm_set1_epi16(nt);
+    const_temp7_4x32b = _mm_set1_epi16(4);
+
+    zero_8x16b = _mm_set1_epi32(0);
+
+
+    if(nt % 4 == 0)
+    {
+        const_temp7_4x32b = _mm_set1_epi16(4);
+
+        for(row = 0; row < nt; row++)
+        {
+            __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
+            __m128i res_temp3_8x16b;
+
+            const_temp2_4x32b  = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1],
+                                               pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)],
+                                               pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]);
+
+            const_temp3_4x32b  = _mm_set1_epi16((row + 1));
+            row_8x16b = _mm_set1_epi16((nt - 1 - row));
+
+            const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0);
+            col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
+
+            const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);
+
+            /*(row + 1) * pu1_ref[nt - 1]*/
+            res_temp_8x16b  = _mm_mullo_epi16(const_temp3_4x32b,  const_temp1_4x32b);
+
+            /*(row + 1) * pu1_ref[nt - 1] + nt)*/
+            res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+
+            for(col = 0; col < 2 * nt; col += 8)
+            {
+                __m128i src_temp_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));
+
+                //src_temp_8x16b =  _mm_cvtepu8_epi16 (src_temp_8x16b); /* row=0*/
+                src_temp_8x16b = _mm_unpacklo_epi8(src_temp_8x16b, zero_8x16b);
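+
+                /* Zero-extending by unpacking with a zero register is the
+                 * pre-SSE4.1 replacement for the _mm_cvtepu8_epi16() used in
+                 * the sse42 variant (kept above as a commented-out
+                 * reference). */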
+
+                /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
+                res_temp1_8x16b  = _mm_mullo_epi16(src_temp_8x16b,  row_8x16b);
+
+                /*(col + 1) * pu1_ref[three_nt + 1]*/
+                res_temp2_8x16b  = _mm_mullo_epi16(const_temp_4x32b,  col_8x16b);
+
+                /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
+                res_temp3_8x16b  = _mm_mullo_epi16(const_temp2_4x32b,  const_temp5_4x32b);
+
+                res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
+                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);
+
+                res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1));
+                res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b);
+
+                const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
+                col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
+            } /* inner loop ends here */
+        }
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intra prediction for DC mode using the neighbouring reference samples
+* pointed to by 'pu1_ref', writing to the TU block pointed to by 'pu1_dst'.
+* Refer to section 8.4.4.2.5 of the standard.
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size (Chroma)
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_dc_ssse3(UWORD8 *pu1_ref,
+                                      WORD32 src_strd,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 dst_strd,
+                                      WORD32 nt,
+                                      WORD32 mode)
+{
+
+    WORD32 acc_dc_u, acc_dc_v;
+    WORD32 dc_val_u, dc_val_v;
+    WORD32 row;
+    WORD32 log2nt = 5;
+    __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
+    __m128i src_temp7, src_temp8, src_temp9, src_temp10;
+    __m128i m_zero = _mm_set1_epi32(0);
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    switch(nt)
+    {
+        case 32:
+            log2nt = 5;
+            break;
+        case 16:
+            log2nt = 4;
+            break;
+        case 8:
+            log2nt = 3;
+            break;
+        case 4:
+            log2nt = 2;
+            break;
+        default:
+            break;
+    }
+
+    acc_dc_u = 0;
+    acc_dc_v = 0;
+
+    /* Calculate DC value for the transform block */
+
+    m_mask = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);
+
+    if(nt == 16)
+    {
+        __m128i temp_sad, sign_8x16b;
+
+        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
+        src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
+        src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));
+
+        src_temp5  = _mm_unpacklo_epi8(src_temp3, m_zero);
+        src_temp6  = _mm_unpacklo_epi8(src_temp4, m_zero);
+        src_temp9  = _mm_unpacklo_epi8(src_temp7, m_zero);
+        src_temp10 = _mm_unpacklo_epi8(src_temp8, m_zero);
+
+        src_temp3 = _mm_srli_si128(src_temp3, 8);
+        src_temp4 = _mm_srli_si128(src_temp4, 8);
+        src_temp7 = _mm_srli_si128(src_temp7, 8);
+        src_temp8 = _mm_srli_si128(src_temp8, 8);
+
+        src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
+        src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);
+        src_temp7 = _mm_unpacklo_epi8(src_temp7, m_zero);
+        src_temp8 = _mm_unpacklo_epi8(src_temp8, m_zero);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
+        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
+        src_temp10 = _mm_add_epi16(src_temp9, src_temp10);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+        src_temp8 = _mm_add_epi16(src_temp8, src_temp10);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
+        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
+        src_temp4  = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
+
+        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
+        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
+    }
+
+    else if(nt == 8)
+    {
+        __m128i temp_sad, sign_8x16b;
+        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
+
+        src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
+        src_temp6 = _mm_unpacklo_epi8(src_temp4, m_zero);
+
+        src_temp3 = _mm_srli_si128(src_temp3, 8);
+        src_temp4 = _mm_srli_si128(src_temp4, 8);
+
+        src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
+        src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
+        src_temp4  = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
+
+        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
+        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
+    }
+
+    else if(nt == 4)
+    {
+        __m128i temp_sad, sign_8x16b;
+        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+
+        src_temp5 =  _mm_unpacklo_epi8(src_temp3, m_zero);
+        src_temp4 = _mm_srli_si128(src_temp3, 8);
+
+        src_temp4 =  _mm_unpacklo_epi8(src_temp4, m_zero);
+
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
+
+        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
+        src_temp4  = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
+
+        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
+        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
+    }
+
+
+    acc_dc_u += pu1_ref[6 * nt];
+    acc_dc_v += pu1_ref[6 * nt + 1];
+
+    acc_dc_u -= pu1_ref[4 * nt];
+    acc_dc_v -= pu1_ref[4 * nt + 1];
+
+    dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
+    dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);
+
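+    /* Pack the 8-bit U and V DC values into one 16-bit lane, so that the */
+    /* _mm_set1_epi16() calls below replicate the interleaved UV pair.    */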
+    dc_val_u = dc_val_u | (dc_val_v << 8);
+
+    /* Fill the remaining rows with DC value*/
+
+    if(nt == 4)
+    {
+        src_temp1 = _mm_set1_epi16(dc_val_u);
+
+        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
+        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+
+    }
+    else if(nt == 8)
+    {
+        src_temp1 = _mm_set1_epi16(dc_val_u);
+
+        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
+        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+
+        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+
+    }
+
+    else /* nt == 16 */
+    {
+        src_temp1 = _mm_set1_epi16(dc_val_u);
+
+        for(row = 0; row < nt; row += 8)
+        {
+            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
+            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);
+
+            pu1_dst += 8 * dst_strd;
+        }
+    }
+
+}
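+
+/* A minimal scalar sketch (not part of the original source) of what the
+ * SIMD function above computes, assuming the interleaved UV reference
+ * layout implied by the vector code: the nt left neighbour pairs start at
+ * byte 2 * nt and the nt top neighbour pairs at byte 4 * nt + 2.
+ */
+#if 0
+static void chroma_dc_scalar_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                    WORD32 dst_strd, WORD32 nt, WORD32 log2nt)
+{
+    WORD32 acc_u = 0, acc_v = 0, dc_u, dc_v, i, row, col;
+
+    /* Accumulate U (even bytes) and V (odd bytes) over left + top */
+    for(i = 0; i < nt; i++)
+    {
+        acc_u += pu1_ref[2 * nt + 2 * i]     + pu1_ref[4 * nt + 2 + 2 * i];
+        acc_v += pu1_ref[2 * nt + 2 * i + 1] + pu1_ref[4 * nt + 2 + 2 * i + 1];
+    }
+
+    /* Round and divide by the 2 * nt accumulated samples per plane */
+    dc_u = (acc_u + nt) >> (log2nt + 1);
+    dc_v = (acc_v + nt) >> (log2nt + 1);
+
+    /* Replicate the UV pair over the whole block */
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+        {
+            pu1_dst[row * dst_strd + 2 * col]     = (UWORD8)dc_u;
+            pu1_dst[row * dst_strd + 2 * col + 1] = (UWORD8)dc_v;
+        }
+}
+#endif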
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Horizontal intraprediction (mode 10), with the neighboring reference
+* samples pointed to by 'pu1_ref' and the TU block pointed to by 'pu1_dst'.
+* Refer to section 8.4.4.2.6 of the standard (special case).
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_horz_ssse3(UWORD8 *pu1_ref,
+                                        WORD32 src_strd,
+                                        UWORD8 *pu1_dst,
+                                        WORD32 dst_strd,
+                                        WORD32 nt,
+                                        WORD32 mode)
+{
+
+    WORD32 row;
+    __m128i temp1, temp2, temp3, temp4, temp5, temp6,  temp7, temp8;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    /* Replication to next rows*/
+
+    if(nt == 8)
+    {
+        for(row = 0; row < nt; row += 4)
+        {
+            temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]);
+            temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]);
+            temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]);
+            temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]);
+            temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]);
+            temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]);
+            temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]);
+            temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]);
+
+            temp2 = _mm_unpacklo_epi8(temp1, temp2);
+            temp4 = _mm_unpacklo_epi8(temp3, temp4);
+            temp6 = _mm_unpacklo_epi8(temp5, temp6);
+            temp8 = _mm_unpacklo_epi8(temp7, temp8);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), temp4);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), temp6);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), temp8);
+
+        }
+    }
+    else if(nt == 16)
+    {
+        for(row = 0; row < nt; row += 4)
+        {
+            temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]);
+            temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]);
+
+            temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]);
+            temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]);
+
+            temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]);
+            temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]);
+
+            temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]);
+            temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]);
+
+            temp2 = _mm_unpacklo_epi8(temp1, temp2);
+            temp4 = _mm_unpacklo_epi8(temp3, temp4);
+            temp6 = _mm_unpacklo_epi8(temp5, temp6);
+            temp8 = _mm_unpacklo_epi8(temp7, temp8);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 0), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 16), temp2);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 0), temp4);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 16), temp4);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 0), temp6);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 16), temp6);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 0), temp8);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 16), temp8);
+
+
+        }
+    }
+    else
+    {
+        temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 0]);
+        temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 0]);
+
+        temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 1]);
+        temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 1]);
+
+        temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 2]);
+        temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 2]);
+
+        temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 3]);
+        temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 3]);
+
+        temp2 = _mm_unpacklo_epi8(temp1, temp2);
+        temp4 = _mm_unpacklo_epi8(temp3, temp4);
+        temp6 = _mm_unpacklo_epi8(temp5, temp6);
+        temp8 = _mm_unpacklo_epi8(temp7, temp8);
+
+        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), temp2);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), temp4);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), temp6);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), temp8);
+    }
+}
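+
+/* A minimal scalar sketch (not part of the original source) of the
+ * replication performed above: every destination row repeats the
+ * corresponding left neighbour UV pair, read backwards from byte 4 * nt - 2.
+ */
+#if 0
+{
+    WORD32 row, col;
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+        {
+            pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[4 * nt - 2 - 2 * row];
+            pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[4 * nt - 1 - 2 * row];
+        }
+}
+#endif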
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Vertical intraprediction (mode 26), with the neighboring reference
+* samples pointed to by 'pu1_ref' and the TU block pointed to by 'pu1_dst'.
+* Refer to section 8.4.4.2.6 of the standard (special case).
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_ver_ssse3(UWORD8 *pu1_ref,
+                                       WORD32 src_strd,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 dst_strd,
+                                       WORD32 nt,
+                                       WORD32 mode)
+{
+    __m128i src_temp1;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    /* Replication to next columns*/
+    if(nt == 8)
+    {
+        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
+
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp1);
+
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp1);
+
+    }
+    else if(nt == 16)
+    {
+        __m128i temp1, temp2;
+
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
+        temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 16));
+
+        /*  pu1_dst[(row * dst_strd) + col] = pu1_ref[(4 * nt) + 2 + col];*/
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
+
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
+
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
+        _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
+
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
+        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
+
+    }
+    else
+    {
+        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
+
+        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+
+
+    }
+
+}
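+
+/* A minimal scalar sketch (not part of the original source): vertical mode
+ * is a row-wise copy of the 2 * nt bytes of top neighbours starting at
+ * byte 4 * nt + 2.
+ */
+#if 0
+{
+    WORD32 row, col;
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < 2 * nt; col++)
+            pu1_dst[row * dst_strd + col] = pu1_ref[4 * nt + 2 + col];
+}
+#endif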
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intraprediction for mode 2 (south-west angle), with the neighboring
+* reference samples pointed to by 'pu1_ref' and the TU block pointed to by
+* 'pu1_dst'. Refer to section 8.4.4.2.6 of the standard.
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_mode2_ssse3(UWORD8 *pu1_ref,
+                                         WORD32 src_strd,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 dst_strd,
+                                         WORD32 nt,
+                                         WORD32 mode)
+{
+    WORD32 row, col;
+
+
+    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8, sm2, sm3;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    sm2 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
+    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY8[0]);
+
+    /* Mode 2 is a pure 45-degree diagonal (intra_pred_ang = 32 in Q5), so */
+    /* whole UV pairs are replicated along the angle; no interpolation     */
+
+    if(nt == 4)
+    {
+        /*pu1_ref[two_nt - row - (col+1) - 1]*/
+        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 8 - 2));
+        src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 8 - 2));
+        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 8 - 2));
+        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 8 - 2));
+
+        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm2));
+        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm2));
+        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm2));
+        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm2));
+
+    }
+    else if(nt == 8)
+    {
+        /*pu1_ref[two_nt - row - (col+1) - 1]*/
+        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 16 - 2));
+        src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 16 - 2));
+        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 16 - 2));
+        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 16 - 2));
+        src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 4 - 16 - 2));
+        src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 5 - 16 - 2));
+        src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 6 - 16 - 2));
+        src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 7 - 16 - 2));
+
+        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
+        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
+        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
+        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
+        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
+        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
+        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
+        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));
+
+
+    }
+    else
+    {
+        for(row = 0; row < nt; row += 8)
+        {
+            for(col = 0; col < 2 * nt; col += 16)
+            {   /*pu1_ref[two_nt - row - (col+1) - 1]*/
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 0) - (col + 16) - 2));
+                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 1) - (col + 16) - 2));
+                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 2) - (col + 16) - 2));
+                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 3) - (col + 16) - 2));
+                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 4) - (col + 16) - 2));
+                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 5) - (col + 16) - 2));
+                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 6) - (col + 16) - 2));
+                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 7) - (col + 16) - 2));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 0) * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 1) * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 2) * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 3) * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 4) * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 5) * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 6) * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 7) * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));
+            }
+        }
+    }
+}
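+
+/* A minimal scalar sketch (not part of the original source) of the mode-2
+ * mapping realised above by the reversing shuffles, with the byte indices
+ * inferred from the vector loads: destination pair (row, col) copies the
+ * reference pair (row + col + 2) positions before the top-left pair.
+ */
+#if 0
+{
+    WORD32 row, col;
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+        {
+            pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[4 * nt - 2 * (row + col) - 4];
+            pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[4 * nt - 2 * (row + col) - 3];
+        }
+}
+#endif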
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intraprediction for mode 34 (north-east angle) and mode 18 (north-west
+* angle), with the neighboring reference samples pointed to by 'pu1_ref'
+* and the TU block pointed to by 'pu1_dst'.
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_mode_18_34_ssse3(UWORD8 *pu1_ref,
+                                              WORD32 src_strd,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 dst_strd,
+                                              WORD32 nt,
+                                              WORD32 mode)
+{
+    WORD32 row;
+    WORD32 idx = 0;
+
+    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
+    UNUSED(src_strd);
+
+    if(mode == 34)
+    {
+        if(nt == 4)
+        {
+            /*pu1_ref[two_nt + col + idx + 1]*/
+            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
+
+        }
+        else if(nt == 8)
+        {
+            /*pu1_ref[two_nt + col + idx + 1]*/
+            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
+
+
+        }
+        else
+        {
+            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+            for(row = 0; row < nt; row += 8)
+            {
+                /*pu1_ref[two_nt + col + idx + 1]*/
+                src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
+
+                src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
+
+                pu1_ref += 2 * 8;
+                pu1_dst += 8 * dst_strd;
+            }
+        }
+    }
+    else
+    {
+        if(nt == 4)
+        {
+            /*pu1_ref[two_nt + col + idx + 1]*/
+            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
+
+
+        }
+        else if(nt == 8)
+        {
+            /*pu1_ref[two_nt + col + idx + 1]*/
+            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
+            src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
+
+
+        }
+        else
+        {
+            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+            for(row = 0; row < nt; row += 8)
+            {
+                /*pu1_ref[two_nt + col + idx + 1]*/
+                src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
+
+                src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+                src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) +  0 + (4 * nt) + 2 * idx + 2));
+                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
+
+                pu1_ref -= 2 * 8;
+                pu1_dst += 8 * dst_strd;
+            }
+        }
+    }
+
+}
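+
+/* A minimal scalar sketch (not part of the original source): both modes
+ * copy whole UV pairs with a per-row offset of one pair (idx = +/-(row+1)
+ * for modes 34/18 respectively), so no fractional interpolation is needed.
+ * The byte offsets are inferred from the vector loads above.
+ */
+#if 0
+{
+    WORD32 row, col;
+    WORD32 step = (mode == 34) ? 1 : -1;
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < 2 * nt; col++)
+            pu1_dst[row * dst_strd + col] =
+                pu1_ref[4 * nt + 2 + 2 * step * (row + 1) + col];
+}
+#endif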
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intraprediction for modes 3 to 9 (positive angle, horizontal modes),
+* with the neighboring reference samples pointed to by 'pu1_ref' and the
+* TU block pointed to by 'pu1_dst'.
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 dst_strd,
+                                               WORD32 nt,
+                                               WORD32 mode)
+{
+    WORD32 row, col;
+
+    WORD32 intra_pred_ang;
+
+    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+    __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
+    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm1;
+    UNUSED(src_strd);
+
+    /* Intra Pred Angle according to the mode */
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, each destination sample is       */
+    /* interpolated between 2 neighboring samples, weighted by distance   */
+
+    sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
+    const_temp_4x32b  = _mm_set1_epi16(16);
+    const_temp2_4x32b = _mm_set1_epi32(31);
+    const_temp3_4x32b = _mm_set1_epi16(32);
+    const_temp4_4x32b = _mm_set1_epi32(4);
+
+    two_nt_4x32b = _mm_set1_epi32(1);
+
+    zero_8x16b = _mm_set1_epi16(0);
+
+
+    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+    row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+    if(nt == 4)
+    {
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(4);
+        two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);
+
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
+            __m128i src_values10;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b,  5);
+
+            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b,  ref_main_idx_4x32b);
+
+            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
+            _mm_storel_epi64((__m128i *)(ai1_src_temp_val),  src_values10);
+
+            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
+            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
+            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
+            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
+
+            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
+            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
+            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
+            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
+
+            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 8)); /* col=0*/
+                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 8)); /* col=1*/
+                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 8)); /* col=2*/
+                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 8)); /* col=3*/
+
+                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
+                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
+                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
+                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
+
+                src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
+                src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
+                src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
+                src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
+                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
+                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
+
+                src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
+                src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
+                src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
+                src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);
+
+                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
+
+                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b);             /* row=0*/
+
+                src_temp2_8x16b  = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b);       /* row=1*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b);       /* row=2*/
+
+                src_temp4_8x16b  = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b);       /* row=3*/
+
+            }
+        }
+    }
+    else
+    {
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);
+
+        for(col = 0; col < 2 * nt; col += 16)
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+            WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
+            __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b,  5);
+
+            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b,  ref_main_idx_4x32b);
+
+            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
+            _mm_storeu_si128((__m128i *)(ai1_src_temp_val),  src_values10);
+
+            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
+            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
+            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
+            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
+
+            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
+            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
+            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
+            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
+
+            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]);  /* col=4*/
+            fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]);  /* col=5*/
+            fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]);  /* col=6*/
+            fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]);  /* col=7*/
+
+            temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]);  /* col=4*/
+            temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]);  /* col=5*/
+            temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]);  /* col=6*/
+            temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]);  /* col=7*/
+
+            temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
+            temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
+            temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
+            temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
+
+            for(row = 0; row < nt; row += 4)
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - row - (8 + row))); /* col=0*/
+                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - row - (8 + row))); /* col=1*/
+                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - row - (8 + row))); /* col=2*/
+                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - row - (8 + row))); /* col=3*/
+
+                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
+                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
+                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
+                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
+
+                src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
+                src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
+                src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
+                src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - row - row - 8)); /* col=4*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - row - row - 8)); /* col=5*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - row - row - 8)); /* col=6*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - row - row - 8)); /* col=7*/
+
+                src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=4*/
+                src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=5*/
+                src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=6*/
+                src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=7*/
+
+                src_temp11_8x16b =  _mm_unpacklo_epi8(src_temp11_8x16b, src_temp15_8x16b); /* col=4*/
+                src_temp12_8x16b =  _mm_unpacklo_epi8(src_temp12_8x16b, src_temp16_8x16b); /* col=5*/
+                src_temp13_8x16b =  _mm_unpacklo_epi8(src_temp13_8x16b, src_temp17_8x16b); /* col=6*/
+                src_temp14_8x16b =  _mm_unpacklo_epi8(src_temp14_8x16b, src_temp18_8x16b); /* col=7*/
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
+                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
+                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
+
+                src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
+                src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
+                src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
+                src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=5*/
+                src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=6*/
+                src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=7*/
+
+                src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm1);
+                src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm1);
+                src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm1);
+                src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm1);
+
+                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
+
+                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);
+
+                src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);
+
+                src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);
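+                /* The unpacklo/unpackhi chain above transposes the eight   */
+                /* column-wise results back into row order, so each 128-bit */
+                /* store below writes one complete output row of 16 bytes.  */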
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)),    src_temp11_8x16b);          /* row=0*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b);       /* row=1*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b);       /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b);       /* row=3*/
+
+            }
+        }
+    }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intra prediction for modes 11 to 17 (negative angle, horizontal modes),
+* using the neighboring reference samples pointed to by 'pu1_ref' to
+* predict the TU block pointed to by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_11_to_17_ssse3(UWORD8 *pu1_ref,
+                                                 WORD32 src_strd,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 dst_strd,
+                                                 WORD32 nt,
+                                                 WORD32 mode)
+{
+    /* This function and ihevc_intra_pred_chroma_mode_19_to_25 are the same */
+    /* except for the ref main & side sample assignment; they can be        */
+    /* combined for optimization */
+
+    WORD32 row, col, k;
+    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+    WORD32 ref_idx;
+
+
+    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+    __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
+    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b;
+
+    UWORD8 ref_temp[2 * MAX_CU_SIZE + 2];
+    UWORD8 *ref_main;
+    UNUSED(src_strd);
+
+    inv_ang_sum = 128;
+
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+    /* Intermediate reference samples for negative angle modes */
+    /* These have to be removed during optimization */
+
+    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+
+
+    ref_main = ref_temp + 2 * nt;
+    for(k = 0; k < (2 * (nt + 1)); k += 2)
+    {
+        ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k];
+        ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k + 1];
+    }
+
+    ref_main = ref_temp + (2 * (nt - 1));
+    ref_idx = (nt * intra_pred_ang) >> 5;
+
+    /* SIMD optimization can be done using a look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+
+    for(k = -2; k > (2 * ref_idx); k -= 2)
+    {
+        inv_ang_sum += inv_ang;
+        ref_main[k] = pu1_ref[(4 * nt) + ((inv_ang_sum >> 8) << 1)];
+        ref_main[k + 1] = pu1_ref[((4 * nt) + 1) + ((inv_ang_sum >> 8) << 1)];
+    }
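+    /* Worked example (illustrative): for mode 17 the angle table gives    */
+    /* intra_pred_ang = -26 and inv_ang = 315 (~8192/26, per the spec's    */
+    /* inverse-angle table). With nt = 4, ref_idx = (4 * -26) >> 5 = -4,   */
+    /* so the loop above projects three chroma pairs (k = -2, -4, -6); at  */
+    /* k = -2, inv_ang_sum = 128 + 315 = 443, 443 >> 8 = 1, so             */
+    /* ref_main[-2] and ref_main[-1] come from pu1_ref[4*nt + 2] and       */
+    /* pu1_ref[4*nt + 3].                                                  */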
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by distance, to obtain each destination sample */
+
+    const_temp_4x32b  = _mm_set1_epi16(16);
+    const_temp2_4x32b = _mm_set1_epi32(31);
+    const_temp3_4x32b = _mm_set1_epi16(32);
+    const_temp4_4x32b = _mm_set1_epi32(4);
+
+    two_nt_4x32b = _mm_set1_epi32(1);
+
+    zero_8x16b = _mm_set1_epi16(0);
+
+
+    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+    row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+    if(nt == 4)
+    {
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(4);
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
+            __m128i src_values10;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
+            _mm_storel_epi64((__m128i *)(ai1_src_temp_val),  src_values10);
+
+            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
+            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
+            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
+            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
+
+            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
+            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
+            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
+            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
+
+            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
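+            /* Each 16-bit lane of tempN_8x16b now holds the byte pair        */
+            /* (32 - fract, fract); _mm_maddubs_epi16 multiplies these signed */
+            /* weights with the unsigned byte pairs (ref[i], ref[i + 2])      */
+            /* interleaved below and sums each pair of products, i.e. it      */
+            /* computes (32 - fract)*ref[i] + fract*ref[i + 2] in one step.   */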
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
+                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
+                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
+                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
+
+                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
+                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
+                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
+                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
+
+                src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
+                src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
+                src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
+                src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
+                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
+                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
+
+                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b);             /* row=0*/
+
+                src_temp2_8x16b  = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b);       /* row=1*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b);       /* row=2*/
+
+                src_temp4_8x16b  = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
+            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b);       /* row=3*/
+
+            }
+        }
+    }
+    else
+    {
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+        for(col = 0; col < 2 * nt; col += 16)
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+            WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
+            __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
+            _mm_storeu_si128((__m128i *)(ai1_src_temp_val),  src_values10);
+
+            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
+            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
+            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
+            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
+
+            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
+            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
+            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
+            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
+
+            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]);   /* col=4*/
+            fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]);  /* col=5*/
+            fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]);  /* col=6*/
+            fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]);  /* col=7*/
+
+            temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]);   /* col=4*/
+            temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]);  /* col=5*/
+            temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]);  /* col=6*/
+            temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]);  /* col=7*/
+
+            temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
+            temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
+            temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
+            temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
+
+            for(row = 0; row < nt; row += 4)
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row + row)); /* col=0*/
+                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row + row)); /* col=1*/
+                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row + row)); /* col=2*/
+                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row + row)); /* col=3*/
+
+                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
+                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
+                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
+                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
+
+                src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
+                src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
+                src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
+                src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row + row)); /* col=4*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row + row)); /* col=5*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row + row)); /* col=6*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row + row)); /* col=7*/
+
+                src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=4*/
+                src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=5*/
+                src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=6*/
+                src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=7*/
+
+                src_temp11_8x16b =  _mm_unpacklo_epi8(src_temp15_8x16b, src_temp11_8x16b); /* col=4*/
+                src_temp12_8x16b =  _mm_unpacklo_epi8(src_temp16_8x16b, src_temp12_8x16b); /* col=5*/
+                src_temp13_8x16b =  _mm_unpacklo_epi8(src_temp17_8x16b, src_temp13_8x16b); /* col=6*/
+                src_temp14_8x16b =  _mm_unpacklo_epi8(src_temp18_8x16b, src_temp14_8x16b); /* col=7*/
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
+                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
+                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=5*/
+                src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=6*/
+                src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=7*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
+
+                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);
+
+                src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);
+
+                src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)),    src_temp11_8x16b);          /* row=0*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b);       /* row=1*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b);       /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b);       /* row=3*/
+
+            }
+        }
+    }
+}
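+
+/* Illustrative scalar equivalent of the SSSE3 kernel above (an editor's   */
+/* sketch, not part of the original submission and kept out of the build): */
+/* per column, the fractional position selects an interleaved U/V pair in  */
+/* the projected reference, which is blended 2-tap with the next pair. It  */
+/* assumes ref_main, including the projected negative-index samples, has   */
+/* been set up exactly as in the function above.                           */
+#if 0
+static void ihevc_intra_pred_chroma_mode_11_to_17_scalar_sketch(UWORD8 *ref_main,
+                                                                UWORD8 *pu1_dst,
+                                                                WORD32 dst_strd,
+                                                                WORD32 nt,
+                                                                WORD32 intra_pred_ang)
+{
+    WORD32 row, col;
+    for(col = 0; col < nt; col++)
+    {
+        WORD32 pos   = (col + 1) * intra_pred_ang; /* horizontal mode: angle runs per column */
+        WORD32 idx   = pos >> 5;
+        WORD32 fract = pos & 31;
+        for(row = 0; row < nt; row++)
+        {
+            /* byte index of the U sample, matching 2*idx + 2 (+ 2*row) above */
+            WORD32 k = 2 * (idx + row) + 2;
+            pu1_dst[row * dst_strd + 2 * col] =
+                ((32 - fract) * ref_main[k] + fract * ref_main[k + 2] + 16) >> 5;
+            pu1_dst[row * dst_strd + 2 * col + 1] =
+                ((32 - fract) * ref_main[k + 1] + fract * ref_main[k + 3] + 16) >> 5;
+        }
+    }
+}
+#endif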
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intra prediction for modes 19 to 25 (negative angle, vertical modes),
+* using the neighboring reference samples pointed to by 'pu1_ref' to
+* predict the TU block pointed to by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
+                                                 WORD32 src_strd,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 dst_strd,
+                                                 WORD32 nt,
+                                                 WORD32 mode)
+{
+    WORD32 row, k;
+    WORD32 intra_pred_ang, idx;
+    WORD32 inv_ang, inv_ang_sum, pos, fract;
+    WORD32 ref_main_idx, ref_idx;
+    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 2];
+    UWORD8 *ref_main;
+
+    __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
+    UNUSED(src_strd);
+
+    intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
+    inv_ang = gai4_ihevc_inv_ang_table_chroma[mode - 12];
+
+    /* Intermediate reference samples for negative angle modes */
+    /* These have to be removed during optimization */
+    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+    ref_main = ref_temp + 2 * nt;
+    for(k = 0; k < (2 * (nt + 1)); k += 2)
+    {
+        ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k];
+        ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k + 1];
+    }
+
+    ref_idx = (nt * intra_pred_ang) >> 5;
+    inv_ang_sum = 128;
+    ref_main = ref_temp + (2 * (nt - 1));
+    /* SIMD optimization can be done using a look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+    for(k = -2; k > (2 * ref_idx); k -= 2)
+    {
+        inv_ang_sum += inv_ang;
+        ref_main[k] = pu1_ref[(4 * nt) - (inv_ang_sum >> 8) * 2];
+        ref_main[k + 1] = pu1_ref[((4 * nt) + 1) - (inv_ang_sum >> 8) * 2];
+    }
+
+    const_temp_8x16b = _mm_set1_epi16(16);
+
+    if(nt == 4) /* if nt == 4 */
+    {
+        __m128i const_temp2_4x32b, const_temp3_4x32b;
+        __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+        const_temp2_4x32b = _mm_set1_epi32(31);
+        const_temp3_4x32b = _mm_set1_epi32(32);
+
+        two_nt_4x32b = _mm_set1_epi32(2);
+
+        zero_8x16b = _mm_set1_epi16(0);
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
+        {
+            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+            WORD8  ai1_src_temp0_val[16], ai1_src_temp1_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
+            __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+            sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+            res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
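+            /* sign-extend the 16-bit positions to 32 bits without SSE4.1:  */
+            /* cmpgt against zero yields the sign mask, and interleaving it */
+            /* with the low halves reproduces _mm_cvtepi16_epi32            */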
+
+            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
+            src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b,  5));
+
+            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
+            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
+            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
+            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
+            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
+            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+            _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
+            _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);
+
+            fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]);  /* row=0*/
+            fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]);  /* row=1*/
+            fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]);  /* row=2*/
+            fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]);  /* row=3*/
+
+            temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]);  /* row=0*/
+            temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]);  /* row=1*/
+            temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]);  /* row=2*/
+            temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]);  /* row=3*/
+
+            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
+
+// inner loop starts from here
+            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));  /* row = 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2));  /* row = 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3));  /* row = 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4));  /* row = 3 */
+
+            src_values10 = _mm_srli_si128(src_values0, 2);
+            src_values11 = _mm_srli_si128(src_values1, 2);
+            src_values12 = _mm_srli_si128(src_values2, 2);
+            src_values13 = _mm_srli_si128(src_values3, 2);
+
+            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0);       /* row=0*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1);   /* row=1*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2);   /* row=2*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3);   /* row=3*/
+
+        }
+    }
+    else if(nt == 8) /* for nt = 8 case */
+    {
+        WORD32 ref_main_idx1, fract1, temp, temp1;
+        __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;
+
+        zero_8x16b = _mm_set1_epi16(0);
+
+        for(row = 0; row < nt; row += 2)
+        {
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i  src_values10, src_values11, src_values12, src_values13;
+
+            pos = ((row + 1) * intra_pred_ang);
+            idx = pos >> 5;
+            fract = pos & (31);
+            temp = 32 - fract;
+            ref_main_idx = 2 * idx + 2; /* col from 0-15 */
+
+            pos = ((row + 2) * intra_pred_ang);
+            idx = pos >> 5;
+            fract1 = pos & (31);
+            temp1 = 32 - fract1;
+            ref_main_idx1 = 2 * idx + 2; /* col from 0-15 */
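+            /* Illustrative arithmetic: with an angle of -26 and row = 0,  */
+            /* pos = -26, idx = -26 >> 5 = -1 (arithmetic shift) and       */
+            /* fract = -26 & 31 = 6, since -26 = (-1 * 32) + 6; hence      */
+            /* ref_main_idx = 2*(-1) + 2 = 0, and the row is interpolated  */
+            /* from ref_main with weights 26 and 6                         */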
+
+            fract_8x16b  = _mm_set1_epi8(fract);
+            fract1_8x16b = _mm_set1_epi8(fract1);
+            temp_8x16b   = _mm_set1_epi8(temp);
+            temp1_8x16b  = _mm_set1_epi8(temp1);
+
+            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
+            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+
+            /* row=0 */
+            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx));     /* col = 0-7   */
+            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8));   /* col = 8-15  */
+
+            /* row=1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));   /* col = 0-7  */
+            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1 + 8));  /* col = 8-15 */
+
+            src_values10 = _mm_srli_si128(src_values0, 2);
+            src_values11 = _mm_srli_si128(src_values1, 2);
+            src_values12 = _mm_srli_si128(src_values2, 2);
+            src_values13 = _mm_srli_si128(src_values3, 2);
+
+            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
+
+            src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+
+            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+            /* storing 8-bit pixel values */
+            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);
+
+            pu1_dst += 2 * dst_strd;
+        }
+    }
+    else if(nt == 16)
+    {
+        WORD32 temp;
+        /* unroll the col loop (inner) */
+        zero_8x16b = _mm_set1_epi16(0);
+
+        for(row = 0; row < nt; row += 1)
+        {
+            __m128i  src_values0, src_values1, src_values2, src_values3, temp_8x16b;
+            __m128i  src_values10, src_values11, src_values12, src_values13;
+
+            pos = ((row + 1) * intra_pred_ang);
+            idx = pos >> 5;
+            fract = pos & (31);
+            temp = 32 - fract;
+            ref_main_idx = 2 * idx + 2; /* col from 0-31 */
+
+            fract_8x16b = _mm_set1_epi8(fract);
+            temp_8x16b  = _mm_set1_epi8(temp);
+
+            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx));     /* col = 0-7   */
+            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8));   /* col = 8-15  */
+            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 16));  /* col = 16-23 */
+            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 24));  /* col = 24-31 */
+
+            src_values10 = _mm_srli_si128(src_values0, 2);
+            src_values11 = _mm_srli_si128(src_values1, 2);
+            src_values12 = _mm_srli_si128(src_values2, 2);
+            src_values13 = _mm_srli_si128(src_values3, 2);
+
+            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+            /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+            /* storing 8-bit pixel values */
+            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
+            _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);
+
+            pu1_dst += dst_strd;
+
+        }
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Intra prediction for modes 27 to 33 (positive angle, vertical modes),
+* using the neighboring reference samples pointed to by 'pu1_ref' to
+* predict the TU block pointed to by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_mode_27_to_33_ssse3(UWORD8 *pu1_ref,
+                                                 WORD32 src_strd,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 dst_strd,
+                                                 WORD32 nt,
+                                                 WORD32 mode)
+{
+    WORD32 row;
+    WORD32 pos, fract;
+    WORD32 intra_pred_ang;
+    WORD32 idx, ref_main_idx;
+
+    __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
+    UNUSED(src_strd);
+
+    intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
+    const_temp_8x16b = _mm_set1_epi16(16);
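+
+    /* For positive angles no inverse projection is needed: the main       */
+    /* reference is read directly from pu1_ref starting at (4 * nt) + 2,   */
+    /* the first chroma pair above the block. E.g. nt = 8, row = 0,        */
+    /* angle +26: pos = 26, idx = 0, fract = 26, and                       */
+    /* ref_main_idx = 32 + 0 + 2 = 34 in the nt == 8 branch below.         */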
+
+    if(nt == 4) /* if nt == 4 */
+    {
+        __m128i const_temp2_4x32b, const_temp3_4x32b;
+        __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+        const_temp2_4x32b = _mm_set1_epi32(31);
+        const_temp3_4x32b = _mm_set1_epi32(32);
+
+        two_nt_4x32b = _mm_set1_epi32((4 * nt) + 2);
+
+        zero_8x16b = _mm_set1_epi16(0);
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
+
+        {
+            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+            WORD8  ai1_src_temp0_val[16], ai1_src_temp1_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
+            __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+            sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+            res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
+
+            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
+            src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b,  5));
+
+            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
+            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
+            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
+            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
+            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
+            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+            _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
+            _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);
+
+            fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]);  /* row=0*/
+            fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]);  /* row=1*/
+            fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]);  /* row=2*/
+            fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]);  /* row=3*/
+
+            temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]);  /* row=0*/
+            temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]);  /* row=1*/
+            temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]);  /* row=2*/
+            temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]);  /* row=3*/
+
+            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
+
+// inner loop starts from here
+            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));  /* row = 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2));  /* row = 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3));  /* row = 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4));  /* row = 3 */
+
+            src_values10 = _mm_srli_si128(src_values0, 2);
+            src_values11 = _mm_srli_si128(src_values1, 2);
+            src_values12 = _mm_srli_si128(src_values2, 2);
+            src_values13 = _mm_srli_si128(src_values3, 2);
+
+            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0);       /* row=0*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1);   /* row=1*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2);   /* row=2*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3);   /* row=3*/
+
+        }
+    }
+
+    else if(nt == 8) /* for nt = 8 case */
+    {
+        WORD32 ref_main_idx1, fract1, temp, temp1;
+        __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;
+
+        zero_8x16b = _mm_set1_epi16(0);
+
+        for(row = 0; row < nt; row += 2)
+        {
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i  src_values10, src_values11, src_values12, src_values13;
+
+            pos = ((row + 1) * intra_pred_ang);
+            idx = pos >> 5;
+            fract = pos & (31);
+            temp = 32 - fract;
+            ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-15 */
+
+            pos = ((row + 2) * intra_pred_ang);
+            idx = pos >> 5;
+            fract1 = pos & (31);
+            temp1 = 32 - fract1;
+            ref_main_idx1 = (4 * nt) + 2 * idx + 2; /* col from 0-15 */
+
+            fract_8x16b  = _mm_set1_epi8(fract);
+            fract1_8x16b = _mm_set1_epi8(fract1);
+            temp_8x16b   = _mm_set1_epi8(temp);
+            temp1_8x16b  = _mm_set1_epi8(temp1);
+
+            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
+            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+
+            /* row=0 */
+            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx));     /* col = 0-7   */
+            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8));   /* col = 8-15  */
+
+            /* row=1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));    /* col = 0-7  */
+            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 + 8));  /* col = 8-15 */
+
+            src_values10 = _mm_srli_si128(src_values0, 2);
+            src_values11 = _mm_srli_si128(src_values1, 2);
+            src_values12 = _mm_srli_si128(src_values2, 2);
+            src_values13 = _mm_srli_si128(src_values3, 2);
+
+            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
+
+            src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+
+            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+            /* storing 8-bit pixel values */
+            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);
+
+            pu1_dst += 2 * dst_strd;
+        }
+    }
+    else if(nt == 16)
+    {
+        WORD32 temp;
+        /* unroll the col loop (inner) */
+        zero_8x16b = _mm_set1_epi16(0);
+
+        for(row = 0; row < nt; row += 1)
+        {
+            __m128i  src_values0, src_values1, src_values2, src_values3, temp_8x16b;
+            __m128i  src_values10, src_values11, src_values12, src_values13;
+
+            pos = ((row + 1) * intra_pred_ang);
+            idx = pos >> 5;
+            fract = pos & (31);
+            temp = 32 - fract;
+            ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-31 */
+
+            fract_8x16b = _mm_set1_epi8(fract);
+            temp_8x16b  = _mm_set1_epi8(temp);
+
+            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx));     /* col = 0-7   */
+            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8));   /* col = 8-15  */
+            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 16));  /* col = 16-23 */
+            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 24));  /* col = 24-31 */
+
+            src_values10 = _mm_srli_si128(src_values0, 2);
+            src_values11 = _mm_srli_si128(src_values1, 2);
+            src_values12 = _mm_srli_si128(src_values2, 2);
+            src_values13 = _mm_srli_si128(src_values3, 2);
+
+            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+            /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+            /* storing 8-bit pixel values */
+            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
+            _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);
+
+            pu1_dst += dst_strd;
+
+        }
+    }
+}
diff --git a/common/x86/ihevc_deblk_ssse3_intr.c b/common/x86/ihevc_deblk_ssse3_intr.c
new file mode 100644
index 0000000..34ea090
--- /dev/null
+++ b/common/x86/ihevc_deblk_ssse3_intr.c
@@ -0,0 +1,1263 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_deblk_ssse3_intr.c
+*
+* @brief
+*  Contains function definitions for deblocking filters
+*
+* @author
+*  Rishab
+*
+* @par List of Functions:
+*   - ihevc_deblk_luma_vert_ssse3()
+*   - ihevc_deblk_luma_horz_ssse3()
+*   - ihevc_deblk_chroma_vert_ssse3()
+*   - ihevc_deblk_chroma_horz_ssse3()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_deblk.h"
+#include "ihevc_deblk_tables.h"
+#include "ihevc_debug.h"
+
+#include "ihevc_tables_x86_intr.h"
+
+#include <immintrin.h>
+/**
+*******************************************************************************
+*
+* @brief
+*       Decision process and filtering for the luma block vertical edge.
+*
+* @par Description:
+*     The decision process for the luma block vertical edge is  carried out and
+*     an appropriate filter is applied. The  boundary filter strength, bs should
+*     be greater than 0.  The pcm flags and the transquant bypass flags should
+*     be  taken care of by the calling function.
+*
+* @param[in] pu1_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] bs
+*  Boundary filter strength of q(0,0)
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] beta_offset_div2
+*  beta offset (slice level), divided by 2
+*
+* @param[in] tc_offset_div2
+*  tc offset (slice level), divided by 2
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @returns
+*  None
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src,
+                                 WORD32 src_strd,
+                                 WORD32 bs,
+                                 WORD32 quant_param_p,
+                                 WORD32 quant_param_q,
+                                 WORD32 beta_offset_div2,
+                                 WORD32 tc_offset_div2,
+                                 WORD32 filter_flag_p,
+                                 WORD32 filter_flag_q)
+{
+    WORD32 qp_luma, beta_indx, tc_indx;
+    WORD32 beta, tc;
+    WORD32 d, dp, dq, d_sam0, d_sam3;
+
+    WORD32 d3, d0, de_0, de_1, de_2, de_3;
+    WORD32 de, dep, deq;
+    __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b;
+
+
+    {
+        __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
+        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;
+
+
+
+        ASSERT((bs > 0) && (bs <= 3));
+        ASSERT(filter_flag_p || filter_flag_q);
+
+        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+        /* BS, based on the implementation, can take value 3 for an intra/inter edge    */
+        /* based on BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
+        /* for BS = 1 the adding factor is (0*2); for BS = 2 or 3 it is (1*2)            */
+        /* the above desired functionality is achieved by doing (2*(bs>>1))              */
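+        /* Worked example (illustrative): qp_luma = 30, bs = 2 and              */
+        /* tc_offset_div2 = 0 give tc_indx = 30 + 2*(2 >> 1) + 0 = 32;          */
+        /* with beta_offset_div2 = 0, beta_indx = 30                            */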
+
+        tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
+
+        beta = gai4_ihevc_beta_table[beta_indx];
+        tc = gai4_ihevc_tc_table[tc_indx];
+        if(0 == tc)
+        {
+            return;
+        }
+        src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
+        src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd));
+
+        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
+        mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));
+
+        src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b);
+        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b);
+
+        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b);
+
+
+        //generate all-ones, then the constant 1 in each 16-bit lane
+        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b);
+        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
+        //accumulating values for dp3, dq3, dp0, dq0
+        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);
+
+        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
+        // interleave to get (-1, 1) byte pairs in each 16-bit lane
+        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
+        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
+        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
+        //to get 1 in each 16-bit lane
+        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);
+
+
+        // dq3 dp3 dq0 dp0
+        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
+        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
+        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
+        // dq dp d3 d0
+        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
+        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
+        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
+        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
+        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
+
+        //extract each 32-bit result to a scalar
+        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
+        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
+        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);
+
+        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
+        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
+        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
+        dq = _mm_cvtsi128_si32(mask_16x8b);
+        //getting d
+        d = d0 + d3;
+
+        //extract each 32-bit result to a scalar
+        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
+        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
+        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);
+
+        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
+        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
+        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
+        de_3 = _mm_cvtsi128_si32(mask_16x8b);
+
+        de = 0;
+        dep = 0;
+        deq = 0;
+        if(d < beta)
+        {
+            d_sam0 = 0;
+            if((2 * d0 < (beta >> 2))
+                            && (de_2 < (beta >> 3))
+                            && (de_0 < ((5 * tc + 1) >> 1)))
+            {
+                d_sam0 = 1;
+            }
+
+            d_sam3 = 0;
+            if((2 * d3 < (beta >> 2))
+                            && (de_3 < (beta >> 3))
+                            && de_1 < ((5 * tc + 1) >> 1))
+            {
+                d_sam3 = 1;
+            }
+
+            de = (d_sam0 & d_sam3) + 1;
+            dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
+            deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
+            if(tc <= 1)
+            {
+                dep = 0;
+                deq = 0;
+            }
+        }
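+        /* Scalar summary (illustrative) of the decision above: filtering
+         * happens only when d = d0 + d3 < beta; the strong filter (de == 2)
+         * needs, for both edge rows checked, 2*d_i < (beta >> 2),
+         * |p3 - p0| + |q0 - q3| < (beta >> 3) and |p0 - q0| < (5*tc + 1) >> 1;
+         * dep/deq enable the p1/q1 taps of the weak filter when
+         * dp/dq < (beta + (beta >> 1)) >> 3 and tc > 1. */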
+
+    }
+
+    if(de != 0)
+    {
+
+
+        src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd));
+        src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd));
+
+        if(de == 2)
+        {
+            __m128i temp_pq_str0_16x8b;
+            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
+            __m128i temp_pq2_str0_16x8b;
+            __m128i temp_pq_str1_16x8b;
+            __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b;
+            __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b;
+            __m128i const2_8x16b, const2tc_8x16b;
+            LWORD64 mask, tc2;
+            tc = tc << 1;
+            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
+            tc2 = ((LWORD64)tc);
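+            /* Reference scalar form of the strong filter vectorized below
+             * (illustrative, cf. the HEVC deblocking process):
+             *   p0' = Clip3(p0 - 2*tc, p0 + 2*tc,
+             *               (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
+             *   p1' = Clip3(p1 - 2*tc, p1 + 2*tc, (p2 + p1 + p0 + q0 + 2) >> 2)
+             *   p2' = Clip3(p2 - 2*tc, p2 + 2*tc,
+             *               (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3)
+             * and symmetrically for q0', q1', q2'; tc was doubled above so
+             * the saturating add/sub below builds the +/-2*tc clip bounds. */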
+
+            const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b);
+            //q'0-q'1-2 ,p'0-p'1-2
+            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b);
+            src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b);
+
+            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
+            temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16);
+            temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16);
+            //arranged x x x x x x x x q31 q30 q11 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01
+            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
+            temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
+
+            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
+            //arranged q31 q30 q21 q20 q11 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
+            temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b);
+
+            temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b);
+
+            //q'1-2, p'1-2
+            temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8);
+            temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8);
+
+            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
+            temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
+
+            temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58);
+            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58);
+            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
+            temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b);
+            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
+            temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b);
+
+            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
+            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);
+
+            //clipping mask design
+            temp_str1_16x8b = _mm_setzero_si128();
+            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
+            const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
+            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
+            const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);
+
+            //clipping mask design
+            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
+            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
+            //calculating Clipping MAX for all pixel values.
+            temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b);
+            temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b);
+
+
+            //q'2-q'0-2,p'2-p'0-2
+            temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b);
+            temp_str3_16x8b     = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b);
+
+            temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c);
+            temp_str3_16x8b     = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c);
+
+            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
+            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02
+            temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b);
+
+            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b);
+
+            //calculating Clipping MIN for all pixel values.
+            temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b);
+            temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b);
+            //q'0-q'1-2 ,p'0-p'1-2
+            temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e);
+            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
+            //q'1-2 p'1-2
+            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
+            //to get 2 in 16 bit
+            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
+            //to get q33 q23 q13 q03, p33 p23 p13 p03
+            temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8);
+            temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8);
+            temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8);
+
+            //q'1, p'1 (adding 2)
+            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
+            //q'0-q'1,p'0-p'1
+            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b);
+            //q'2-q'1,p'2-p'1
+            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
+            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
+            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
+            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
+            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);
+
+            //normalisation of all modified pixels
+            temp_pq_str0_16x8b  = _mm_srai_epi16(temp_pq_str0_16x8b, 3);
+            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
+            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
+
+            //getting p0 p1 together and p2 p3 together
+            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
+            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b);
+            //getting q1 q0 together and  q3 q2 together
+            temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b);
+            temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b);
+            //getting p's of row0 row1 together and of row2 row3 together
+            temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b);
+            temp_str2_16x8b    = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b);
+            //getting q's of row0 row1 together and of row2 row3 together
+            temp_str0_16x8b    = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
+            temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
+            //getting values for respective rows in 16 bit
+            src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
+            src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
+            src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
+            src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
+            //packing values to 8 bit
+            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b);
+            src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b);
+            //Clipping MAX
+            src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b);
+            src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b);
+            //Clipping MIN
+            src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b);
+            src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b);
+            //separating row 2 and row 3
+            src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
+            src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8);
+
+        }
+
+        else
+        {
+
+            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b;
+            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b;
+            __m128i coefdelta_0_8x16b, mask_pq_8x16b;
+            __m128i const2_8x16b, consttc_8x16b;
+
+            LWORD64 mask1;
+            mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)filter_flag_q) << 47) | (((LWORD64)filter_flag_p) << 31) | (((LWORD64)(filter_flag_p & dep)) << 15);
+
+            consttc_8x16b = _mm_set1_epi32(tc);
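+            /* Reference scalar form of the weak filter vectorized below
+             * (illustrative, cf. the HEVC deblocking process):
+             *   delta = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4
+             *   if (abs(delta) < 10*tc) {
+             *     delta = Clip3(-tc, tc, delta);
+             *     p0' = Clip(p0 + delta);  q0' = Clip(q0 - delta);
+             *     if (dep) p1' = p1 + Clip3(-(tc >> 1), tc >> 1,
+             *                    (((p2 + p0 + 1) >> 1) - p1 + delta) >> 1);
+             *     if (deq) q1' = q1 + Clip3(-(tc >> 1), tc >> 1,
+             *                    (((q2 + q0 + 1) >> 1) - q1 - delta) >> 1);
+             *   } */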
+
+
+            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b);
+            src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b);
+
+            tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16);
+            tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16);
+
+            tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08);
+            tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08);
+            //arranged q31 q30 p30 p31  q21 q20 p20 p21  q11 q10 p10 p11 q01 q00 p00 p01
+            tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b);
+
+            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
+            // (-3q1+9q0),(-9p0+3p1)
+            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
+            //converting to 16 bit
+            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
+            //getting -tc store
+            tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
+            //calc 10 *tc = 2*tc +8*tc ; 2*tc
+            tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
+            //calc 10 *tc = 2*tc +8*tc ; 8*tc
+            tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
+            //getting -tc store
+            tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
+            //calc 10 *tc
+            tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b);
+            //const 1
+            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
+            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b);
+            const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31);
+            //getting the mask values
+            mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1));
+            //loaded coef for delta1 calculation
+            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
+            //(-2q1+q0),(p0-2p1)
+            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
+            //const 8
+            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
+            //rearranging the mask values
+            mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b);
+            //normalisation of the filter
+            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
+            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);
+
+            //getting deltaq0
+            tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b);
+            //packing  d3q d2q d1q d0q d3p d2p d1p d0p
+            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b);
+            //absolute delta
+            tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
+            //Clipping of delta0
+            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
+            //mask for |delta| < 10*tc
+            tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b);
+            //Clipping of delta0
+            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b);
+
+
+            //delta 1 calc starts
+
+            //getting q32 q22 q12 q02 p32 p22 p12 p02
+            tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0));
+            tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b);
+            tmp_delta1_8x16b =  _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b);
+            tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b);
+            //constant 1
+            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
+            //tc>>1 16 bit
+            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
+
+            //getting -tc>>1 store  16 bit
+            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
+            //2*delta0
+            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
+
+            //getting  all respective q's and p's together
+            tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1));
+            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b);
+            //final adds for deltap1 and deltaq1
+            tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b);
+            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b);
+            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b);
+            tmp2_const_8x16b = _mm_setzero_si128();
+            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);
+
+            // clipping delta1
+            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
+            // clipping delta1
+            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);
+
+            //getting the mask ready
+            mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15);
+            //masking of the delta values |delta|<10*tc
+            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b);
+            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b);
+            //packing dq1 dq0 dp0 dp1
+            tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b);
+            tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
+            tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
+            tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
+
+            //masking of the delta values dep, deq , filter_p ,filter_q
+            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b);
+            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b);
+            //converting 8bit to 16 bit
+            src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b);
+            src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b);
+            src_row2_8x16b = _mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b);
+            src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b);
+            //shuffle values loaded
+            tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2);
+            tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3);
+            //arranging each row delta in different registers
+            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b);
+            tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b);
+            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b);
+            tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b);
+
+            //adding the respective delta
+            src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b);
+            src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b);
+            src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b);
+            src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b);
+            //saturating to 8 bit
+            src_row2_8x16b = _mm_packus_epi16(src_row2_8x16b, src_row3_8x16b);
+            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b);
+            //separating different rows
+            src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
+            src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8);
+        }
+
+        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b);
+        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b);
+        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b);
+        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b);
+    }
+}
+
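+/**
+*******************************************************************************
+*
+* @brief
+*       Decision process and filtering for the luma block horizontal edge.
+*
+* @par Description:
+*     The decision process for the luma block horizontal edge is  carried out
+*     and an appropriate filter is applied. The  boundary filter strength, bs
+*     should be greater than 0.  The pcm flags and the transquant bypass flags
+*     should be  taken care of by the calling function.
+*
+* @param[in] pu1_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] bs
+*  Boundary filter strength of q(0,0)
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] beta_offset_div2
+*  beta offset (slice level), divided by 2
+*
+* @param[in] tc_offset_div2
+*  tc offset (slice level), divided by 2
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @returns
+*  None
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/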
+void ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src,
+                                 WORD32 src_strd,
+                                 WORD32 bs,
+                                 WORD32 quant_param_p,
+                                 WORD32 quant_param_q,
+                                 WORD32 beta_offset_div2,
+                                 WORD32 tc_offset_div2,
+                                 WORD32 filter_flag_p,
+                                 WORD32 filter_flag_q)
+{
+    WORD32 qp_luma, beta_indx, tc_indx;
+    WORD32 beta, tc;
+
+    WORD32 d0, d3, dp, dq, d;
+    WORD32 de_0, de_1, de_2, de_3;
+    WORD32 d_sam0, d_sam3;
+    WORD32 de, dep, deq;
+
+    __m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b;
+    __m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b;
+
+
+
+
+    {
+        __m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b;
+        __m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
+        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;
+
+        ASSERT((bs > 0));
+        ASSERT(filter_flag_p || filter_flag_q);
+
+        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+        /* BS, based on the implementation, can take value 3 for an intra/inter edge    */
+        /* based on BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
+        /* for BS = 1 the adding factor is (0*2); for BS = 2 or 3 it is (1*2)            */
+        /* the above desired functionality is achieved by doing (2*(bs>>1))              */
+
+        tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);
+
+        beta = gai4_ihevc_beta_table[beta_indx];
+        tc = gai4_ihevc_tc_table[tc_indx];
+        if(0 == tc)
+        {
+            return;
+        }
+        src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src));
+        src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+        src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
+        src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
+        src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
+        tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));
+        src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd));
+        tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd));
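+        /* For the horizontal edge, rows p3..p0 (above) and q0..q3 (below) are
+         * loaded with stride offsets; the unpacks below transpose them so
+         * each 32-bit lane holds one column's four p (or q) samples for the
+         * decision computation. */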
+
+
+        src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
+        src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b);
+
+        src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
+        src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
+
+        src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b);
+        src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b);
+
+        src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c);
+        src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c);
+
+        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
+        mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));
+
+        src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b);
+        //WORD32 shuffle_d[4]={0x80800403,0x80800c0b,0x03000704,0x0b080f0c};
+        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b);
+
+        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b);
+
+
+        //generate all-ones, then the constant 1 in each 16-bit lane
+        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b);
+        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
+        //accumulating values for dp3, dq3, dp0, dq0
+        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);
+
+        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
+        // interleave to get (-1, 1) byte pairs in each 16-bit lane
+        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
+        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
+        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
+        //to get 1 in each 16-bit lane
+        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);
+
+
+        // dq3 dp3 dq0 dp0
+        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
+        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
+        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
+        // dq dp d3 d0
+        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
+        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
+        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
+        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
+        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
+
+        //extract each 32-bit result to a scalar
+        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
+        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
+        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);
+
+        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
+        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
+        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
+        dq = _mm_cvtsi128_si32(mask_16x8b);
+        //getting d
+        d = d0 + d3;
+
+        //extract each 32-bit result to a scalar
+        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
+        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
+        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);
+
+        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
+        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
+        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
+        de_3 = _mm_cvtsi128_si32(mask_16x8b);
+
+        de = 0;
+        dep = 0;
+        deq = 0;
+        if(d < beta)
+        {
+            d_sam0 = 0;
+            if((2 * d0 < (beta >> 2))
+                            && (de_2 < (beta >> 3))
+                            && (de_0 < ((5 * tc + 1) >> 1)))
+            {
+                d_sam0 = 1;
+            }
+
+            d_sam3 = 0;
+            if((2 * d3 < (beta >> 2))
+                            && (de_3 < (beta >> 3))
+                            && de_1 < ((5 * tc + 1) >> 1))
+            {
+                d_sam3 = 1;
+            }
+
+            de = (d_sam0 & d_sam3) + 1;
+            dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
+            deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
+            if(tc <= 1)
+            {
+                dep = 0;
+                deq = 0;
+            }
+        }
+
+    }
+
+    if(de != 0)
+    {
+
+        if(2 == de)
+        {
+
+            __m128i temp_pq0_str0_16x8b;
+            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
+            __m128i temp_pq2_str0_16x8b;
+            __m128i temp_str0_16x8b, temp_str1_16x8b;
+            __m128i const2_8x16b, const2tc_8x16b;
+
+            LWORD64 mask, tc2;
+            tc = tc << 1;
+            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
+            tc2 = ((LWORD64)tc);
+
+            const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b);
+            //q'0-q'1-2 ,p'0-p'1-2
+            temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
+            temp_str0_16x8b   = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
+            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
+            //arranged q31 q30 q21 q20 q11 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
+            temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b);
+
+            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
+            temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b);
+
+            //q'1-2, p'1-2
+            temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b);
+            temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b);
+            temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b);
+            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
+            temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b);
+            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
+            temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b);
+
+            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
+            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);
+
+            //clipping mask design
+            temp_str1_16x8b = _mm_setzero_si128();
+            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
+            const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
+            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
+            const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);
+
+            //clipping mask design
+            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
+            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
+            //calculating Clipping MAX for all pixel values.
+            src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b);
+            src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b);
+            //for clipping calc
+            src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b);
+            //saving the unmodified data of q1 p1 q0 p0
+            src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b);
+            //Clipping MAX and MIN for q1 p1 q0 p0
+            src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b);
+            src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b);
+
+
+            //q'2-q'0-2,p'2-p'0-2
+            tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b);
+            temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
+            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
+            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03
+            temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b);
+            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b);
+            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b);
+
+            //calculating Clipping MAX and MIN for p2 and q2 .
+            tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b);
+            tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b);
+            //q'0-q'1-2 ,p'0-p'1-2
+            temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e);
+            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b);
+            //q'1-2 p'1-2
+            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
+            //to get 2 in 16 bit
+            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
+
+
+            //q'1, p'1 (adding 2)
+            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
+            //q'0-q'1,p'0-p'1
+            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b);
+            //q'2-q'1,p'2-p'1
+            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
+            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
+            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b);
+            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
+            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);
+
+            //normalisation of all modified pixels
+            temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3);
+            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
+            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
+            //q'1 p'1 q'0 p'0
+            temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b);
+            temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b);
+            //pack with the unmodified data of q2 and p2
+            src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b);
+            //Clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2  p'2
+            temp_pq0_str0_16x8b = _mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b);
+            src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b);
+            temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b);
+            src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b);
+            //Reshuffling q'1 p'1 q'0 p'0 along with unmodified data
+            src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
+            src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
+            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8);
+            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
+            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
+            src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8);
+
+            _mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b);
+            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
+            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
+            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
+            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
+            _mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b);
+
+
+        }
+
+        else
+        {
+
+            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b;
+            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b;
+            __m128i coefdelta_0_8x16b;
+            __m128i const2_8x16b, consttc_8x16b;
+
+            LWORD64 maskp0, maskp1, maskq0, maskq1;
+            maskp0 = (LWORD64)filter_flag_p;
+            maskq0 = (LWORD64)filter_flag_q;
+            maskp1 = (LWORD64)dep;
+            maskq1 = (LWORD64)deq;
+            consttc_8x16b = _mm_set1_epi32(tc);
+
+            tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
+            tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
+            //arranged q31 q30 p30 p31  q21 q20 p20 p21  q11 q10 p10 p11 q01 q00 p00 p01
+            tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
+
+            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
+            // (-3q1+9q0),(-9p0+3p1)
+            tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
+
+            //getting -tc store
+            tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
+
+            //getting tc in 16 bit
+            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
+            //calc 10 *tc = 2*tc +8*tc ; 2*tc
+            tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
+            //calc 10 *tc = 2*tc +8*tc ; 8*tc
+            tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
+
+            //const 1
+            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
+            //calc 10 *tc
+            tmp_pq_str0_8x16b = _mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
+            //delta0 without normalisation and clipping
+            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b);
+
+            const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31);
+
+            //loaded coef for delta1 calculation
+            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
+            //(-2q1+q0),(p0-2p1)
+            tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
+            //const 8
+            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
+
+            //normalisation of the filter
+            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
+            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);
+
+            //getting deltaq0
+            tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b);
+            //getting -tc
+            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
+            //packing  d03q d02q d01q d0q d03p d02p d01p d00p
+            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b);
+            //absolute delta
+            tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
+
+            //Clipping of delta0
+            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
+            //tc>>1 16 bit
+            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
+            //Clipping of delta0
+            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b);
+
+            //(-tc)>>1 16 bit
+            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
+            //mask for |delta| < 10*tc
+            tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
+            //delta 1 calc starts
+
+            //getting q32 q22 q12 q02 p32 p22 p12 p02
+            tmp0_const_8x16b = _mm_setzero_si128();
+            src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b);
+            src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b);
+            src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b);
+            //constant 1
+            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
+            //2*delta0
+            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
+            //getting  all respective q's and p's together
+            coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1));
+            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b);
+            //final adds for deltap1 and deltaq1
+            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b);
+            src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b);
+            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b);
+            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);
+
+            //mask0= (((LWORD64)filter_flag_q)<<63)| (((LWORD64)filter_flag_p)<<31);
+            tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0)));
+            src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0)));
+
+            //   src_p2_8x16b = _mm_set_epi32(filter_flag_q,filter_flag_p,filter_flag_q,filter_flag_p);
+            //mask1= (((LWORD64)(filter_flag_q&deq))<<63)|(((LWORD64)(filter_flag_p & dep))<<31);
+            src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1)));
+            coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1)));
+
+            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b);
+            src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b);
+            //src_q2_8x16b = _mm_set_epi32(deq,dep,deq,dep);
+            src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b);
+
+            //rearranging the mask values
+            src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50);
+            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50);
+
+            src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31);
+            src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31);
+            src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31);
+            src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31);
+
+            //combining mask delta1
+            tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b);
+            // clipping delta1
+            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
+            //combining mask delta0
+            tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b);
+            // clipping delta1
+            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);
+
+
+            //masking of the delta values |delta|<10*tc
+            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b);
+            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b);
+            //separating p and q delta 0 and adding p0 and q0
+            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
+            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
+            src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b);
+            src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b);
+            src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b);
+            src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b);
+            //separating p and q delta 0 and adding p0 and q0
+            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
+            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
+            src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b);
+            src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b);
+            src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b);
+            src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b);
+            //packing p1 q1 and p0 q0 to 8 bit
+            src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b);
+            src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b);
+
+            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
+            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
+
+            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
+            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
+            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
+            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
+
+
+        }
+
+
+
+    }
+
+}
+
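+/**
+*******************************************************************************
+*
+* @brief
+*       Filtering for the chroma block vertical edge.
+*
+* @par Description:
+*     The chroma block vertical edge is filtered; this function is expected to
+*     be called only when the boundary strength is 2. The pcm flags and the
+*     transquant bypass flags should be taken care of by the calling function.
+*
+* @param[in] pu1_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] qp_offset_u
+*  chroma qp offset for the U component
+*
+* @param[in] qp_offset_v
+*  chroma qp offset for the V component
+*
+* @param[in] tc_offset_div2
+*  tc offset (slice level), divided by 2
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @returns
+*  None
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/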
+void ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src,
+                                   WORD32 src_strd,
+                                   WORD32 quant_param_p,
+                                   WORD32 quant_param_q,
+                                   WORD32 qp_offset_u,
+                                   WORD32 qp_offset_v,
+                                   WORD32 tc_offset_div2,
+                                   WORD32 filter_flag_p,
+                                   WORD32 filter_flag_q)
+{
+    WORD32 qp_indx_u, qp_chroma_u;
+    WORD32 qp_indx_v, qp_chroma_v;
+    WORD32 tc_indx_u, tc_u;
+    WORD32 tc_indx_v, tc_v;
+
+    __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b;
+    ASSERT(filter_flag_p || filter_flag_q);
+
+    /* chroma processing is done only if BS is 2             */
+    /* this function is assumed to be called only if BS is 2 */
+    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
+
+    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_u = gai4_ihevc_tc_table[tc_indx_u];
+
+    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_v = gai4_ihevc_tc_table[tc_indx_v];
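+    /* Worked example (illustrative): quant_param_p = quant_param_q = 30 and
+     * qp_offset_u = 0 give qp_indx_u = 30 and qp_chroma_u =
+     * gai4_ihevc_qp_table[30]; tc_indx_u is then qp_chroma_u + 2 (BS = 2)
+     * plus the doubled slice tc offset, clipped to [0, 53]. */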
+
+    if(0 == tc_u && 0 == tc_v)
+    {
+        return;
+    }
+    src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
+    tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4));
+    src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4));
+    tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4));
+
+    {
+        LWORD64 mask_tc, mask_flag, mask;
+        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
+        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
+        __m128i min_0_16x8b;
+        __m128i const_16x8b;
+        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
+        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
+        mask = 0xffff00000000ffffLL;
+
+        src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b);
+        src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b);
+
+        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv));
+        // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01
+        // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21
+        delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b);
+        delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b);
+
+        tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
+        tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
+        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
+        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
+        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
+        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);
+
+        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
+        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
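+        /* Reference scalar form of the chroma filter vectorized here
+         * (illustrative, cf. the HEVC deblocking process):
+         *   delta = Clip3(-tc, tc, ((((q0 - p0) << 2) + p1 - q1 + 4) >> 3))
+         *   p0' = Clip(p0 + delta);  q0' = Clip(q0 - delta)
+         * applied per interleaved U/V sample with its own tc. */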
+
+        //generating offset 4
+        const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b);
+        // filter flag mask and tc mask
+        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
+        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));
+
+        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
+        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
+        //-tc
+        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
+        //converting const 1
+        const_16x8b = _mm_srli_epi16(const_16x8b, 15);
+
+        //filterp and filterq flag
+        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
+        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
+
+        //modified delta with a filter (1 -4 4 -1) available in 16 bit
+        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
+        //converting const 4
+        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
+
+        mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
+        //offset addition
+        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
+        //eliminating q1
+        tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8);
+
+        const_16x8b = _mm_setzero_si128();
+        //filter after normalisation
+        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
+        mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44);
+
+        //clipping MAX
+        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
+        //getting p0 and eliminating p1
+        tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8);
+        //clipping MIN
+        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
+        //getting q0
+        tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8);
+        //masking filter flag
+        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
+        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);
+
+        // q-delta ,p+delta
+        tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
+        tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
+        //merging q0 and p0 of respective rows
+        delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
+        delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
+        // row 0 and row 1 packed , row2 and row3 packed
+        delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b);
+        delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b);
+        //removing older pixel values
+        src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b);
+        src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b);
+        //arranging modified pixels
+        delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8);
+        delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8);
+        delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16);
+        delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16);
+        //plugging the modified values
+        src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b);
+        src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b);
+
+
+        //getting values for rows 1 and 3
+        tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8);
+        tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8);
+
+        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b);
+        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b);
+        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b);
+        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b);
+    }
+
+
+
+}
+
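+/**
+*******************************************************************************
+*
+* @brief
+*       Filtering for the chroma block horizontal edge.
+*
+* @par Description:
+*     The chroma block horizontal edge is filtered; this function is expected
+*     to be called only when the boundary strength is 2. The pcm flags and the
+*     transquant bypass flags should be taken care of by the calling function.
+*
+* @param[in] pu1_src
+*  Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] quant_param_p
+*  quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] qp_offset_u
+*  chroma qp offset for the U component
+*
+* @param[in] qp_offset_v
+*  chroma qp offset for the V component
+*
+* @param[in] tc_offset_div2
+*  tc offset (slice level), divided by 2
+*
+* @param[in] filter_flag_p
+*  flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+*  flag whether to filter the q block
+*
+* @returns
+*  None
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/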
+void ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src,
+                                   WORD32 src_strd,
+                                   WORD32 quant_param_p,
+                                   WORD32 quant_param_q,
+                                   WORD32 qp_offset_u,
+                                   WORD32 qp_offset_v,
+                                   WORD32 tc_offset_div2,
+                                   WORD32 filter_flag_p,
+                                   WORD32 filter_flag_q)
+{
+    WORD32 qp_indx_u, qp_chroma_u;
+    WORD32 qp_indx_v, qp_chroma_v;
+    WORD32 tc_indx_u, tc_u;
+    WORD32 tc_indx_v, tc_v;
+
+
+    __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b;
+
+    ASSERT(filter_flag_p || filter_flag_q);
+
+    /* chroma processing is done only if BS is 2             */
+    /* this function is assumed to be called only if BS is 2 */
+    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
+
+    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_u = gai4_ihevc_tc_table[tc_indx_u];
+
+    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+    tc_v = gai4_ihevc_tc_table[tc_indx_v];
+
+    if(0 == tc_u && 0 == tc_v)
+    {
+        return;
+    }
+    tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
+    src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
+    src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+    tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+    {
+        LWORD64 mask_tc, mask_flag;
+        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
+        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
+        __m128i min_0_16x8b;
+        __m128i const_16x8b;
+        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
+        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
+
+        tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b);
+        tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b);
+
+        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
+        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
+        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
+        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);
+
+        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b);
+        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b);
+
+
+        // filter flag mask and tc mask
+        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
+        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));
+
+        //generate all-ones (used as the sign for -tc and to build the rounding offset 4)
+        const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b);
+        // filter flag mask and tc mask
+        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
+        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
+        //-tc
+        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
+        //reduce all-ones to const 1 per 16-bit lane
+        const_16x8b = _mm_srli_epi16(const_16x8b, 15);
+
+        //filter p flag mask
+        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
+
+
+        //shift const 1 up to const 4 (rounding offset)
+        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
+        //delta computed with the (1, -4, 4, -1) filter, in 16-bit precision
+        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
+
+        //filter q flag mask
+        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
+        //offset addition
+        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
+        mask_16x8b = _mm_setzero_si128();
+        //filter after normalisation
+        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
+
+        //converting p0 to 16bit
+        src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b);
+        //clipping MAX
+        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
+        //converting q0 to 16bit
+        src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b);
+        //clipping MIN
+        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
+
+        //masking filter flag
+        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
+        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);
+
+        // q-delta ,p+delta
+        src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b);
+        src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b);
+
+        // p0 and q0 packed
+        src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b);
+        src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b);
+
+
+
+        _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b);
+        _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b);
+
+    }
+
+
+}
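+
+/* Scalar sketch of the per-column chroma filter that the SIMD code above
+ * vectorises (derived from the (1, -4, 4, -1) filter, the +4 rounding and
+ * the tc clipping used there; illustration only, with CLIP_U8 assumed from
+ * ihevc_macros.h):
+ *
+ *     delta = CLIP3((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
+ *     if(filter_flag_p) p0 = CLIP_U8(p0 + delta);
+ *     if(filter_flag_q) q0 = CLIP_U8(q0 - delta);
+ */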
diff --git a/common/x86/ihevc_func_selector.h b/common/x86/ihevc_func_selector.h
new file mode 100644
index 0000000..52023c2
--- /dev/null
+++ b/common/x86/ihevc_func_selector.h
@@ -0,0 +1,224 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_func_selector.h
+*
+* @brief
+*  For each function, decides which implementation to use: the plain C
+*  version or a platform-specific variant (e.g. SSSE3/SSE4.2 on x86)
+*
+* @author
+*  Harish
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef __IHEVC_FUNC_SELECTOR_H__
+#define __IHEVC_FUNC_SELECTOR_H__
+
+
+#include "ihevc_func_types.h"
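+
+/* Illustrative usage (a sketch; selector tokens such as SSSE3 and SSE42 are
+ * assumed to be defined in ihevc_func_types.h alongside C): to route a
+ * kernel to an x86 SIMD variant, change its selector, e.g.
+ *
+ *     #define DEBLK_CHROMA_HORZ  SSSE3
+ *
+ * which would then pick ihevc_deblk_chroma_horz_ssse3() instead of the C
+ * version when the decoder initialises its function pointers. */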
+
+#define     INTER_PRED_LUMA_COPY                    C
+#define     INTER_PRED_LUMA_HORZ                    C
+#define     INTER_PRED_LUMA_VERT                    C
+#define     INTER_PRED_LUMA_COPY_W16OUT             C
+#define     INTER_PRED_LUMA_HORZ_W16OUT             C
+#define     INTER_PRED_LUMA_VERT_W16OUT             C
+#define     INTER_PRED_LUMA_VERT_W16INP             C
+#define     INTER_PRED_LUMA_VERT_W16INP_W16OUT      C
+
+#define     INTER_PRED_CHROMA_COPY                  C
+#define     INTER_PRED_CHROMA_HORZ                  C
+#define     INTER_PRED_CHROMA_VERT                  C
+#define     INTER_PRED_CHROMA_COPY_W16OUT           C
+#define     INTER_PRED_CHROMA_HORZ_W16OUT           C
+#define     INTER_PRED_CHROMA_VERT_W16OUT           C
+#define     INTER_PRED_CHROMA_VERT_W16INP           C
+#define     INTER_PRED_CHROMA_VERT_W16INP_W16OUT    C
+
+#define     WEIGHTED_PRED_UNI                       C
+#define     WEIGHTED_PRED_BI                        C
+#define     WEIGHTED_PRED_BI_DEFAULT                C
+#define     WEIGHTED_PRED_CHROMA_UNI                C
+#define     WEIGHTED_PRED_CHROMA_BI                 C
+#define     WEIGHTED_PRED_CHROMA_BI_DEFAULT         C
+
+#define     INTRA_PRED_LUMA_REF_SUBSTITUTION        C
+#define     INTRA_PRED_REF_FILTERING                C
+#define     INTRA_PRED_LUMA_PLANAR                  C
+#define     INTRA_PRED_LUMA_DC                      C
+#define     INTRA_PRED_LUMA_HORZ                    C
+#define     INTRA_PRED_LUMA_VER                     C
+#define     INTRA_PRED_LUMA_MODE_2                  C
+#define     INTRA_PRED_LUMA_MODE_18_34              C
+#define     INTRA_PRED_LUMA_MODE_3_TO_9             C
+#define     INTRA_PRED_LUMA_MODE_11_TO_17           C
+#define     INTRA_PRED_LUMA_MODE_19_TO_25           C
+#define     INTRA_PRED_LUMA_MODE_27_TO_33           C
+
+
+#define     INTRA_PRED_CHROMA_PLANAR                C
+#define     INTRA_PRED_CHROMA_DC                    C
+#define     INTRA_PRED_CHROMA_HOR                   C
+#define     INTRA_PRED_CHROMA_VER                   C
+#define     INTRA_PRED_CHROMA_MODE_2                C
+#define     INTRA_PRED_CHROMA_18_34                 C
+#define     INTRA_PRED_CHROMA_3_T0_9                C
+#define     INTRA_PRED_CHROMA_11_T0_17              C
+#define     INTRA_PRED_CHROMA_19_T0_25              C
+#define     INTRA_PRED_CHROMA_27_T0_33              C
+#define     INTRA_PRED_CHROMA_REF_SUBSTITUTION      C
+
+#define     PAD_VERT                                C
+#define     PAD_HORZ                                C
+
+#define     DEBLK_LUMA_HORZ                         C
+#define     DEBLK_LUMA_VERT                         C
+#define     DEBLK_CHROMA_HORZ                       C
+#define     DEBLK_CHROMA_VERT                       C
+
+#define     SAO_BAND_OFFSET_LUMA                    C
+#define     SAO_BAND_OFFSET_CHROMA                  C
+#define     SAO_EDGE_OFFSET_CLASS0_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS1_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS2_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS3_LUMA             C
+#define     SAO_EDGE_OFFSET_CLASS0_CHROMA           C
+#define     SAO_EDGE_OFFSET_CLASS1_CHROMA           C
+#define     SAO_EDGE_OFFSET_CLASS2_CHROMA           C
+#define     SAO_EDGE_OFFSET_CLASS3_CHROMA           C
+
+/* Forward transform functions */
+/* Luma */
+#define RESI_TRANS_QUANT_4X4_TTYPE1                 C
+#define RESI_TRANS_QUANT_4X4                        C
+#define RESI_TRANS_QUANT_8X8                        C
+#define RESI_TRANS_QUANT_16X16                      C
+#define RESI_TRANS_QUANT_32X32                      C
+
+#define RESI_QUANT_4X4_TTYPE1                       C
+#define RESI_QUANT_4X4                              C
+#define RESI_QUANT_8X8                              C
+#define RESI_QUANT_16X16                            C
+#define RESI_QUANT_32X32                            C
+
+#define RESI_TRANS_4X4_TTYPE1                       C
+#define RESI_TRANS_4X4                              C
+#define RESI_TRANS_8X8                              C
+#define RESI_TRANS_16X16                            C
+#define RESI_TRANS_32X32                            C
+
+#define RESI_4X4_TTYPE1                             C
+#define RESI_4X4                                    C
+#define RESI_8X8                                    C
+#define RESI_16X16                                  C
+#define RESI_32X32                                  C
+
+#define TRANS_4X4_TTYPE1                            C
+#define TRANS_4X4                                   C
+#define TRANS_8X8                                   C
+#define TRANS_16X16                                 C
+#define TRANS_32X32                                 C
+
+#define QUANT_4X4_TTYPE1                            C
+#define QUANT_4X4                                   C
+#define QUANT_8X8                                   C
+#define QUANT_16X16                                 C
+#define QUANT_32X32                                 C
+
+/* Chroma interleaved*/
+#define CHROMA_RESI_TRANS_QUANT_4X4                        C
+#define CHROMA_RESI_TRANS_QUANT_8X8                        C
+#define CHROMA_RESI_TRANS_QUANT_16X16                      C
+
+#define CHROMA_RESI_QUANT_4X4                              C
+#define CHROMA_RESI_QUANT_8X8                              C
+#define CHROMA_RESI_QUANT_16X16                            C
+
+#define CHROMA_RESI_TRANS_4X4                              C
+#define CHROMA_RESI_TRANS_8X8                              C
+#define CHROMA_RESI_TRANS_16X16                            C
+
+#define CHROMA_RESI_4X4                                    C
+#define CHROMA_RESI_8X8                                    C
+#define CHROMA_RESI_16X16                                  C
+
+/* Inverse transform functions */
+/* Luma */
+#define IQUANT_ITRANS_RECON_4X4_TTYPE1              C
+#define IQUANT_ITRANS_RECON_4X4                     C
+#define IQUANT_ITRANS_RECON_8X8                     C
+#define IQUANT_ITRANS_RECON_16X16                   C
+#define IQUANT_ITRANS_RECON_32X32                   C
+
+#define IQUANT_RECON_4X4_TTYPE1                     C
+#define IQUANT_RECON_4X4                            C
+#define IQUANT_RECON_8X8                            C
+#define IQUANT_RECON_16X16                          C
+#define IQUANT_RECON_32X32                          C
+
+#define ITRANS_RECON_4X4_TTYPE1                     C
+#define ITRANS_RECON_4X4                            C
+#define ITRANS_RECON_8X8                            C
+#define ITRANS_RECON_16X16                          C
+#define ITRANS_RECON_32X32                          C
+
+#define RECON_4X4_TTYPE1                            C
+#define RECON_4X4                                   C
+#define RECON_8X8                                   C
+#define RECON_16X16                                 C
+#define RECON_32X32                                 C
+
+#define ITRANS_4X4_TTYPE1                           C
+#define ITRANS_4X4                                  C
+#define ITRANS_8X8                                  C
+#define ITRANS_16X16                                C
+#define ITRANS_32X32                                C
+
+/* Chroma interleaved */
+#define CHROMA_IQUANT_ITRANS_RECON_4X4                     C
+#define CHROMA_IQUANT_ITRANS_RECON_8X8                     C
+#define CHROMA_IQUANT_ITRANS_RECON_16X16                   C
+
+#define CHROMA_IQUANT_RECON_4X4                            C
+#define CHROMA_IQUANT_RECON_8X8                            C
+#define CHROMA_IQUANT_RECON_16X16                          C
+
+#define CHROMA_ITRANS_RECON_4X4                            C
+#define CHROMA_ITRANS_RECON_8X8                            C
+#define CHROMA_ITRANS_RECON_16X16                          C
+
+#define CHROMA_RECON_4X4                                   C
+#define CHROMA_RECON_8X8                                   C
+#define CHROMA_RECON_16X16                                 C
+
+#define IHEVC_MEMCPY                                C
+#define IHEVC_MEMSET                                C
+#define IHEVC_MEMSET_16BIT                          C
+#define IHEVC_MEMCPY_MUL_8                          C
+#define IHEVC_MEMSET_MUL_8                          C
+#define IHEVC_MEMSET_16BIT_MUL_8                    C
+
+#endif /* __IHEVC_FUNC_SELECTOR_H__ */
+
diff --git a/common/x86/ihevc_inter_pred_filters_sse42_intr.c b/common/x86/ihevc_inter_pred_filters_sse42_intr.c
new file mode 100644
index 0000000..154b613
--- /dev/null
+++ b/common/x86/ihevc_inter_pred_filters_sse42_intr.c
@@ -0,0 +1,607 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+
+/**
+*******************************************************************************
+* @file
+*  ihevc_inter_pred_filters_sse42_intr.c
+*
+* @brief
+*  Contains function definitions for inter prediction interpolation filters
+*  coded in x86 intrinsics
+*
+*
+* @author
+*
+*
+* @par List of Functions:
+*  - ihevc_inter_pred_luma_copy_w16out_sse42()
+*  - ihevc_inter_pred_chroma_copy_sse42()
+*  - ihevc_inter_pred_chroma_copy_w16out_sse42()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <assert.h>
+
+#include "ihevc_debug.h"
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_inter_pred.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+/*****************************************************************************/
+/* Function Definitions                                                      */
+/*****************************************************************************/
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Interprediction luma filter for copy 16bit output
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the location pointed
+*    by 'src' to the location pointed by 'dst'. The output is upshifted by 6
+*    bits and is used as input for vertical filtering or weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
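+/* Scalar sketch of the SIMD loops below (reference only): each pixel is
+ * widened to 16 bits and upshifted,
+ *
+ *     pi2_dst[col] = (WORD16)(pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH);
+ *
+ * where SHIFT_14_MINUS_BIT_DEPTH evaluates to 6 at 8-bit depth. */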
+void ihevc_inter_pred_luma_copy_w16out_sse42(UWORD8 *pu1_src,
+                                             WORD16 *pi2_dst,
+                                             WORD32 src_strd,
+                                             WORD32 dst_strd,
+                                             WORD8 *pi1_coeff,
+                                             WORD32 ht,
+                                             WORD32 wd)
+{
+    WORD32 row, col;
+    __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
+    UNUSED(pi1_coeff);
+    ASSERT(wd % 4 == 0); /* checking assumption*/
+    ASSERT(ht % 4 == 0); /* checking assumption*/
+
+    if(0 == (wd & 7)) /* multiple of 8 case */
+    {
+        for(row = 0; row < ht; row += 4)
+        {
+            for(col = 0; col < wd; col += 8)
+            {
+                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+                src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+                src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+                src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+                src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
+                src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
+
+                src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+                src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+                src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+                src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* storing 8 16-bit output values */
+                _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
+                _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+                _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
+                _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
+
+                pu1_src += 8; /* pointer update */
+                pi2_dst += 8; /* pointer update */
+            } /* inner for loop ends here(8-output values in single iteration) */
+
+            pu1_src += 4 * src_strd - wd; /* pointer update */
+            pi2_dst += 4 * dst_strd - wd; /* pointer update */
+        }
+    }
+    else /* wd = multiple of 4 case */
+    {
+        for(row = 0; row < ht; row += 4)
+        {
+            for(col = 0; col < wd; col += 4)
+            {
+                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+                src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+                src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+                src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+                src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
+                src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
+
+                src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+                src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+                src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+                src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* storing 4 16-bit output values */
+                _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
+                _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+                _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
+                _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
+
+                pu1_src += 4; /* pointer update */
+                pi2_dst += 4; /* pointer update */
+            } /* inner for loop ends here(4-output values in single iteration) */
+
+            pu1_src += 4 * src_strd - wd; /* pointer update */
+            pi2_dst += 4 * dst_strd - wd; /* pointer update */
+        }
+    }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*      Chroma interprediction filter for copy
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the location pointed
+*    by 'src' to the location pointed by 'dst'.
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
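+/* Chroma is stored interleaved (Cb/Cr), so each row holds 2 * wd bytes;
+ * the branches below iterate over wdx2 = wd * 2 and reduce to a strided
+ * byte copy, roughly (a sketch for reference only):
+ *
+ *     pu1_dst[row * dst_strd + col] = pu1_src[row * src_strd + col];
+ */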
+void ihevc_inter_pred_chroma_copy_sse42(UWORD8 *pu1_src,
+                                        UWORD8 *pu1_dst,
+                                        WORD32 src_strd,
+                                        WORD32 dst_strd,
+                                        WORD8 *pi1_coeff,
+                                        WORD32 ht,
+                                        WORD32 wd)
+{
+    WORD32 row, col, wdx2;
+    __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
+
+    ASSERT(wd % 2 == 0); /* checking assumption*/
+    ASSERT(ht % 2 == 0); /* checking assumption*/
+    UNUSED(pi1_coeff);
+    wdx2 = wd * 2;
+
+    if(0 == (ht & 3)) /* ht multiple of 4 */
+    {
+        if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
+        {
+            for(row = 0; row < ht; row += 4)
+            {
+                for(col = 0; col < wdx2; col += 16)
+                {
+                    /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+                    src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
+                    src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+                    src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+                    src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                    /* storing 16 8-bit output values */
+                    _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
+
+                    pu1_src += 16; /* pointer update */
+                    pu1_dst += 16; /* pointer update */
+                } /* inner for loop ends here(16-output values in single iteration) */
+
+                pu1_src += 4 * src_strd - wdx2; /* pointer update */
+                pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
+            }
+
+        }
+        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
+        {
+            for(row = 0; row < ht; row += 4)
+            {
+                for(col = 0; col < wdx2; col += 8)
+                {
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                    /* storing 8 8-bit output values */
+                    _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
+
+                    pu1_src += 8; /* pointer update */
+                    pu1_dst += 8; /* pointer update */
+                } /*  inner for loop ends here(8-output values in single iteration) */
+
+                pu1_src += 4 * src_strd - wdx2; /* pointer update */
+                pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
+            }
+        }
+        else /* wdx2 = multiple of 4 case */
+        {
+            WORD32 dst0, dst1, dst2, dst3;
+            for(row = 0; row < ht; row += 4)
+            {
+                for(col = 0; col < wdx2; col += 4)
+                {
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos. (low 4 used)*/
+                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                    dst0 = _mm_cvtsi128_si32(src0_16x8b);
+                    dst1 = _mm_cvtsi128_si32(src1_16x8b);
+                    dst2 = _mm_cvtsi128_si32(src2_16x8b);
+                    dst3 = _mm_cvtsi128_si32(src3_16x8b);
+
+                    /* storing 4 8-bit output values */
+                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
+                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
+                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */
+                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */
+
+                    pu1_src += 4; /* pointer update */
+                    pu1_dst += 4; /* pointer update */
+                } /*  inner for loop ends here(4- output values in single iteration) */
+
+                pu1_src += 4 * src_strd - wdx2; /* pointer update */
+                pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
+            }
+        }
+    }
+    else /* ht multiple of 2 */
+    {
+        if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
+        {
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 16)
+                {
+                    /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+                    src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
+                    src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+
+                    /* storing 16 8-bit output values */
+                    _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+
+                    pu1_src += 16; /* pointer update */
+                    pu1_dst += 16; /* pointer update */
+                } /* inner for loop ends here(16-output values in single iteration) */
+
+                pu1_src += 2 * src_strd - wdx2; /* pointer update */
+                pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
+            }
+
+        }
+        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
+        {
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 8)
+                {
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+
+                    /* storing 8 8-bit output values */
+                    _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+
+                    pu1_src += 8; /* pointer update */
+                    pu1_dst += 8; /* pointer update */
+                } /*  inner for loop ends here(8-output values in single iteration) */
+
+                pu1_src += 2 * src_strd - wdx2; /* pointer update */
+                pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
+            }
+        }
+        else /* wdx2 = multiple of 4 case */
+        {
+            WORD32 dst0, dst1;
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 4)
+                {
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos. (low 4 used)*/
+                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+
+                    dst0 = _mm_cvtsi128_si32(src0_16x8b);
+                    dst1 = _mm_cvtsi128_si32(src1_16x8b);
+
+
+                    /* storing 4 8-bit output values */
+                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
+                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
+
+                    pu1_src += 4; /* pointer update */
+                    pu1_dst += 4; /* pointer update */
+                } /*  inner for loop ends here(4- output values in single iteration) */
+
+                pu1_src += 2 * src_strd - wdx2; /* pointer update */
+                pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
+            }
+        }
+    }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*       chroma interprediction filter for copying 16bit output
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the location pointed
+*    by 'src' to the location pointed by 'dst'. The output is upshifted by 6
+*    bits and is used as input for vertical filtering or weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
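+/* Scalar sketch of the loops below (reference only): each of the 2 * wd
+ * interleaved chroma bytes per row is widened and upshifted,
+ *
+ *     pi2_dst[col] = (WORD16)(pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH);
+ */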
+void ihevc_inter_pred_chroma_copy_w16out_sse42(UWORD8 *pu1_src,
+                                               WORD16 *pi2_dst,
+                                               WORD32 src_strd,
+                                               WORD32 dst_strd,
+                                               WORD8 *pi1_coeff,
+                                               WORD32 ht,
+                                               WORD32 wd)
+{
+    WORD32 row, col, wdx2;
+    __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
+
+    ASSERT(wd % 2 == 0); /* checking assumption*/
+    ASSERT(ht % 2 == 0); /* checking assumption*/
+    UNUSED(pi1_coeff);
+    wdx2 = wd * 2;
+
+    if(0 == (ht & 3)) /* multiple of 4 case */
+    {
+        if(0 == (wdx2 & 7)) /* multiple of 8 case */
+        {
+            for(row = 0; row < ht; row += 4)
+            {
+                for(col = 0; col < wdx2; col += 8)
+                {
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+                    src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
+                    src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
+
+                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+                    src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+                    src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+                    src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                    /* storing 8 16-bit output values */
+                    _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
+                    _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+                    _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
+                    _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
+
+                    pu1_src += 8; /* pointer update */
+                    pi2_dst += 8; /* pointer update */
+                } /* inner for loop ends here(8-output values in single iteration) */
+
+                pu1_src += 4 * src_strd - wdx2; /* pointer update */
+                pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
+            }
+        }
+        else /* wdx2 = multiple of 4 case */
+        {
+            for(row = 0; row < ht; row += 4)
+            {
+                for(col = 0; col < wdx2; col += 4)
+                {
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+                    src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
+                    src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
+
+                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+                    src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+                    src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+                    src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                    /* storing 4 16-bit output values */
+                    _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
+                    _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+                    _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
+                    _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
+
+                    pu1_src += 4; /* pointer update */
+                    pi2_dst += 4; /* pointer update */
+                } /* inner for loop ends here(4-output values in single iteration) */
+
+                pu1_src += 4 * src_strd - wdx2; /* pointer update */
+                pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
+            }
+        }
+    }
+    else  /* ht multiple of 2 case */
+    {
+        if(0 == (wdx2 & 7)) /* multiple of 8 case */
+        {
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 8)
+                {
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+
+                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+
+                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+                    src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                    /* storing 8 16-bit output values */
+                    _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
+                    _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+
+                    pu1_src += 8; /* pointer update */
+                    pi2_dst += 8; /* pointer update */
+                } /* inner for loop ends here(8-output values in single iteration) */
+
+                pu1_src += 2 * src_strd - wdx2; /* pointer update */
+                pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
+            }
+        }
+        else /* wdx2 = multiple of 4 case */
+        {
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 4)
+                {
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+
+                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+
+                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+                    src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                    /* storing 4 16-bit output values */
+                    _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
+                    _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+
+                    pu1_src += 4; /* pointer update */
+                    pi2_dst += 4; /* pointer update */
+                } /* inner for loop ends here(4-output values in single iteration) */
+
+                pu1_src += 2 * src_strd - wdx2; /* pointer update */
+                pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
+            }
+        }
+    }
+}
diff --git a/common/x86/ihevc_inter_pred_filters_ssse3_intr.c b/common/x86/ihevc_inter_pred_filters_ssse3_intr.c
new file mode 100644
index 0000000..ffdab4c
--- /dev/null
+++ b/common/x86/ihevc_inter_pred_filters_ssse3_intr.c
@@ -0,0 +1,5608 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+*******************************************************************************
+* @file
+*  ihevc_inter_pred_filters_ssse3_intr.c
+*
+* @brief
+*  Contains function definitions for inter prediction interpolation filters
+*  coded in x86 intrinsics
+*
+*
+* @author
+*
+*
+* @par List of Functions:
+*  - ihevc_inter_pred_luma_copy_ssse3()
+*  - ihevc_inter_pred_luma_horz_ssse3()
+*  - ihevc_inter_pred_luma_vert_ssse3()
+*  - ihevc_inter_pred_luma_copy_w16out_ssse3()
+*  - ihevc_inter_pred_luma_horz_w16out_ssse3()
+*  - ihevc_inter_pred_luma_vert_w16out_ssse3()
+*  - ihevc_inter_pred_luma_vert_w16inp_ssse3()
+*  - ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3()
+*  - ihevc_inter_pred_chroma_copy_ssse3()
+*  - ihevc_inter_pred_chroma_horz_ssse3()
+*  - ihevc_inter_pred_chroma_vert_ssse3()
+*  - ihevc_inter_pred_chroma_copy_w16out_ssse3()
+*  - ihevc_inter_pred_chroma_horz_w16out_ssse3()
+*  - ihevc_inter_pred_chroma_vert_w16out_ssse3()
+*  - ihevc_inter_pred_chroma_vert_w16inp_ssse3()
+*  - ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <assert.h>
+
+#include "ihevc_debug.h"
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_inter_pred.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+
+#include <immintrin.h>
+
+/*****************************************************************************/
+/* Function Definitions                                                      */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Interprediction luma function for copy
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the location pointed
+*    by 'src' to the location pointed by 'dst'.
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+* Assumption : ht%4 == 0, wd%4 == 0
+*
+*******************************************************************************
+*/
+
+
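+/* The three branches below differ only in access width: 16-byte unaligned
+ * loads/stores when wd is a multiple of 16, 8-byte when a multiple of 8,
+ * else 4-byte stores via _mm_cvtsi128_si32; all implement the same strided
+ * copy (sketch): pu1_dst[row * dst_strd + col] = pu1_src[row * src_strd + col]. */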
+void ihevc_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 src_strd,
+                                      WORD32 dst_strd,
+                                      WORD8 *pi1_coeff,
+                                      WORD32 ht,
+                                      WORD32 wd)
+{
+
+    WORD32 row, col;
+    __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
+    UNUSED(pi1_coeff);
+    ASSERT(wd % 4 == 0); /* checking assumption*/
+    ASSERT(ht % 4 == 0); /* checking assumption*/
+
+/*  outer for loop starts from here */
+    if(0 == (wd & 15)) /* wd multiple of 16 case */
+    {
+        for(row = 0; row < ht; row += 4)
+        {
+            for(col = 0; col < wd; col += 16)
+            {
+                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+                src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
+                src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+                src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+                src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                /* storing 16 8-bit output values */
+                _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
+                _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+                _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
+                _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
+
+                pu1_src += 16; /* pointer update */
+                pu1_dst += 16; /* pointer update */
+            } /* inner for loop ends here(16-output values in single iteration) */
+
+            pu1_src += 4 * src_strd - wd; /* pointer update */
+            pu1_dst += 4 * dst_strd - wd; /* pointer update */
+        }
+
+    }
+    else if(0 == (wd & 7)) /* multiple of 8 case */
+    {
+        for(row = 0; row < ht; row += 4)
+        {
+            for(col = 0; col < wd; col += 8)
+            {
+                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+                src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+                src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                /* storing 8 8-bit output values */
+                _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
+                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
+                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
+                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
+
+                pu1_src += 8; /* pointer update */
+                pu1_dst += 8; /* pointer update */
+            } /*  inner for loop ends here(8-output values in single iteration) */
+
+            pu1_src += 4 * src_strd - wd; /* pointer update */
+            pu1_dst += 4 * dst_strd - wd; /* pointer update */
+        }
+    }
+    else /* wd = multiple of 4 case */
+    {
+        WORD32 dst0, dst1, dst2, dst3;
+        for(row = 0; row < ht; row += 4)
+        {
+            for(col = 0; col < wd; col += 4)
+            {
+                /*load 8 pixel values from 7:0 pos. relative to cur. pos. (low 4 used)*/
+                src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
+                src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+                src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+                src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                dst0 = _mm_cvtsi128_si32(src0_16x8b);
+                dst1 = _mm_cvtsi128_si32(src1_16x8b);
+                dst2 = _mm_cvtsi128_si32(src2_16x8b);
+                dst3 = _mm_cvtsi128_si32(src3_16x8b);
+
+                /* storing 4 8-bit output values */
+                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
+                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
+                *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */
+                *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */
+
+                pu1_src += 4; /* pointer update */
+                pu1_dst += 4; /* pointer update */
+            } /*  inner for loop ends here(4- output values in single iteration) */
+
+            pu1_src += 4 * src_strd - wd; /* pointer update */
+            pu1_dst += 4 * dst_strd - wd; /* pointer update */
+        }
+    }
+}
+
+/* INTER_PRED_LUMA_COPY */
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Interprediction luma filter for horizontal input
+*
+* @par Description:
+*    Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+*    to the elements pointed by 'pu1_src' and writes to the location pointed
+*    by 'pu1_dst'. The output is downshifted by 6 and clipped to 8 bits
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
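+
+/* Scalar sketch of the 8-tap filtering the intrinsics below vectorise
+ * (reference only; mirrors the offset/shift used in the SIMD path, with
+ * CLIP_U8 assumed from ihevc_macros.h):
+ *
+ *     WORD32 i, sum = 0;
+ *     for(i = 0; i < 8; i++)
+ *         sum += pi1_coeff[i] * pu1_src[col + (i - 3)];
+ *     pu1_dst[col] = CLIP_U8((sum + OFFSET_14_MINUS_BIT_DEPTH)
+ *                                >> SHIFT_14_MINUS_BIT_DEPTH);
+ */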
+void ihevc_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 src_strd,
+                                      WORD32 dst_strd,
+                                      WORD8 *pi1_coeff,
+                                      WORD32 ht,
+                                      WORD32 wd)
+{
+    WORD32 row, col;
+
+    /* all 128 bit registers are named with a suffix mxnb, where m is the */
+    /* number of n bits packed in the register                            */
+    __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
+    __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
+    __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
+    __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b, res_temp8_8x16b;
+    __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b, res_temp17_8x16b, res_temp18_8x16b;
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+    __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
+
+    ASSERT(wd % 4 == 0); /* checking assumption*/
+
+    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+    /* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
+    src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+    zero_8x16b = _mm_set1_epi32(0);
+    offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
+
+    mask_low_32b = _mm_cmpeq_epi16(zero_8x16b, zero_8x16b);
+    mask_high_96b = _mm_srli_si128(mask_low_32b, 12);
+    mask_low_32b = _mm_slli_si128(mask_low_32b, 4);
+
+    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
+    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
+    control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
+    control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* pi1_coeff[0..1] in every 16-bit lane */
+    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* pi1_coeff[2..3] in every 16-bit lane */
+
+    coeff4_5_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_3_8x16b);  /* pi1_coeff[4..5] in every 16-bit lane */
+    coeff6_7_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_4_8x16b);  /* pi1_coeff[6..7] in every 16-bit lane */
+
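+    /* Scheme used below: the source row and a 1-byte-shifted copy are
+     * interleaved with _mm_unpacklo_epi8 so that _mm_maddubs_epi16 forms
+     * two taps of the 8-tap dot product per 16-bit lane; the four partial
+     * sums (taps 0-1, 2-3, 4-5, 6-7) are then added, rounded and packed. */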
+    if(0 == (ht & 1)) /* ht multiple of 2 case */
+    {
+
+        if(0 == (wd & 7)) /* wd = multiple of 8 case */
+        {
+            for(row = 0; row < ht; row += 2)
+            {
+
+                WORD32 offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                for(col = 0; col < wd; col += 8)
+                {
+                    /*load 16 pixel values from row 0*/
+                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));         /* row = 0 */
+
+                    /*load 16 pixel values from row 1*/
+                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
+                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
+                                                                                           /* row = 0 */
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
+
+                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                    res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
+                    res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+                    res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
+
+                    _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
+
+                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row =1 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row =1 */
+                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
+                                                                                              /* row = 1 */
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row =1 */
+                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/                /* row = 1 */
+                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
+
+                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+                    res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b);             /* row = 1 */
+                    res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
+                    res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b);       /* row = 1 */
+
+                    /* store the 8 result pixels of row 1 */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
+
+                    offset += 8; /* advance the column offset */
+                }
+                pu1_src += 2 * src_strd;  /* pointer updates*/
+                pu1_dst += 2 * dst_strd;  /* pointer updates*/
+            }
+        }
+        else /* wd = multiple of 4 case */
+        {
+            for(row = 0; row < ht; row += 2)
+            {
+                int offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                for(col = 0; col < wd; col += 4)
+                {
+                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
+                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
+                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
+                                                                                           /* row = 0 */
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/             /* row = 0 */
+                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
+
+                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                    res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
+                    res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+                    res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
+
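+                    /* read-modify-write merge (a descriptive note): keep the 4
+                       new result pixels in the low 32 bits and restore the 4
+                       existing dst bytes above them, since only 4 output
+                       pixels are valid per iteration */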
+                    res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
+                    res_temp8_8x16b =  _mm_and_si128(res_temp7_8x16b, mask_low_32b);
+                    res_temp7_8x16b =  _mm_and_si128(res_temp5_8x16b, mask_high_96b);
+                    res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
+
+                    _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
+
+                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
+                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
+                                                                                              /* row = 1 */
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/                /* row = 1 */
+                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
+
+                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+                    res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b);             /* row = 1 */
+                    res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
+                    res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b);       /* row = 1 */
+
+                    res_temp17_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
+                    res_temp18_8x16b =  _mm_and_si128(res_temp17_8x16b, mask_low_32b);
+                    res_temp17_8x16b =  _mm_and_si128(res_temp15_8x16b, mask_high_96b);
+                    res_temp15_8x16b = _mm_or_si128(res_temp17_8x16b, res_temp18_8x16b);
+
+                    /* store; only the first 4 pixels carry new results */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
+
+                    offset += 4; /* advance the column offset */
+                }
+                pu1_src += 2 * src_strd;  /* Pointer update */
+                pu1_dst += 2 * dst_strd;  /* Pointer update */
+            }
+        }
+    }
+    else /* odd ht */
+    {
+        if(0 == (wd & 7)) /* multiple of 8 case */
+        {
+            for(row = 0; row < ht; row++)
+            {
+                int offset = 0;
+
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                for(col = 0; col < wd; col += 8)
+                {
+                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
+
+                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
+                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
+                                                                                           /* row = 0 */
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/             /* row = 0 */
+                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
+
+                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                    res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
+                    res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+                    res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
+
+                    /* store the 8 result pixels of the row */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
+
+                    offset += 8; /* advance the column offset */
+                }
+                pu1_src += src_strd;    /* pointer updates*/
+                pu1_dst += dst_strd;    /* pointer updates*/
+            }
+        }
+        else  /* wd = multiple of 4 case */
+        {
+            for(row = 0; row < (ht - 1); row += 2)
+            {
+                int offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                for(col = 0; col < wd; col += 4)
+                {
+                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
+                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
+                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
+                                                                                           /* row = 0 */
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/             /* row = 0 */
+                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
+
+                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                    res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
+                    res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+                    res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
+
+                    res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
+                    res_temp8_8x16b =  _mm_and_si128(res_temp7_8x16b, mask_low_32b);
+                    res_temp7_8x16b =  _mm_and_si128(res_temp5_8x16b, mask_high_96b);
+                    res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
+
+                    _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
+
+                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
+                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
+                                                                                              /* row = 1 */
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/                /* row = 1 */
+                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
+
+                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+                    res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b);             /* row = 1 */
+                    res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
+                    res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b);       /* row = 1 */
+
+                    res_temp17_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
+                    res_temp18_8x16b =  _mm_and_si128(res_temp17_8x16b, mask_low_32b);
+                    res_temp17_8x16b =  _mm_and_si128(res_temp15_8x16b, mask_high_96b);
+                    res_temp15_8x16b = _mm_or_si128(res_temp17_8x16b, res_temp18_8x16b);
+
+                    /* store; only the first 4 pixels carry new results */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
+
+                    offset += 4; /* advance the column offset */
+                }
+                pu1_src += 2 * src_strd;  /* Pointer update */
+                pu1_dst += 2 * dst_strd;  /* Pointer update */
+            }
+            { /* process the last (odd) row outside the loop */
+                int offset = 0;
+                for(col = 0; col < wd; col += 4)
+                {
+                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
+
+                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
+                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
+                                                                                           /* row = 0 */
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/             /* row = 0 */
+                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
+
+                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                    res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
+                    res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+                    res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
+
+                    res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
+                    res_temp8_8x16b =  _mm_and_si128(res_temp7_8x16b, mask_low_32b);
+                    res_temp7_8x16b =  _mm_and_si128(res_temp5_8x16b, mask_high_96b);
+                    res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
+
+                    /* store; only the first 4 pixels of the last row carry new results */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
+
+                    offset += 4; /* advance the column offset */
+                }
+            }
+        }
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Interprediction luma filter for vertical input
+*
+* @par Description:
+*   Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+*   the elements pointed by 'pu1_src' and writes to the location pointed by
+*   'pu1_dst'. The output is downshifted by 6 and clipped to 8 bits.
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
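+/* For reference, a scalar sketch of what this routine computes (illustrative
+   only; the 8-tap loop bound, 'i2_tmp' and CLIP_U8 follow the comments in the
+   SIMD code below, and the taps start 3 rows above the current position,
+   matching the (-3 * src_strd) load):
+
+       for(row = 0; row < ht; row++)
+           for(col = 0; col < wd; col++)
+           {
+               WORD32 i, i2_tmp = 0;
+               for(i = 0; i < 8; i++)
+                   i2_tmp += pi1_coeff[i] * pu1_src[(row + i - 3) * src_strd + col];
+               i2_tmp = (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+               pu1_dst[row * dst_strd + col] = CLIP_U8(i2_tmp);
+           }
+*/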
+void ihevc_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 src_strd,
+                                      WORD32 dst_strd,
+                                      WORD8 *pi1_coeff,
+                                      WORD32 ht,
+                                      WORD32 wd)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_copy;
+    UWORD8 *pu1_dst_copy;
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+    __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
+    __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
+    __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
+    __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
+    __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s17_8x16b, s18_8x16b, s19_8x16b;
+    __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s27_8x16b, s28_8x16b, s29_8x16b;
+    __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s37_8x16b, s38_8x16b, s39_8x16b;
+
+    __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
+    __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
+
+    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+/* load the 8 8-bit filter coefficients; tap pairs are replicated below */
+    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* control mask: selects coefficient pair 0,1 */
+    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* control mask: selects coefficient pair 2,3 */
+    control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* control mask: selects coefficient pair 4,5 */
+    control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* control mask: selects coefficient pair 6,7 */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* pi1_coeff[0], pi1_coeff[1] replicated */
+    coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* pi1_coeff[2], pi1_coeff[3] replicated */
+
+    coeff4_5_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_3_8x16b);  /* pi1_coeff[4], pi1_coeff[5] replicated */
+    coeff6_7_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_4_8x16b);  /* pi1_coeff[6], pi1_coeff[7] replicated */
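+
+/* A descriptive note: _mm_maddubs_epi16 multiplies its unsigned 8-bit source
+   operand by the signed 8-bit coefficients and adds adjacent products into
+   16-bit lanes; with two source rows interleaved by _mm_unpacklo_epi8 and one
+   tap pair replicated across a register, each lane therefore accumulates
+   c[i]*pix(row + i) + c[i+1]*pix(row + i + 1) of the vertical dot-product */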
+
+/* setting values in registers */
+    zero_8x16b = _mm_setzero_si128(); /* for saturated clipping */
+    offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
+    mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);  /* zeroes the low 32 bits, keeps the upper 96 */
+    mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF); /* keeps only the low 32 bits (4 pixels) */
+
+/*  outer for loop starts from here */
+    if(wd % 8 == 0)
+    { /* wd = multiple of 8 case */
+
+        pu1_src_copy = pu1_src;
+        pu1_dst_copy = pu1_dst;
+
+        for(col = 0; col < wd; col += 8)
+        {
+
+            pu1_src = pu1_src_copy + col;
+            pu1_dst = pu1_dst_copy + col;
+
+            PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+            /*load 8 pixel values.*/
+            s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
+
+            /*load 8 pixel values*/
+            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
+
+            s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+
+            s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+            /*load 8 pixel values*/
+            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+            /*load 8 pixel values*/
+            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+            s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+            s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+            /*load 8 pixel values*/
+            s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+            /*load 8 pixel values*/
+            s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+            s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
+
+            s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values*/
+            s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+            /*load 8 pixel values*/
+            s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
+
+            s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
+
+            s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+            s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+            s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+            s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+            s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
+
+            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i2_tmp = CLIP_U8(i2_tmp);*/
+            s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+
+            /* store 8 8-bit output values  */
+            /* Store the output pixels of row 0*/
+            _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+            /* ROW 2*/
+            s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+            s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+            s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values*/
+            s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
+
+            /*load 8 pixel values*/
+            s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
+
+            s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
+
+            s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+            s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+            s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+            s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+            s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
+
+            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i2_tmp = CLIP_U8(i2_tmp);*/
+            s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+
+            /* store 8 8-bit output values  */
+            /* Store the output pixels of row 2*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
+
+
+            /*ROW 1*/
+            s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+            s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+            s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
+
+            s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+            s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
+
+            s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+            s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
+
+            s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+            s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+            s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+            s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+            s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
+
+            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i2_tmp = CLIP_U8(i2_tmp);*/
+            s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+
+            /* store 8 8-bit output values  */
+            /* Store the output pixels of row 1*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s19_8x16b);
+
+
+            /* ROW 3*/
+            s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+            s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+            s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values*/
+            s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
+
+            s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
+
+            s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+            s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+            s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+            s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+            s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
+
+            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i2_tmp = CLIP_U8(i2_tmp);*/
+            s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+            /* store 8 8-bit output values  */
+            /* Store the output pixels of row 3*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
+
+            pu1_src += (8 * src_strd);
+            pu1_dst += (4 * dst_strd);
+
+            for(row = 4; row < ht; row += 4)
+            {
+                PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+                s3_0_16x8b = s3_2_16x8b;
+                s3_1_16x8b = s3_3_16x8b;
+                s3_2_16x8b = s3_4_16x8b;
+
+                s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+                s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+                s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 4)th row*/
+                s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+
+                s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
+                s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+                s4_0_16x8b = s4_2_16x8b;
+                s4_1_16x8b = s4_3_16x8b;
+                s4_2_16x8b = s4_4_16x8b;
+
+                s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+                s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+                s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+                s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+
+                /* store 8 8-bit output values  */
+                /* Store the output pixels of the current row*/
+                _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+                /* row + 2*/
+                s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+                s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+                s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 5)th row*/
+                s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+                /*load 8 pixel values from (cur_row + 6)th row*/
+                s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+                /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+                s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+                s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+                s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+                s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+                s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+                s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+
+                /* store 8 8-bit output values  */
+                /* Store the output pixels of (cur_row+2)*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
+
+
+                /*row + 1*/
+                s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+                s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+                s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+                /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+                s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+                s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+                s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+                s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+                s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+                s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+
+                /* store 8 8-bit output values  */
+                /* Store the output pixels of (cur_row + 1)*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s19_8x16b);
+
+
+                /* row + 3*/
+                s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+                s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+                s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 7)th row*/
+                s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+                /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+                s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+                s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+                s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+                s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+                s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+                s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+                /* store 8 8-bit output values  */
+                /* Store the output pixels of (cur_row+3)*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
+
+                s2_10_16x8b = s2_3_16x8b;
+
+                pu1_src += 4 * src_strd; /* pointer update */
+                pu1_dst += 4 * dst_strd; /* pointer update */
+            }
+        }
+    }
+    else /* wd = multiple of 4 case */
+    {
+
+        pu1_src_copy = pu1_src;
+        pu1_dst_copy = pu1_dst;
+
+        for(col = 0; col < wd; col += 4)
+        {
+
+            pu1_src = pu1_src_copy + col;
+            pu1_dst = pu1_dst_copy + col;
+
+            PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+            /*load 8 pixel values */
+            s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
+
+            s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+
+            s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+            /*load 8 pixel values */
+            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+            s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+            s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+            /*load 8 pixel values */
+            s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+            s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
+
+            s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values */
+            s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
+
+            s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
+
+            s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+            s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+            s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+            s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+            s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
+
+            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i2_tmp = CLIP_U8(i2_tmp);*/
+            s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+            /* merge: low 4 bytes from the new result, upper 4 bytes from the
+               existing dst, since only 4 output pixels are valid here */
+            s5_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+            s6_8x16b =  _mm_and_si128(s5_8x16b, mask_low_32b);
+            s7_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
+            s8_8x16b = _mm_or_si128(s6_8x16b, s7_8x16b);
+            /* store 8 8-bit output values  */
+            /* Store the output pixels of row 0*/
+            _mm_storel_epi64((__m128i *)(pu1_dst), s8_8x16b);
+
+            /* ROW 2*/
+            s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+            s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+            s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values */
+            s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
+
+            s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
+
+            s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+            s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+            s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+            s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+            s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
+
+            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i2_tmp = CLIP_U8(i2_tmp);*/
+            s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+            s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
+            s26_8x16b =  _mm_and_si128(s25_8x16b, mask_low_32b);
+            s27_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
+            s28_8x16b = _mm_or_si128(s26_8x16b, s27_8x16b);
+            /* store 8 8-bit output values  */
+            /* Store the output pixels of row 2*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s28_8x16b);
+
+
+            /*ROW 1*/
+            s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+            s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+            s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
+
+            s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+            s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
+
+            s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+            s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
+
+            s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+            s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+            s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+            s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+            s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
+
+            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i2_tmp = CLIP_U8(i2_tmp);*/
+            s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+            s15_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+            s16_8x16b =  _mm_and_si128(s15_8x16b, mask_low_32b);
+            s17_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
+            s18_8x16b = _mm_or_si128(s16_8x16b, s17_8x16b);
+            /* store 8 8-bit output values  */
+            /* Store the output pixels of row 1*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s18_8x16b);
+
+
+            /* ROW 3*/
+            s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+            s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+            s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values */
+            s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
+
+            s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
+
+            s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+            s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+            s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+            s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+            s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
+
+            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i2_tmp = CLIP_U8(i2_tmp);*/
+            s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+            s35_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
+            s36_8x16b =  _mm_and_si128(s35_8x16b, mask_low_32b);
+            s37_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
+            s38_8x16b = _mm_or_si128(s36_8x16b, s37_8x16b);
+
+            /* store 8 8-bit output values  */
+            /* Store the output pixels of row 3*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s38_8x16b);
+
+            pu1_src += (8 * src_strd);
+            pu1_dst += (4 * dst_strd);
+
+            for(row = 4; row < ht; row += 4)
+            {
+
+                PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+                s3_0_16x8b = s3_2_16x8b;
+                s3_1_16x8b = s3_3_16x8b;
+                s3_2_16x8b = s3_4_16x8b;
+
+                s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+                s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+                s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 4)th row*/
+                s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+
+                s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
+                s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+                s4_0_16x8b = s4_2_16x8b;
+                s4_1_16x8b = s4_3_16x8b;
+                s4_2_16x8b = s4_4_16x8b;
+
+                s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+                s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+                s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+                s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+
+                s5_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+                s6_8x16b =  _mm_and_si128(s5_8x16b, mask_low_32b);
+                s7_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
+                s8_8x16b = _mm_or_si128(s6_8x16b, s7_8x16b);
+
+                /* store 8 8-bit output values  */
+                /* Store the output pixels of the current row*/
+                _mm_storel_epi64((__m128i *)(pu1_dst), s8_8x16b);
+
+                /* row + 2*/
+                s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+                s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+                s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 5)th row*/
+                s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+                /*load 8 pixel values from (cur_row + 6)th row*/
+                s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+                /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+                s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+                s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+                s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+                s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+                s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+                s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+
+                s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
+                s26_8x16b =  _mm_and_si128(s25_8x16b, mask_low_32b);
+                s27_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
+                s28_8x16b = _mm_or_si128(s26_8x16b, s27_8x16b);
+
+                /* store 8 8-bit output values  */
+                /* Store the output pixels of (cur_row+2)*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s28_8x16b);
+
+
+                /*row + 1*/
+                s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+                s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+                s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+                /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+                s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+                s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+                s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+                s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+                s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+                s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+
+                s15_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+                s16_8x16b =  _mm_and_si128(s15_8x16b, mask_low_32b);
+                s17_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
+                s18_8x16b = _mm_or_si128(s16_8x16b, s17_8x16b);
+
+                /* store 8 8-bit output values  */
+                /* Store the output pixels of (cur_row + 1)*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s18_8x16b);
+
+
+                /* row + 3*/
+                s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+                s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+                s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 7)th row*/
+                s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+                /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+                s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+                s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+                s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+                s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+                s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+                s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+                s35_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
+                s36_8x16b =  _mm_and_si128(s35_8x16b, mask_low_32b);
+                s37_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
+                s38_8x16b = _mm_or_si128(s36_8x16b, s37_8x16b);
+
+                /* store 8 8-bit output values  */
+                /* Store the output pixels of (cur_row+3)*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s38_8x16b);
+
+                s2_10_16x8b = s2_3_16x8b;
+
+                pu1_src += 4 * src_strd; /* pointer update */
+                pu1_dst += 4 * dst_strd; /* pointer update */
+            }
+        }
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*       Inter prediction luma filter for copy with 16-bit output
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the location pointed
+*    to by 'pu1_src' to the location pointed to by 'pi2_dst'. The output is
+*    upshifted by 6 bits and is used as input for vertical filtering or
+*    weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_luma_copy_w16out_ssse3(UWORD8 *pu1_src,
+                                             WORD16 *pi2_dst,
+                                             WORD32 src_strd,
+                                             WORD32 dst_strd,
+                                             WORD8 *pi1_coeff,
+                                             WORD32 ht,
+                                             WORD32 wd)
+{
+    WORD32 row, col;
+    __m128i  s3, zero_8x16b;
+
+    ASSERT(wd % 2 == 0); /* checking assumption*/
+    ASSERT(ht % 2 == 0); /* checking assumption*/
+    UNUSED(pi1_coeff);
+    zero_8x16b = _mm_setzero_si128();
+/*  outer for loop starts from here */
+    if(wd % 8 == 0) /* wd = multiple of 8 case */
+    {
+        for(row = 0; row < ht; row += 2)
+        {
+            WORD32 offset = 0;
+            for(col = 0; col < wd; col += 8)
+            {
+/* row =0 */
+                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
+                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+
+                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+                _mm_store_si128((__m128i *)(pi2_dst + offset), s3);
+
+/* row =1 */
+                /*load 16 pixel values from the next row (pu1_src + src_strd)*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col] */
+                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+
+                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+                _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), s3);
+
+                offset += 8; /* offset update */
+            } /* inner for loop ends here (8 output values per iteration) */
+
+            pu1_src += 2 * src_strd; /* pointer update */
+            pi2_dst += 2 * dst_strd; /* pointer update */
+        }
+    }
+    else /* wd = multiple of 4 case */
+    {
+        for(row = 0; row < ht; row += 2)
+        {
+            WORD32 offset = 0;
+            for(col = 0; col < wd; col += 4)
+            {
+/* row =0 */
+                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
+                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+
+                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+                _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
+
+/* row =1 */
+                /*load 16 pixel values from the next row (pu1_src + src_strd)*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col] */
+                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+
+                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
+                offset += 4; /* offset update */
+            } /* inner for loop ends here (4 output values per iteration) */
+
+            pu1_src += 2 * src_strd; /* pointer update */
+            pi2_dst += 2 * dst_strd; /* pointer update */
+        }
+    }
+
+}
+
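+/* For reference, a minimal scalar sketch of what the routine above computes
+   (assuming 8-bit input, so SHIFT_14_MINUS_BIT_DEPTH == 6); the function
+   name below is illustrative and not part of the library:
+
+   static void ref_luma_copy_w16out(UWORD8 *pu1_src, WORD16 *pi2_dst,
+                                    WORD32 src_strd, WORD32 dst_strd,
+                                    WORD32 ht, WORD32 wd)
+   {
+       WORD32 row, col;
+       for(row = 0; row < ht; row++)
+       {
+           for(col = 0; col < wd; col++)
+               pi2_dst[col] = (WORD16)(pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH);
+           pu1_src += src_strd;
+           pi2_dst += dst_strd;
+       }
+   }
+*/
+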
+/**
+*******************************************************************************
+*
+* @brief
+*     Inter prediction luma filter for horizontal 16-bit output
+*
+* @par Description:
+*    Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+*    to the elements pointed to by 'pu1_src' and writes the 16-bit results to
+*    the location pointed to by 'pi2_dst'. No downshifting or clipping is
+*    done and the output is used as an input for vertical filtering or
+*    weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_luma_horz_w16out_ssse3(UWORD8 *pu1_src,
+                                             WORD16 *pi2_dst,
+                                             WORD32 src_strd,
+                                             WORD32 dst_strd,
+                                             WORD8 *pi1_coeff,
+                                             WORD32 ht,
+                                             WORD32 wd)
+{
+    WORD32 row, col;
+
+    /* all 128-bit registers are named with a suffix mxnb, where m is the */
+    /* number of n-bit values packed in the register                      */
+
+    __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
+    __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
+    __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b;
+    __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b;
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+    __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
+
+    ASSERT(wd % 4 == 0); /* checking assumption*/
+
+    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+    /* load 8 8-bit filter coefficients  */
+    src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+
+    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
+    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
+    control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
+    control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* pi1_coeff[0..1] replicated */
+    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* pi1_coeff[2..3] replicated */
+
+    coeff4_5_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_3_8x16b);  /* pi1_coeff[4..5] replicated */
+    coeff6_7_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_4_8x16b);  /* pi1_coeff[6..7] replicated */
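+
+    /* Each coeffK_(K+1) register now holds one 8-bit coefficient pair
+       replicated eight times; interleaving neighbouring source pixels with
+       _mm_unpacklo_epi8 and multiplying with _mm_maddubs_epi16 then yields
+       eight 2-tap partial dot products per instruction, which the adds
+       below accumulate into the full 8-tap sums. */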
+
+    if(0 == (ht & 1)) /* ht multiple of 2 case */
+    {
+
+        if(0 == (wd & 7)) /* wd = multiple of 8 case */
+        {
+            for(row = 0; row < ht; row += 2)
+            {
+
+                WORD32 offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                for(col = 0; col < wd; col += 8)
+                {
+                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
+                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
+                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
+                    /* row = 0 */
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
+
+                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
+                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
+                                                                                              /* row = 1 */
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
+
+                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+                    /* store the 8 16-bit results for row 0 and row 1 */
+                    _mm_store_si128((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
+                    _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
+
+                    offset += 8; /* offset update */
+                }
+                pu1_src += 2 * src_strd;  /* pointer updates*/
+                pi2_dst += 2 * dst_strd;  /* pointer updates*/
+            }
+        }
+        else /* wd = multiple of 4 case */
+        {
+            for(row = 0; row < ht; row += 2)
+            {
+                WORD32 offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                for(col = 0; col < wd; col += 4)
+                {
+                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
+                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
+                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
+                    /* row = 0 */
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
+
+                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
+                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
+                                                                                              /* row = 1 */
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
+
+                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+                    /* store the 4 16-bit results for row 0 and row 1 */
+                    _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
+                    _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
+
+                    offset += 4; /* offset update */
+                }
+                pu1_src += 2 * src_strd;  /* Pointer update */
+                pi2_dst += 2 * dst_strd;  /* Pointer update */
+            }
+        }
+    }
+    else /* odd ht */
+    {
+        if(0 == (wd & 7)) /* multiple of 8 case */
+        {
+            for(row = 0; row < ht; row++)
+            {
+                WORD32 offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+
+                for(col = 0; col < wd; col += 8)
+                {
+                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
+
+                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
+                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
+                    /* row = 0 */
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
+
+                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                    /* store the 8 16-bit results */
+                    _mm_store_si128((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
+
+                    offset += 8; /* offset update */
+                }
+                pu1_src += src_strd;    /* pointer updates*/
+                pi2_dst += dst_strd;    /* pointer updates*/
+            }
+        }
+        else  /* wd = multiple of 4 case */
+        {
+            for(row = 0; row < (ht - 1); row += 2)
+            {
+                WORD32 offset = 0;
+
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                for(col = 0; col < wd; col += 4)
+                {
+                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
+                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
+                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
+                                                                                           /* row = 0 */
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
+
+                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
+                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
+                                                                                              /* row = 1 */
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
+
+                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
+                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
+                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
+
+                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+                    /* store the 4 16-bit results for row 0 and row 1 */
+                    _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
+                    _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
+
+                    offset += 4; /* offset update */
+                }
+                pu1_src += 2 * src_strd;  /* Pointer update */
+                pi2_dst += 2 * dst_strd;  /* Pointer update */
+            }
+            { /* last (odd) row handled outside the loop */
+                WORD32 offset = 0;
+                for(col = 0; col < wd; col += 4)
+                {
+                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
+
+                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
+                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
+                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
+                                                                                           /* row = 0 */
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
+
+                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
+                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
+                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
+                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
+
+                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                    /* store the 4 16-bit results */
+                    _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
+
+                    offset += 4; /* offset update */
+                }
+            }
+        }
+    }
+}
+
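+/* A minimal scalar sketch of the 8-tap horizontal filter above (16-bit
+   output, no rounding or clipping); the function name is illustrative:
+
+   static void ref_luma_horz_w16out(UWORD8 *pu1_src, WORD16 *pi2_dst,
+                                    WORD32 src_strd, WORD32 dst_strd,
+                                    WORD8 *pi1_coeff, WORD32 ht, WORD32 wd)
+   {
+       WORD32 row, col, i;
+       for(row = 0; row < ht; row++)
+       {
+           for(col = 0; col < wd; col++)
+           {
+               WORD32 i2_tmp = 0;
+               for(i = 0; i < 8; i++) // 8 taps, at offsets -3..+4 from col
+                   i2_tmp += pi1_coeff[i] * pu1_src[col + i - 3];
+               pi2_dst[col] = (WORD16)i2_tmp;
+           }
+           pu1_src += src_strd;
+           pi2_dst += dst_strd;
+       }
+   }
+*/
+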
+/**
+*******************************************************************************
+*
+* @brief
+*      Inter prediction luma filter for vertical 16-bit output
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+*    the elements pointed to by 'pu1_src' and writes the 16-bit results to the
+*    location pointed to by 'pi2_dst'. No downshifting or clipping is done and
+*    the output is used as an input for weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_luma_vert_w16out_ssse3(UWORD8 *pu1_src,
+                                             WORD16 *pi2_dst,
+                                             WORD32 src_strd,
+                                             WORD32 dst_strd,
+                                             WORD8 *pi1_coeff,
+                                             WORD32 ht,
+                                             WORD32 wd)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_copy;
+    WORD16 *pi2_dst_copy;
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+    __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b;
+    __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
+    __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
+    __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
+    __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b;
+    __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b;
+    __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b;
+
+
+    __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
+
+/* load 8 8-bit filter coefficients  */
+    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
+    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
+    control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
+    control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* pi1_coeff[0..1] replicated */
+    coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* pi1_coeff[2..3] replicated */
+
+    coeff4_5_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_3_8x16b);  /* pi1_coeff[4..5] replicated */
+    coeff6_7_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_4_8x16b);  /* pi1_coeff[6..7] replicated */
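+
+    /* As in the horizontal filter, each coeffK_(K+1) register holds one
+       replicated 8-bit coefficient pair; here two vertically adjacent rows
+       are interleaved with _mm_unpacklo_epi8 so that _mm_maddubs_epi16
+       produces the 2-tap column sums accumulated below. */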
+
+
+/*  outer for loop starts from here */
+    if((wd % 8) == 0)
+    { /* wd = multiple of 8 case */
+
+        pu1_src_copy = pu1_src;
+        pi2_dst_copy = pi2_dst;
+
+        for(col = 0; col < wd; col += 8)
+        {
+
+            pu1_src = pu1_src_copy + col;
+            pi2_dst = pi2_dst_copy + col;
+
+            PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+            /*load 8 pixel values */
+            s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
+
+            s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+
+            s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+            /*load 8 pixel values */
+            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+            s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+            s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+            /*load 8 pixel values */
+            s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+            s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
+
+            s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values */
+            s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
+
+            s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
+
+            s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+            s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+            s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+            s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+            /* store 8 16-bit output values  */
+            /* Store the output pixels of row 0*/
+            _mm_store_si128((__m128i *)(pi2_dst), s6_8x16b);
+
+            /* ROW 2*/
+            s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+            s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+            s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values */
+            s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
+
+            s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
+
+            s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+            s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+            s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+            s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+            /* store 8 16-bit output values  */
+            /* Store the output pixels of row 2*/
+            _mm_store_si128((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
+
+
+            /*ROW 1*/
+            s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+            s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+            s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
+
+            s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+            s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
+
+            s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+            s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
+
+            s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+            s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+            s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+            s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+
+            /* store 8 16-bit output values  */
+            /* Store the output pixels of row 1*/
+            _mm_store_si128((__m128i *)(pi2_dst + (dst_strd)), s16_8x16b);
+
+
+            /* ROW 3*/
+            s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+            s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+            s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values */
+            s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
+
+            s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
+
+            s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+            s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+            s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+            s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+
+            /* store 8 16-bit output values  */
+            /* Store the output pixels of row 3*/
+            _mm_store_si128((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
+
+            pu1_src += (8 * src_strd);
+            pi2_dst += (4 * dst_strd);
+
+            for(row = 4; row < ht; row += 4)
+            {
+
+                PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                s3_0_16x8b = s3_2_16x8b;
+                s3_1_16x8b = s3_3_16x8b;
+                s3_2_16x8b = s3_4_16x8b;
+
+                s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+                s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+                s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 4)th row*/
+                s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+
+                s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
+                s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+                s4_0_16x8b = s4_2_16x8b;
+                s4_1_16x8b = s4_3_16x8b;
+                s4_2_16x8b = s4_4_16x8b;
+
+                s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+                s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+                s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+                /* store 8 16-bit output values  */
+                /* Store the output pixels of cur_row*/
+                _mm_store_si128((__m128i *)(pi2_dst), s6_8x16b);
+
+                /* row + 2*/
+                s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+                s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+                s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 5)th row*/
+                s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+                /*load 8 pixel values from (cur_row + 6)th row*/
+                s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+                /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+                s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+                s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+                s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+                s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+                s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+                /* store 8 16-bit output values  */
+                /* Store the output pixels of (cur_row+2)*/
+                _mm_store_si128((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
+
+
+                /*row + 1*/
+                s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+                s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+                s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+                /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+                s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+                s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+                s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+                s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+                s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+
+                /* store 8 16-bit output values  */
+                /* Store the output pixels of (cur_row + 1)*/
+                _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s16_8x16b);
+
+
+                /* row + 3*/
+                s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+                s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+                s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 7)th row*/
+                s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+                /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+                s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+                s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+                s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+                s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+                s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+                /* store 8 16-bit output values  */
+                /* Store the output pixels of (cur_row+3)*/
+                _mm_store_si128((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
+
+                s2_10_16x8b = s2_3_16x8b;
+
+
+                pu1_src += 4 * src_strd; /* pointer update */
+                pi2_dst += 4 * dst_strd; /* pointer update */
+            }
+        }
+    }
+    else /* wd = multiple of 4 case */
+    {
+
+        pu1_src_copy = pu1_src;
+        pi2_dst_copy = pi2_dst;
+
+        for(col = 0; col < wd; col += 4)
+        {
+
+            pu1_src = pu1_src_copy + col;
+            pi2_dst = pi2_dst_copy + col;
+
+            PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+            /*load 8 pixel values */
+            s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
+
+            s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+
+            s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+            /*load 8 pixel values */
+            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+            s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+            s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+            /*load 8 pixel values */
+            s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+            s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
+
+            s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values */
+            s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
+
+            s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
+
+            s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+            s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+            s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+            s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+            /* store 4 16-bit output values  */
+            /* Store the output pixels of row 0*/
+            _mm_storel_epi64((__m128i *)(pi2_dst), s6_8x16b);
+
+            /* ROW 2*/
+            s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+            s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+            s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values */
+            s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
+
+            /*load 8 pixel values */
+            s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
+
+            s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
+
+            s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+            s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+            s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+            s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+            /* store 4 16-bit output values  */
+            /* Store the output pixels of row 2*/
+            _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
+
+
+            /*ROW 1*/
+            s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+            s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+            s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
+
+            s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+            s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
+
+            s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+            s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
+
+            s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+            s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+            s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+            s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+
+            /* store 4 16-bit output values  */
+            /* Store the output pixels of row 1*/
+            _mm_storel_epi64((__m128i *)(pi2_dst + (dst_strd)), s16_8x16b);
+
+
+            /* ROW 3*/
+            s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+            s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+            s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+            /*load 8 pixel values */
+            s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
+
+            s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
+
+            s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+            s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+            s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+            s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+            /* store 4 16-bit output values  */
+            /* Store the output pixels of row 3*/
+            _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
+
+            pu1_src += (8 * src_strd);
+            pi2_dst += (4 * dst_strd);
+
+            for(row = 4; row < ht; row += 4)
+            {
+
+                PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                s3_0_16x8b = s3_2_16x8b;
+                s3_1_16x8b = s3_3_16x8b;
+                s3_2_16x8b = s3_4_16x8b;
+
+                s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+                s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+                s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 4)th row*/
+                s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+
+                s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
+                s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+                s4_0_16x8b = s4_2_16x8b;
+                s4_1_16x8b = s4_3_16x8b;
+                s4_2_16x8b = s4_4_16x8b;
+
+                s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+                s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+                s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+                /* store 4 16-bit output values  */
+                /* Store the output pixels of cur_row*/
+                _mm_storel_epi64((__m128i *)(pi2_dst), s6_8x16b);
+
+                /* row + 2*/
+                s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+                s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+                s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 5)th row*/
+                s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+                /*load 8 pixel values from (cur_row + 6)th row*/
+                s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+                /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+                s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+                s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+                s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+                s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+                s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+                /* store 4 16-bit output values  */
+                /* Store the output pixels of (cur_row+2)*/
+                _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
+
+
+                /*row + 1*/
+                s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+                s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+                s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+                /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+                s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+                s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+                s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+                s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+                s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+                /* store 4 16-bit output values  */
+                /* Store the output pixels of (cur_row + 1)*/
+                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s16_8x16b);
+
+
+                /* row + 3*/
+                s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+                s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+                s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 7)th row*/
+                s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+                /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+                s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+                s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+                s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+                s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+                s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+                /* store 4 16-bit output values  */
+                /* Store the output pixels of (cur_row+3)*/
+                _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
+
+                s2_10_16x8b = s2_3_16x8b;
+
+                pu1_src += 4 * src_strd; /* pointer update */
+                pi2_dst += 4 * dst_strd; /* pointer update */
+            }
+        }
+    }
+}
+
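+/* For reference, a minimal scalar sketch (illustrative only, not part of the
+   library, helper name hypothetical) of the 8-tap vertical MAC that the
+   SSSE3 loop above vectorizes: each 16-bit output is the plain sum of
+   products over the 8 taps, with no shift or rounding at this stage.  The
+   -3..+4 tap window mirrors the prologue loads of the sibling functions
+   below.
+
+   static WORD16 luma_vert_w16out_ref(UWORD8 *pu1_src, WORD32 src_strd,
+                                      WORD8 *pi1_coeff)
+   {
+       WORD32 i, i4_tmp = 0;
+       for(i = 0; i < 8; i++)
+           i4_tmp += pi1_coeff[i] * pu1_src[(i - 3) * src_strd];
+       return (WORD16)i4_tmp;
+   }
+*/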
+/**
+*******************************************************************************
+*
+* @brief
+*
+*        Luma vertical filter for 16-bit input.
+*
+* @par Description:
+*   Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+*   the elements pointed to by 'pi2_src' and writes to the location pointed
+*   to by 'pu1_dst'. Input is 16 bits. The filter output is downshifted by 12
+*   and clipped to lie between 0 and 255.
+*
+* @param[in] pi2_src
+*  WORD16 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_luma_vert_w16inp_ssse3(WORD16 *pi2_src,
+                                             UWORD8 *pu1_dst,
+                                             WORD32 src_strd,
+                                             WORD32 dst_strd,
+                                             WORD8 *pi1_coeff,
+                                             WORD32 ht,
+                                             WORD32 wd)
+{
+    WORD32 row, col;
+    WORD16 *pi2_src_copy;
+    UWORD8 *pu1_dst_copy;
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+    __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b, s9_8x16b;
+    __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
+    __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
+    __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
+    __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s18_8x16b, s19_8x16b;
+    __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s28_8x16b, s29_8x16b;
+    __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s38_8x16b, s39_8x16b;
+
+    __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b, sign_reg;
+
+/* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
+    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+    zero_8x16b = _mm_setzero_si128();
+    sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
+    s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg);
+
+    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* pi1_coeff[0], pi1_coeff[1] */
+    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* pi1_coeff[2], pi1_coeff[3] */
+
+    coeff4_5_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(2, 2, 2, 2));  /* pi1_coeff[4], pi1_coeff[5] */
+    coeff6_7_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(3, 3, 3, 3));  /* pi1_coeff[6], pi1_coeff[7] */
+
+
+/* setting values in registers */
+    offset_8x16b = _mm_set1_epi32(OFFSET_14_MINUS_BIT_DEPTH); /* rounding offset for the second downshift */
+    mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
+    mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);
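+    /* Note: despite the names, mask_low_32b clears the low 32 bits and
+       mask_high_96b keeps only the low 32 bits.  Together they implement a
+       read-modify-write for 4-pixel-wide stores: the new pixels come from
+       the low 32 bits of the filter result, the remaining bytes from a
+       prior load of the destination, and the OR of the two is written back
+       with a single 8-byte store. */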
+
+
+    pi2_src_copy = pi2_src;
+    pu1_dst_copy = pu1_dst;
+
+/*  outer for loop starts from here */
+    for(col = 0; col < wd; col += 4)
+    {
+
+        pi2_src = pi2_src_copy + col;
+        pu1_dst = pu1_dst_copy + col;
+
+        /*load 4 pixel values */
+        s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-3 * src_strd)));
+
+        /*load 4 pixel values */
+        s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-2 * src_strd)));
+
+        s3_0_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
+
+        s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+        /*load 4 pixel values */
+        s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));
+
+        /*load 4 pixel values */
+        s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));
+
+        s3_1_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);
+
+        s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+        /*load 4 pixel values */
+        s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));
+
+        /*load 4 pixel values */
+        s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+        s3_2_16x8b = _mm_unpacklo_epi16(s2_4_16x8b, s2_5_16x8b);
+
+        s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+        /*load 4 pixel values */
+        s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+        /*load 4 pixel values */
+        s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (4 * src_strd)));
+
+        s3_3_16x8b = _mm_unpacklo_epi16(s2_6_16x8b, s2_7_16x8b);
+
+        s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+        s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
+        s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
+        s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);
+
+        /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+        s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* add the rounding offset: i4_tmp + OFFSET_14_MINUS_BIT_DEPTH */
+        s9_8x16b = _mm_add_epi32(s8_8x16b, offset_8x16b);
+
+        /* second downshift: i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+        s8_8x16b = _mm_srai_epi32(s9_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        s8_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+        /* i4_tmp = CLIP_U8(i4_tmp); */
+        s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+
+        s4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+        s5_8x16b =  _mm_and_si128(s4_8x16b, mask_low_32b);
+        s6_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
+        s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+        /* store 4 8-bit output values; the other dst bytes are preserved */
+        /* Store the output pixels of row 0 */
+        _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+        /* ROW 2*/
+        s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
+        s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
+        s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+        /*load 4 pixel values */
+        s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (5 * src_strd)));
+
+        /*load 4 pixel values */
+        s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (6 * src_strd)));
+
+        s3_4_16x8b = _mm_unpacklo_epi16(s2_8_16x8b, s2_9_16x8b);
+
+        s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+        s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
+        s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
+        s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);
+
+        /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+        s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* add the rounding offset: i4_tmp + OFFSET_14_MINUS_BIT_DEPTH */
+        s29_8x16b = _mm_add_epi32(s28_8x16b, offset_8x16b);
+
+        /* second downshift: i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+        s28_8x16b = _mm_srai_epi32(s29_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        s28_8x16b = _mm_packs_epi32(s28_8x16b, zero_8x16b);
+
+        /* i4_tmp = CLIP_U8(i4_tmp); */
+        s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+
+        s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
+        s25_8x16b =  _mm_and_si128(s24_8x16b, mask_low_32b);
+        s26_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
+        s29_8x16b = _mm_or_si128(s25_8x16b, s26_8x16b);
+
+        /* store 4 8-bit output values; the other dst bytes are preserved */
+        /* Store the output pixels of row 2 */
+        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
+
+
+        /*ROW 1*/
+        s4_0_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);
+
+        s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+        s4_1_16x8b = _mm_unpacklo_epi16(s2_3_16x8b, s2_4_16x8b);
+
+        s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+        s4_2_16x8b = _mm_unpacklo_epi16(s2_5_16x8b, s2_6_16x8b);
+
+        s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+        s4_3_16x8b = _mm_unpacklo_epi16(s2_7_16x8b, s2_8_16x8b);
+
+        s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+        s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
+        s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
+        s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);
+
+        /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+        s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* add the rounding offset: i4_tmp + OFFSET_14_MINUS_BIT_DEPTH */
+        s19_8x16b = _mm_add_epi32(s18_8x16b, offset_8x16b);
+
+        /* second downshift: i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+        s18_8x16b = _mm_srai_epi32(s19_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        s18_8x16b = _mm_packs_epi32(s18_8x16b, zero_8x16b);
+
+        /* i4_tmp = CLIP_U8(i4_tmp); */
+        s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+
+        s14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (dst_strd)));
+        s15_8x16b =  _mm_and_si128(s14_8x16b, mask_low_32b);
+        s16_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
+        s19_8x16b = _mm_or_si128(s15_8x16b, s16_8x16b);
+
+        /* store 4 8-bit output values; the other dst bytes are preserved */
+        /* Store the output pixels of row 1 */
+        _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s19_8x16b);
+
+
+        /* ROW 3*/
+        s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
+        s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
+        s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+        /*load 4 pixel values */
+        s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (7 * src_strd)));
+
+        s4_4_16x8b = _mm_unpacklo_epi16(s2_9_16x8b, s2_10_16x8b);
+
+        s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+        s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
+        s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
+        s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);
+
+        /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+        s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* add the rounding offset: i4_tmp + OFFSET_14_MINUS_BIT_DEPTH */
+        s39_8x16b = _mm_add_epi32(s38_8x16b, offset_8x16b);
+
+        /* second downshift: i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+        s38_8x16b = _mm_srai_epi32(s39_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        s38_8x16b = _mm_packs_epi32(s38_8x16b, zero_8x16b);
+
+        /* i4_tmp = CLIP_U8(i4_tmp); */
+        s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+        s34_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
+        s35_8x16b =  _mm_and_si128(s34_8x16b, mask_low_32b);
+        s36_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
+        s39_8x16b = _mm_or_si128(s35_8x16b, s36_8x16b);
+
+        /* store 4 8-bit output values; the other dst bytes are preserved */
+        /* Store the output pixels of row 3 */
+        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
+
+        pi2_src += (8 * src_strd);
+        pu1_dst += (4 * dst_strd);
+
+        for(row = 4; row < ht; row += 4)
+        {
+
+            s3_0_16x8b = s3_2_16x8b;
+            s3_1_16x8b = s3_3_16x8b;
+            s3_2_16x8b = s3_4_16x8b;
+
+            s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
+            s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
+            s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+            /*load 4 pixel values from (cur_row + 4)th row*/
+            s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src));
+
+            s3_3_16x8b = _mm_unpacklo_epi16(s2_10_16x8b, s2_0_16x8b);
+            s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+            s4_0_16x8b = s4_2_16x8b;
+            s4_1_16x8b = s4_3_16x8b;
+            s4_2_16x8b = s4_4_16x8b;
+
+            s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
+            s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
+            s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);
+
+            /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+            s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* add the rounding offset: i4_tmp + OFFSET_14_MINUS_BIT_DEPTH */
+            s9_8x16b = _mm_add_epi32(s8_8x16b, offset_8x16b);
+
+            /* second downshift: i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s8_8x16b = _mm_srai_epi32(s9_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            s8_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+            /* i4_tmp = CLIP_U8(i4_tmp); */
+            s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+
+            s4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+            s5_8x16b =  _mm_and_si128(s4_8x16b, mask_low_32b);
+            s6_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
+            s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+            /* store 4 8-bit output values; the other dst bytes are preserved */
+            /* Store the output pixels of (cur_row) */
+            _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+/* row + 2*/
+            s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
+            s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
+            s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+            /*load 4 pixel values from (cur_row + 5)th row*/
+            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
+
+            /*load 4 pixel values from (cur_row + 6)th row*/
+            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+            /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+            s3_4_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);
+
+            s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+            s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
+            s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
+            s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);
+
+            /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+            s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* add the rounding offset: i4_tmp + OFFSET_14_MINUS_BIT_DEPTH */
+            s29_8x16b = _mm_add_epi32(s28_8x16b, offset_8x16b);
+
+            /* second downshift: i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s28_8x16b = _mm_srai_epi32(s29_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            s28_8x16b = _mm_packs_epi32(s28_8x16b, zero_8x16b);
+
+            /* i4_tmp = CLIP_U8(i4_tmp); */
+            s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+
+            s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
+            s25_8x16b =  _mm_and_si128(s24_8x16b, mask_low_32b);
+            s26_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
+            s29_8x16b = _mm_or_si128(s25_8x16b, s26_8x16b);
+
+            /* store 4 8-bit output values; the other dst bytes are preserved */
+            /* Store the output pixels of (cur_row+2) */
+            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
+
+
+/*row + 1*/
+            s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
+            s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
+            s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+            /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+            s4_3_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
+            s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+            s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
+            s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
+            s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);
+
+            /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+            s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* add the rounding offset: i4_tmp + OFFSET_14_MINUS_BIT_DEPTH */
+            s19_8x16b = _mm_add_epi32(s18_8x16b, offset_8x16b);
+
+            /* second downshift: i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s18_8x16b = _mm_srai_epi32(s19_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            s18_8x16b = _mm_packs_epi32(s18_8x16b, zero_8x16b);
+
+            /* i4_tmp = CLIP_U8(i4_tmp); */
+            s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+
+            s14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+            s15_8x16b =  _mm_and_si128(s14_8x16b, mask_low_32b);
+            s16_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
+            s19_8x16b = _mm_or_si128(s15_8x16b, s16_8x16b);
+
+            /* store 4 8-bit output values; the other dst bytes are preserved */
+            /* Store the output pixels of (cur_row + 1) */
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s19_8x16b);
+
+
+/* row + 3*/
+            s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
+            s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
+            s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+            /*load 4 pixel values from (cur_row + 7)th row*/
+            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+            /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+            s4_4_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);
+
+            s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+            s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
+            s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
+            s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);
+
+            /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+            s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* add the rounding offset: i4_tmp + OFFSET_14_MINUS_BIT_DEPTH */
+            s39_8x16b = _mm_add_epi32(s38_8x16b, offset_8x16b);
+
+            /* second downshift: i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+            s38_8x16b = _mm_srai_epi32(s39_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            s38_8x16b = _mm_packs_epi32(s38_8x16b, zero_8x16b);
+
+            /* i4_tmp = CLIP_U8(i4_tmp); */
+            s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+            s34_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
+            s35_8x16b =  _mm_and_si128(s34_8x16b, mask_low_32b);
+            s36_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
+            s39_8x16b = _mm_or_si128(s35_8x16b, s36_8x16b);
+
+            /* store 4 8-bit output values; the other dst bytes are preserved */
+            /* Store the output pixels of (cur_row+3) */
+            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
+
+            s2_10_16x8b = s2_3_16x8b;
+
+            pi2_src += 4 * src_strd; /* pointer update */
+            pu1_dst += 4 * dst_strd; /* pointer update */
+        }
+    }
+
+}
+
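+/* For reference, a minimal scalar sketch (illustrative only, not part of the
+   library, helper name hypothetical) of one output pixel of the function
+   above: an 8-tap MAC on the 16-bit intermediates, a first downshift, a
+   rounded second downshift (12 bits in total, as documented) and a clip to
+   [0, 255] via the CLIP_U8 macro referenced in the comments.
+
+   static UWORD8 luma_vert_w16inp_ref(WORD16 *pi2_src, WORD32 src_strd,
+                                      WORD8 *pi1_coeff)
+   {
+       WORD32 i, i4_tmp = 0;
+       for(i = 0; i < 8; i++)
+           i4_tmp += pi1_coeff[i] * pi2_src[(i - 3) * src_strd];
+       i4_tmp = i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH;
+       i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+       return (UWORD8)CLIP_U8(i4_tmp);
+   }
+*/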
+
+/**
+*******************************************************************************
+*
+* @brief
+*      Luma prediction filter for vertical 16-bit input and output
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+*    the elements pointed to by 'pi2_src' and writes to the location pointed
+*    to by 'pi2_dst'. Input is 16 bits. The filter output is downshifted by 6
+*    and 8192 is subtracted to store it as a 16-bit number. The output is used
+*    as an input to weighted prediction.
+*
+* @param[in] pi2_src
+*  WORD16 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3(WORD16 *pi2_src,
+                                                    WORD16 *pi2_dst,
+                                                    WORD32 src_strd,
+                                                    WORD32 dst_strd,
+                                                    WORD8 *pi1_coeff,
+                                                    WORD32 ht,
+                                                    WORD32 wd)
+{
+    WORD32 row, col;
+    WORD16 *pi2_src_copy;
+    WORD16 *pi2_dst_copy;
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+    __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b, s9_8x16b;
+    __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
+    __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
+    __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
+    __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s18_8x16b, s19_8x16b;
+    __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s28_8x16b, s29_8x16b;
+    __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s38_8x16b, s39_8x16b;
+
+    __m128i zero_8x16b, offset_8x16b, sign_reg;
+
+/* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
+    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+    zero_8x16b = _mm_setzero_si128();
+    sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
+    s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg);
+
+    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* pi1_coeff[0], pi1_coeff[1] */
+    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* pi1_coeff[2], pi1_coeff[3] */
+
+    coeff4_5_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(2, 2, 2, 2));  /* pi1_coeff[4], pi1_coeff[5] */
+    coeff6_7_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(3, 3, 3, 3));  /* pi1_coeff[6], pi1_coeff[7] */
+
+
+/* setting values in register */
+    offset_8x16b = _mm_set1_epi32(OFFSET14); /* offset, subtracted after the downshift */
+
+    pi2_src_copy = pi2_src;
+    pi2_dst_copy = pi2_dst;
+
+/*  outer for loop starts from here */
+    for(col = 0; col < wd; col += 4)
+    {
+
+        pi2_src = pi2_src_copy + col;
+        pi2_dst = pi2_dst_copy + col;
+
+        /*load 4 pixel values*/
+        s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-3 * src_strd)));
+
+        /*load 4 pixel values*/
+        s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-2 * src_strd)));
+
+        s3_0_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
+
+        s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+        /*load 4 pixel values*/
+        s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));
+
+        /*load 4 pixel values*/
+        s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));
+
+        s3_1_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);
+
+        s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+        /*load 4 pixel values*/
+        s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));
+
+        /*load 4 pixel values*/
+        s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+        s3_2_16x8b = _mm_unpacklo_epi16(s2_4_16x8b, s2_5_16x8b);
+
+        s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+        /*load 4 pixel values*/
+        s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+        /*load 4 pixel values*/
+        s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (4 * src_strd)));
+
+        s3_3_16x8b = _mm_unpacklo_epi16(s2_6_16x8b, s2_7_16x8b);
+
+        s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+        s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
+        s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
+        s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);
+
+        /* downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+        s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* subtract the offset: (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
+        s9_8x16b = _mm_sub_epi32(s8_8x16b, offset_8x16b);
+
+        s8_8x16b = _mm_packs_epi32(s9_8x16b, zero_8x16b);
+
+        /* store 4 16-bit output values  */
+        /* Store the output pixels of row 0 */
+        _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);
+
+        /* ROW 2*/
+        s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
+        s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
+        s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+        /*load 4 pixel values*/
+        s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (5 * src_strd)));
+
+        /*load 4 pixel values*/
+        s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (6 * src_strd)));
+
+        s3_4_16x8b = _mm_unpacklo_epi16(s2_8_16x8b, s2_9_16x8b);
+
+        s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+        s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
+        s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
+        s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);
+
+        /* downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+        s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* subtract the offset: (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
+        s29_8x16b = _mm_sub_epi32(s28_8x16b, offset_8x16b);
+
+        s28_8x16b = _mm_packs_epi32(s29_8x16b, zero_8x16b);
+
+        /* store 4 16-bit output values  */
+        /* Store the output pixels of row 2 */
+        _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s28_8x16b);
+
+
+        /*ROW 1*/
+        s4_0_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);
+
+        s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+        s4_1_16x8b = _mm_unpacklo_epi16(s2_3_16x8b, s2_4_16x8b);
+
+        s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+        s4_2_16x8b = _mm_unpacklo_epi16(s2_5_16x8b, s2_6_16x8b);
+
+        s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+        s4_3_16x8b = _mm_unpacklo_epi16(s2_7_16x8b, s2_8_16x8b);
+
+        s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+        s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
+        s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
+        s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);
+
+        /* downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+        s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* subtract the offset: (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
+        s19_8x16b = _mm_sub_epi32(s18_8x16b, offset_8x16b);
+
+        s18_8x16b = _mm_packs_epi32(s19_8x16b, zero_8x16b);
+
+        /* store 4 16-bit output values  */
+        /* Store the output pixels of row 1 */
+        _mm_storel_epi64((__m128i *)(pi2_dst + (dst_strd)), s18_8x16b);
+
+
+        /* ROW 3*/
+        s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
+        s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
+        s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+        /*load 4 pixel values*/
+        s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (7 * src_strd)));
+
+        s4_4_16x8b = _mm_unpacklo_epi16(s2_9_16x8b, s2_10_16x8b);
+
+        s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+        s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
+        s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
+        s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);
+
+        /* downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+        s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* subtract the offset: (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
+        s39_8x16b = _mm_sub_epi32(s38_8x16b, offset_8x16b);
+
+        s38_8x16b = _mm_packs_epi32(s39_8x16b, zero_8x16b);
+
+        /* store 4 16-bit output values  */
+        /* Store the output pixels of row 3 */
+        _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s38_8x16b);
+
+        pi2_src += (8 * src_strd);
+        pi2_dst += (4 * dst_strd);
+
+        for(row = 4; row < ht; row += 4)
+        {
+
+            s3_0_16x8b = s3_2_16x8b;
+            s3_1_16x8b = s3_3_16x8b;
+            s3_2_16x8b = s3_4_16x8b;
+
+            s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
+            s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
+            s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+            /*load 4 pixel values from (cur_row + 4)th row*/
+            s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src));
+
+            s3_3_16x8b = _mm_unpacklo_epi16(s2_10_16x8b, s2_0_16x8b);
+            s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+            s4_0_16x8b = s4_2_16x8b;
+            s4_1_16x8b = s4_3_16x8b;
+            s4_2_16x8b = s4_4_16x8b;
+
+            s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
+            s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
+            s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);
+
+            /* downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+            s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* subtract the offset: (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
+            s9_8x16b = _mm_sub_epi32(s8_8x16b, offset_8x16b);
+
+            s8_8x16b = _mm_packs_epi32(s9_8x16b, zero_8x16b);
+
+            /* store 4 16-bit output values  */
+            /* Store the output pixels of (cur_row) */
+            _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);
+
+/* row + 2*/
+            s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
+            s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
+            s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+            /*load 4 pixel values from (cur_row + 5)th row*/
+            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
+
+            /*load 4 pixel values from (cur_row + 6)th row*/
+            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+            /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+            s3_4_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);
+
+            s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+            s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
+            s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
+            s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);
+
+            /* downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+            s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* subtract the offset: (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
+            s29_8x16b = _mm_sub_epi32(s28_8x16b, offset_8x16b);
+
+            s28_8x16b = _mm_packs_epi32(s29_8x16b, zero_8x16b);
+
+            /* store 4 16-bit output values  */
+            /* Store the output pixels of (cur_row+2) */
+            _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s28_8x16b);
+
+
+/*row + 1*/
+            s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
+            s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
+            s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+            /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+            s4_3_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
+            s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+            s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
+            s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
+            s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);
+
+            /* downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+            s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* subtract the offset: (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
+            s19_8x16b = _mm_sub_epi32(s18_8x16b, offset_8x16b);
+
+            s18_8x16b = _mm_packs_epi32(s19_8x16b, zero_8x16b);
+
+            /* store 4 16-bit output values  */
+            /* Store the output pixels of (cur_row + 1) */
+            _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s18_8x16b);
+
+
+/* row + 3*/
+            s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
+            s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
+            s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+            /*load 4 pixel values from (cur_row + 7)th row*/
+            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+            /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+            s4_4_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);
+
+            s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+            s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
+            s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
+            s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);
+
+            /* downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
+            s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* subtract the offset: (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
+            s39_8x16b = _mm_sub_epi32(s38_8x16b, offset_8x16b);
+
+            s38_8x16b = _mm_packs_epi32(s39_8x16b, zero_8x16b);
+
+            /* store 4 16-bit output values  */
+            /* Store the output pixels of (cur_row+3) */
+            _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s38_8x16b);
+
+            s2_10_16x8b = s2_3_16x8b;
+
+            pi2_src += 4 * src_strd; /* pointer update */
+            pi2_dst += 4 * dst_strd; /* pointer update */
+        }
+    }
+
+}
+
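+/* For reference, a minimal scalar sketch (illustrative only, not part of the
+   library, helper name hypothetical) of one output of the function above:
+   the 8-tap MAC on the 16-bit intermediates is downshifted by
+   SHIFT_14_MINUS_BIT_DEPTH and OFFSET14 (8192, per the description above) is
+   subtracted before the 16-bit store that feeds weighted prediction.
+
+   static WORD16 luma_vert_w16inp_w16out_ref(WORD16 *pi2_src, WORD32 src_strd,
+                                             WORD8 *pi1_coeff)
+   {
+       WORD32 i, i4_tmp = 0;
+       for(i = 0; i < 8; i++)
+           i4_tmp += pi1_coeff[i] * pi2_src[(i - 3) * src_strd];
+       return (WORD16)((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14);
+   }
+*/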
+/**
+*******************************************************************************
+*
+* @brief
+*      Chroma interprediction filter for copy
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the location pointed
+*    to by 'pu1_src' to the location pointed to by 'pu1_dst'
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_chroma_copy_ssse3(UWORD8 *pu1_src,
+                                        UWORD8 *pu1_dst,
+                                        WORD32 src_strd,
+                                        WORD32 dst_strd,
+                                        WORD8 *pi1_coeff,
+                                        WORD32 ht,
+                                        WORD32 wd)
+{
+    WORD32 row, col;
+    __m128i  s3, mask_4x32b;
+    UNUSED(pi1_coeff);
+    ASSERT(wd % 2 == 0); /* checking assumption*/
+    ASSERT(ht % 2 == 0); /* checking assumption*/
+
+    mask_4x32b = _mm_set_epi32(0, 0, 0, 0x80808080); /* byte-store mask: the set top bits select the 4 bytes written by maskmoveu */
+
+/*  for loop starts from here */
+    if(wd % 8 == 0)
+    {
+        for(row = 0; row < ht; row += 2)
+        {
+            int offset = 0;
+            for(col = 0; col < 2 * wd; col += 16)
+            {
+/* row =0 */
+
+                /*load 16 pixel values from the current row*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]; */
+                /* storing 16 8-bit output values */
+                _mm_storeu_si128((__m128i *)(pu1_dst + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
+
+/* row =1 */
+                /*load 16 pixel values from the next row*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col]; */
+                /* storing 16 8-bit output values */
+                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd + offset), s3); /* pu1_dst[col] = pu1_src[col]*/
+
+                offset += 16; /*To pointer update */
+            } /*  inner for loop ends here(16-output values in single iteration) */
+
+            pu1_src += 2 * src_strd; /* pointer update */
+            pu1_dst += 2 * dst_strd; /* pointer update */
+        }
+    }
+    else if(wd % 4 == 0)
+    {
+        for(row = 0; row < ht; row += 2)
+        {
+            int offset = 0;
+            for(col = 0; col < 2 * wd; col += 8)
+            {
+/* row =0  */
+                /*load 16 pixel values from the current row*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]; */
+                /* storing 8 8-bit output values */
+                _mm_storel_epi64((__m128i *)(pu1_dst + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
+/* row =1 */
+                /*load 16 pixel values from the next row*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col]; */
+                /* storing 8 8-bit output values */
+                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
+
+                offset += 8; /* To pointer update */
+            } /* inner for loop ends here(8-output values in single iteration) */
+
+            pu1_src += 2 * src_strd;  /* pointer update */
+            pu1_dst += 2 * dst_strd;  /* pointer update */
+        }
+    }
+    else
+    {
+        for(row = 0; row < ht; row += 2)
+        {
+            int offset = 0;
+            for(col = 0; col < 2 * wd; col += 4)
+            {
+/* row =0 */
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
+                /* storing four 8-bit output values */
+                _mm_maskmoveu_si128(s3, mask_4x32b, (char *)(pu1_dst + offset)); /* pu1_dst[col] = pu1_src[col]; */
+/* row =1 */
+                /* pu1_src[col] */
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
+
+                /* storing four 8-bit output values */
+                _mm_maskmoveu_si128(s3, mask_4x32b, (char *)(pu1_dst + dst_strd + offset)); /* pu1_dst[col] = pu1_src[col]; */
+
+                offset += 4; /* To pointer update */
+            } /*  inner for loop ends here(4-output values in single iteration) */
+
+            pu1_src += 2 * src_strd; /* pointer increment */
+            pu1_dst += 2 * dst_strd; /* pointer increment */
+        }
+    }
+}
+
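+/* Note on the copy above: widths that are not a multiple of 4 fall back to
+   _mm_maskmoveu_si128 with 0x80808080 in the low lane.  That instruction
+   writes only the bytes whose mask byte has its most significant bit set,
+   so exactly 4 destination bytes are touched and the neighbouring pixels
+   are left intact without an explicit read-modify-write. */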
+/**
+*******************************************************************************
+*
+* @brief
+*     Chroma interprediction filter for horizontal input
+*
+* @par Description:
+*    Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+*    to the elements pointed to by 'pu1_src' and writes to the location
+*    pointed to by 'pu1_dst'. The output is downshifted by 6 and clipped to 8 bits.
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_chroma_horz_ssse3(UWORD8 *pu1_src,
+                                        UWORD8 *pu1_dst,
+                                        WORD32 src_strd,
+                                        WORD32 dst_strd,
+                                        WORD8 *pi1_coeff,
+                                        WORD32 ht,
+                                        WORD32 wd)
+{
+    WORD32 row, col;
+
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b, control_mask_1_8x16b, control_mask_2_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
+    __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
+    __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
+    __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
+    __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b, res_temp17_8x16b;
+
+    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+    ASSERT(wd % 2 == 0); /* checking assumption*/
+
+/* loading four 8-bit coefficients  */
+    src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+    offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
+    mask_low_32b = _mm_cmpeq_epi16(offset_8x16b, offset_8x16b);
+    mask_high_96b = _mm_srli_si128(mask_low_32b, 12);
+    mask_low_32b = _mm_slli_si128(mask_low_32b, 4);
+
+    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
+    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* pi1_coeff[0], pi1_coeff[1] */
+    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* pi1_coeff[2], pi1_coeff[3] */
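+    /* _mm_shuffle_epi8 with the control words 0x01000100 and 0x03020302
+       broadcasts the (coeff0, coeff1) and (coeff2, coeff3) byte pairs across
+       the whole register, matching the interleaved source layout consumed by
+       _mm_maddubs_epi16 below. */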
+
+/*  outer for loop starts from here */
+    if(wd % 2 == 0 && wd % 4 != 0)
+    {
+
+        for(row = 0; row < ht; row += 2)
+        {
+            int offset = 0;
+
+            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+            for(col = 0; col < 2 * wd; col += 4)
+            {
+
+
+                /*load 16 pixel values from row 0*/
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+                /*load 16 pixel values from row 1*/
+                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+                /*Derive the source pixels for processing the 2nd pixel*/
+                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+                /*Derive the source pixels for processing the 3rd pixel*/
+                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+                /*Derive the source pixels for processing the 4th pixel*/
+                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+                /*Derive the source pixels for processing the 2nd pixel*/
+                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);
+
+                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);
+
+                /*Derive the source pixels for processing the 3rd pixel*/
+                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
+                /*Derive the source pixels for processing the 4th pixel*/
+                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);
+
+                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);
+
+                res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, src_temp15_16x8b);
+                res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, src_temp16_16x8b);
+                res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
+                res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);
+
+                /* i4_tmp += pi1_coeff[i] * pu1_src[col + (i-1) * 2] */
+                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+
+                res_temp14_8x16b = _mm_adds_epi16(res_temp13_8x16b, offset_8x16b);             /* rows 0 and 1 */
+                res_temp15_8x16b = _mm_srai_epi16(res_temp14_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* rows 0 and 1 */
+                res_temp13_8x16b = _mm_packus_epi16(res_temp15_8x16b, res_temp15_8x16b);       /* rows 0 and 1 */
+
+                res_temp3_8x16b = _mm_srli_si128(res_temp13_8x16b, 4); /* move the row 1 pixels to the low 4 bytes */
+
+                res_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
+                res_temp5_8x16b =  _mm_and_si128(res_temp4_8x16b, mask_low_32b);
+                res_temp6_8x16b =  _mm_and_si128(res_temp13_8x16b, mask_high_96b);
+                res_temp7_8x16b = _mm_or_si128(res_temp5_8x16b, res_temp6_8x16b);
+
+                /* store 4 8-bit output values; the other dst bytes are preserved */
+                _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp7_8x16b); /* pu1_dst[col] = i2_tmp_u  */
+
+                res_temp14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
+                res_temp15_8x16b =  _mm_and_si128(res_temp14_8x16b, mask_low_32b);
+                res_temp16_8x16b =  _mm_and_si128(res_temp3_8x16b, mask_high_96b);
+                res_temp17_8x16b = _mm_or_si128(res_temp15_8x16b, res_temp16_8x16b);
+
+                /* store 4 8-bit output values; the other dst bytes are preserved */
+                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp17_8x16b); /* pu1_dst[col] = i2_tmp_u  */
+
+
+                offset += 4; /* To pointer update*/
+
+            } /* inner loop ends here(8- output values in single iteration)*/
+
+            pu1_src += 2 * src_strd; /*pointer update*/
+            pu1_dst += 2 * dst_strd; /*pointer update*/
+        }
+    }
+    else
+    {
+
+        for(row = 0; row < ht; row += 2)
+        {
+            int offset = 0;
+
+            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+            for(col = 0; col < 2 * wd; col += 8)
+            {
+
+                /*load 16 pixel values from row 0*/
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+                /*load 16 pixel values from row 1*/
+                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+                /*Derive the source pixels for processing the 2nd pixel*/
+                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+                /*Derive the source pixels for processing the 3rd pixel*/
+                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+                /*Derive the source pixels for processing the 4th pixel*/
+                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+                res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
+                res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);
+
+                /* i4_tmp += pi1_coeff[i] * pu1_src[col + (i-1) * 2] */
+                res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+
+                res_temp4_8x16b = _mm_adds_epi16(res_temp3_8x16b, offset_8x16b);             /* row = 0 */
+                res_temp5_8x16b = _mm_srai_epi16(res_temp4_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+                res_temp6_8x16b = _mm_packus_epi16(res_temp5_8x16b, res_temp5_8x16b);        /* row = 0 */
+
+                /* store 8 8-bit output values */
+                _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp6_8x16b); /* pu1_dst[col] = i2_tmp_u  */
+
+                /*Derive the source pixels for processing the 2nd pixel of row 1*/
+                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);
+
+                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);
+
+                /*Derive the source pixels for processing the 3rd pixel of row 1*/
+                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
+
+                /*Derive the source pixels for processing the 4th pixel of row 1*/
+                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);
+
+                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);
+
+                res_temp11_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff0_1_8x16b);
+                res_temp12_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff2_3_8x16b);
+
+                /* i4_tmp += pi1_coeff[i] * pu1_src[col + (i-1) * 2] */
+                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+
+                res_temp14_8x16b = _mm_adds_epi16(res_temp13_8x16b, offset_8x16b);             /* row = 1 */
+                res_temp15_8x16b = _mm_srai_epi16(res_temp14_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
+                res_temp16_8x16b = _mm_packus_epi16(res_temp15_8x16b, res_temp15_8x16b);       /* row = 1 */
+
+                /* store 8 8-bit output values */
+                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp16_8x16b); /* pu1_dst[col] = i2_tmp_u  */
+
+
+                offset += 8; /* To pointer update*/
+
+            } /* inner loop ends here(8- output values in single iteration)*/
+
+            pu1_src += 2 * src_strd; /*pointer update*/
+            pu1_dst += 2 * dst_strd; /*pointer update*/
+        }
+    }
+}
+
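+/* For reference, a minimal scalar sketch (illustrative only, not part of the
+   library, helper name hypothetical) of one output pixel of the chroma
+   horizontal filter above: a 4-tap MAC over interleaved chroma samples
+   (stride 2 between taps), a rounding add and downshift, then a clip to
+   8 bits via the CLIP_U8 macro referenced in the comments.
+
+   static UWORD8 chroma_horz_ref(UWORD8 *pu1_src, WORD32 col, WORD8 *pi1_coeff)
+   {
+       WORD32 i, i4_tmp = 0;
+       for(i = 0; i < 4; i++)
+           i4_tmp += pi1_coeff[i] * pu1_src[col + (i - 1) * 2];
+       i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+       return (UWORD8)CLIP_U8(i4_tmp);
+   }
+*/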
+/**
+*******************************************************************************
+*
+* @brief
+*     Chroma interprediction filter for vertical input
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+*    the elements pointed to by 'pu1_src' and writes to the location pointed
+*    to by 'pu1_dst'. The output is downshifted by 6 and clipped to 8 bits.
+*
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_chroma_vert_ssse3(UWORD8 *pu1_src,
+                                        UWORD8 *pu1_dst,
+                                        WORD32 src_strd,
+                                        WORD32 dst_strd,
+                                        WORD8 *pi1_coeff,
+                                        WORD32 ht,
+                                        WORD32 wd)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_copy;
+    UWORD8 *pu1_dst_copy;
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b;
+    __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
+    __m128i control_mask_1_8x16b, control_mask_2_8x16b;
+    __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
+    __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
+    __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
+    __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;
+
+    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+/* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
+    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
+    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* pi1_coeff[0], pi1_coeff[1] */
+    coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* pi1_coeff[2], pi1_coeff[3] */
+
+
+/* setting values in registers */
+    zero_8x16b = _mm_setzero_si128(); /* for saturated clipping */
+    offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* rounding offset */
+    mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
+    mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);
+
+/*  outer for loop starts from here */
+    if(wd % 8 == 0)
+    { /* wd = multiple of 8 case */
+
+        pu1_src_copy = pu1_src;
+        pu1_dst_copy = pu1_dst;
+
+        for(col = 0; col < 2 * wd; col += 16)
+        {
+
+            pu1_src = pu1_src_copy + col;
+            pu1_dst = pu1_dst_copy + col;
+
+
+            for(row = 0; row < ht; row += 2)
+            {
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                /*load 16 pixel values from (cur_row - 1)th row*/
+                s21_8x16b  = _mm_loadu_si128((__m128i *)(pu1_src + (-1 * src_strd)));
+
+                /*load 16 pixel values from the current row*/
+                s22_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (0 * src_strd)));
+
+                /*load 16 pixel values from (cur_row + 1)th row*/
+                s23_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (1 * src_strd)));
+
+                /*load 16 pixel values from (cur_row + 2)th row*/
+                s24_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (2 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+                s31_8x16b = _mm_unpackhi_epi8(s21_8x16b, s22_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+                s33_8x16b = _mm_unpackhi_epi8(s23_8x16b, s24_8x16b);
+
+                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
+
+                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* sum of the two tap pairs */
+
+                s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b);
+
+                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+                s31_8x16b = _mm_add_epi16(s35_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s32_8x16b = _mm_srai_epi16(s31_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+                s33_8x16b =  _mm_packus_epi16(s32_8x16b, zero_8x16b);
+
+                s7_8x16b = _mm_unpacklo_epi64(s7_8x16b, s33_8x16b);
+/* store 16 8-bit output values  */
+                /* pu1_dst[col] = (UWORD8)i2_tmp; */
+                _mm_storeu_si128((__m128i *)(pu1_dst), s7_8x16b);
+
+
+                s25_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (3 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+
+                s31_8x16b = _mm_unpackhi_epi8(s22_8x16b, s23_8x16b);
+
+                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+
+                s33_8x16b = _mm_unpackhi_epi8(s24_8x16b, s25_8x16b);
+
+                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+                s31_8x16b = _mm_add_epi16(s35_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s32_8x16b = _mm_srai_epi16(s31_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+                s33_8x16b =  _mm_packus_epi16(s32_8x16b, zero_8x16b);
+
+                s7_8x16b = _mm_unpacklo_epi64(s7_8x16b, s33_8x16b);
+/* store 16 8-bit output values  */
+                /* pu1_dst[col] = (UWORD8)i2_tmp; */
+                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), s7_8x16b);
+
+                pu1_src += 2 * src_strd;
+                pu1_dst += 2 * dst_strd;
+
+
+            } /* inner for loop ends here (16 output values per row, two rows per iteration) */
+
+        }
+    }
+    else if(wd % 4 == 0)
+    { /* wd = multiple of 4 case */
+
+        for(row = 0; row < ht; row += 2)
+        {
+            pu1_src_copy = pu1_src;
+            pu1_dst_copy = pu1_dst;
+            for(col = 0; col < 2 * wd; col += 8)
+            {
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                /* load 8 pixel values of row -1 (one stride above the current row) */
+                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+                /* load 8 pixel values of row 0 (the current row) */
+                s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                /* load 8 pixel values of row 1 */
+                s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+                /* load 8 pixel values of row 2 */
+                s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+/* store 8 8-bit output values  */
+                /* pu1_dst[col] = (UWORD8)i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pu1_dst), s7_8x16b);
+
+                s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+/* store 8 8-bit output values  */
+                /* pu1_dst[col] = (UWORD8)i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s7_8x16b);
+
+                pu1_src += 8;    /* pointer update */
+                pu1_dst += 8;
+
+            } /* inner for loop ends here (8 output values per row, two rows per iteration) */
+
+            pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
+            pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
+        }
+    }
+
+    else
+    { /* wd = multiple of 2 case */
+
+        for(row = 0; row < ht; row += 2)
+        {
+            pu1_src_copy = pu1_src;
+            pu1_dst_copy = pu1_dst;
+            for(col = 0; col < 2 * wd; col += 4)
+            {
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                /* load 8 pixel values of row -1 (one stride above the current row) */
+                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+                /* load 8 pixel values of row 0 (the current row) */
+                s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                /* load 8 pixel values of row 1 */
+                s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+                /* load 8 pixel values of row 2 */
+                s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+                s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+                s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
+                s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
+                s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+/* store the merged 8 bytes (the low 4 bytes are the new output values)  */
+                /* pu1_dst[col] = (UWORD8)i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+                s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+                s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+                s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
+                s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
+                s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+/* store the merged 8 bytes (the low 4 bytes are the new output values)  */
+                /* pu1_dst[col] = (UWORD8)i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s9_8x16b);
+
+                pu1_src += 4;   /* pointer update */
+                pu1_dst += 4;
+            } /* inner for loop ends here (4 output values per row, two rows per iteration) */
+
+            pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
+            pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
+        }
+    }
+}
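+
+/* For reference, a scalar sketch of what the SIMD path above computes per
+ * output pixel (illustrative only; chroma_vert_ref and its loop structure are
+ * hypothetical, not part of this decoder):
+ *
+ *   static void chroma_vert_ref(UWORD8 *pu1_src, UWORD8 *pu1_dst,
+ *                               WORD32 src_strd, WORD32 dst_strd,
+ *                               WORD8 *pi1_coeff, WORD32 ht, WORD32 wd)
+ *   {
+ *       WORD32 row, col, i, i4_tmp;
+ *       for(row = 0; row < ht; row++)
+ *       {
+ *           for(col = 0; col < 2 * wd; col++)
+ *           {
+ *               i4_tmp = 0;
+ *               for(i = 0; i < 4; i++) /* 4-tap vertical filter */
+ *                   i4_tmp += pi1_coeff[i] * pu1_src[col + (i - 1) * src_strd];
+ *               i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+ *               pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
+ *           }
+ *           pu1_src += src_strd;
+ *           pu1_dst += dst_strd;
+ *       }
+ *   }
+ */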
+
+/**
+*******************************************************************************
+*
+* @brief
+*       chroma interprediction filter for copying 16bit output
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the location pointed
+*    to by 'pu1_src' to the location pointed to by 'pi2_dst'. The output is
+*    upshifted by 6 bits and is used as input for vertical filtering or
+*    weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_chroma_copy_w16out_ssse3(UWORD8 *pu1_src,
+                                               WORD16 *pi2_dst,
+                                               WORD32 src_strd,
+                                               WORD32 dst_strd,
+                                               WORD8 *pi1_coeff,
+                                               WORD32 ht,
+                                               WORD32 wd)
+{
+    WORD32 row, col;
+    __m128i  s3, zero_8x16b;
+
+    ASSERT(wd % 2 == 0); /* checking assumption*/
+    ASSERT(ht % 2 == 0); /* checking assumption*/
+
+    UNUSED(pi1_coeff);
+    zero_8x16b = _mm_setzero_si128();
+/*  outer for loop starts from here */
+    if(wd == 2) /* for wd =2 */
+    {
+        for(row = 0; row < ht; row += 2)
+        {
+            WORD32 offset = 0;
+            for(col = 0; col < 2 * wd; col += 4)
+            {
+/* row =0 */
+                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
+                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+
+                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+                _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
+
+/* row =1 */
+                /*load 16 pixel values of the next row (pu1_src + src_strd)*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
+                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+
+                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
+                offset += 4; /* pointer update */
+            } /* inner for loop ends here */
+
+            pu1_src += 2 * src_strd; /* pointer update */
+            pi2_dst += 2 * dst_strd; /* pointer update */
+        }
+    }
+    else if(wd % 2 == 0 && wd % 4 != 0)
+    {
+        for(row = 0; row < ht / 2; row++)
+        {
+            WORD32 offset = 0;
+            WORD32 count = (2 * wd) / 8;
+            for(col = 0; col < count; col++)
+            {
+/* row =0 */
+                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]*/
+                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+                _mm_storeu_si128((__m128i *)(pi2_dst + offset), s3);
+
+                /*row=1*/       /*load 16 pixel values of the next row (pu1_src + src_strd)*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
+                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+                _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), s3);
+
+                offset += 8; /* pointer update */
+            } /*  inner for loop ends here (8 output values per row, two rows per iteration) */
+
+/* finding last four values */
+            s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
+            s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+            s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+
+            /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+            _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
+
+            /*load 16 pixel values of the next row (pu1_src + src_strd)*/
+            s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
+            s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+            s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+            _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
+
+            pu1_src += 2 * src_strd; /* pointer update */
+            pi2_dst += 2 * dst_strd;
+        }
+    }
+    else
+    {
+        for(row = 0; row < ht / 2; row++)
+        {
+            WORD32 offset = 0;
+            for(col = 0; col < 2 * wd / 8; col++)
+            {
+/* row =0 */
+                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]*/
+                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+                _mm_storeu_si128((__m128i *)(pi2_dst + offset), s3);
+
+                /*row=1*/       /*load 16 pixel values of the next row (pu1_src + src_strd)*/
+                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
+                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+                _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), s3);
+
+                offset += 8; /* pointer update */
+            } /*  inner for loop ends here (8 output values per row, two rows per iteration) */
+
+            pu1_src += 2 * src_strd; /* pointer update */
+            pi2_dst += 2 * dst_strd;
+        }
+    }
+}
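+
+/* Scalar view of the copy-with-upshift above (an illustrative sketch, not
+ * part of the decoder): every 8-bit chroma sample is widened to 16 bits and
+ * left-shifted by SHIFT_14_MINUS_BIT_DEPTH (6 for 8-bit content):
+ *
+ *   for(row = 0; row < ht; row++)
+ *       for(col = 0; col < 2 * wd; col++)
+ *           pi2_dst[row * dst_strd + col] =
+ *               (WORD16)(pu1_src[row * src_strd + col] << SHIFT_14_MINUS_BIT_DEPTH);
+ */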
+
+/**
+*******************************************************************************
+*
+* @brief
+*       chroma interprediction filter to store horizontal 16bit output
+*
+* @par Description:
+*    Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+*    to the elements pointed to by 'pu1_src' and writes to the location
+*    pointed to by 'pi2_dst'. No downshifting or clipping is done and the
+*    output is used as an input for vertical filtering or weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_chroma_horz_w16out_ssse3(UWORD8 *pu1_src,
+                                               WORD16 *pi2_dst,
+                                               WORD32 src_strd,
+                                               WORD32 dst_strd,
+                                               WORD8 *pi1_coeff,
+                                               WORD32 ht,
+                                               WORD32 wd)
+{
+    WORD32 row, col;
+
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b, control_mask_1_8x16b, control_mask_2_8x16b, all_zero;
+    __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
+    __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
+    __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
+    __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b;
+
+    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+    ASSERT(wd % 2 == 0); /* checking assumption*/
+
+/* load the four 8-bit filter taps (only the low 4 bytes of the 8-byte load are used) */
+    src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+    all_zero = _mm_setzero_si128();
+
+    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
+    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* (coeff[0], coeff[1]) pair in every 16-bit lane */
+    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* (coeff[2], coeff[3]) pair in every 16-bit lane */
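+    /* With this layout each _mm_maddubs_epi16 below multiplies the unsigned
+     * source bytes by the signed taps and adds adjacent products, yielding
+     * one 16-bit partial sum per output pixel. */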
+
+/*  outer for loop starts from here */
+    if(wd % 2 == 0 && wd % 4 != 0)
+    {
+        WORD32 offset = 0;
+        for(row = ht; row >= 2; row -= 2)
+        {
+            offset = 0;
+            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+            for(col = 0; col < 2 * wd; col += 4)
+            {
+
+                /*load 16 pixel values of row 0*/
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+                /*load 16 pixel values of row 1*/
+                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+                /*Derive the source pixels for processing the 2nd pixel of row 0*/
+                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+                /*Derive the source pixels for processing the 3rd pixel of row 0*/
+                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+                /*Derive the source pixels for processing the 4th pixel of row 0*/
+                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+                /*Derive the source pixels for processing the 2nd pixel of row 1*/
+                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);
+
+                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);
+
+                /*Derive the source pixels for processing the 3rd pixel of row 1*/
+                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
+
+                /*Derive the source pixels for processing the 4th pixel of row 1*/
+                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);
+
+                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);
+
+                res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, src_temp15_16x8b);
+                res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, src_temp16_16x8b);
+                res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
+                res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);
+
+                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
+                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+
+                res_temp3_8x16b = _mm_srli_si128(res_temp13_8x16b, 8);
+
+                /* store 4 16-bit values */
+                _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u  */
+
+
+
+                /* store 4 16-bit values */
+                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u  */
+
+
+                offset += 4; /* pointer update */
+
+            } /* inner loop ends here (4 output values per row, two rows per iteration) */
+
+            pu1_src += 2 * src_strd; /*pointer update*/
+            pi2_dst += 2 * dst_strd; /*pointer update*/
+        }
+
+        /* Epilogue to handle odd ht */
+        if(row)
+        {
+            offset = 0;
+            for(col = 0; col < 2 * wd; col += 4)
+            {
+
+                /*load 16 pixel values of row 0*/
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+                /*Derive the source pixels for processing the 2nd pixel of row 0*/
+                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+                /*Derive the source pixels for processing the 3rd pixel of row 0*/
+                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+                /*Derive the source pixels for processing the 4th pixel of row 0*/
+                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+                res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, all_zero);
+                res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, all_zero);
+                res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
+                res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);
+
+                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
+                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+
+
+                /* store 4 16-bit values */
+                _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u  */
+
+                offset += 4; /* pointer update */
+
+            }
+        }
+
+    }
+    else
+    {
+        WORD32 offset = 0;
+
+        for(row = ht; row >= 2; row -= 2)
+        {
+            offset = 0;
+            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+            for(col = 0; col < 2 * wd; col += 8)
+            {
+
+                /*load 16 pixel values of row 0*/
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+                /*load 16 pixel values of row 1*/
+                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+                /*Derive the source pixels for processing the 2nd pixel of row 0*/
+                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+                /*Derive the source pixels for processing the 3rd pixel of row 0*/
+                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+                /*Derive the source pixels for processing the 4th pixel of row 0*/
+                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+                res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
+                res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);
+
+                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
+                res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+
+                /* store 8 16-bit values */
+                _mm_storeu_si128((__m128i *)(pi2_dst + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u  */
+
+                /*Derive the source pixels for processing the 2nd pixel of row 1*/
+                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);
+
+                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);
+
+                /*Derive the source pixels for processing the 3rd pixel of row 1*/
+                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
+
+                /*Derive the source pixels for processing the 4th pixel of row 1*/
+                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);
+
+                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);
+
+                res_temp11_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff0_1_8x16b);
+                res_temp12_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff2_3_8x16b);
+
+                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
+                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+
+                /* store 8 16-bit values */
+                _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u  */
+
+
+                offset += 8; /* pointer update */
+
+            } /* inner loop ends here (8 output values per row, two rows per iteration) */
+
+            pu1_src += 2 * src_strd; /*pointer update*/
+            pi2_dst += 2 * dst_strd; /*pointer update*/
+        }
+
+        /*Epilogue to take care of odd ht*/
+        if(row)
+        {
+            offset = 0;
+            for(col = 0; col < 2 * wd; col += 8)
+            {
+
+                /*load 16 pixel values of row 0*/
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+                /*Derive the source pixels for processing the 2nd pixel of row 0*/
+                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+                /*Derive the source pixels for processing the 3rd pixel of row 0*/
+                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+                /*Derive the source pixels for processing the 4th pixel of row 0*/
+                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+                res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
+                res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);
+
+                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
+                res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+
+                /* store 8 16-bit values */
+                _mm_storeu_si128((__m128i *)(pi2_dst + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u  */
+
+                offset += 8; /* pointer update */
+
+            }
+        }
+
+    }
+}
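+
+/* Scalar view of the horizontal pass above (an illustrative sketch, not part
+ * of the decoder): chroma samples are interleaved Cb/Cr, so the 4-tap filter
+ * steps by 2 bytes between taps of the same component, and the raw 16-bit
+ * filter sum is stored without any shift or clip:
+ *
+ *   for(col = 0; col < 2 * wd; col++)
+ *   {
+ *       WORD32 i4_tmp = 0;
+ *       for(i = 0; i < 4; i++)
+ *           i4_tmp += pi1_coeff[i] * pu1_src[col + (i - 1) * 2];
+ *       pi2_dst[col] = (WORD16)i4_tmp;
+ *   }
+ */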
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Interprediction chroma filter to store vertical 16bit output
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+*    the elements pointed to by 'pu1_src' and writes to the location pointed
+*    to by 'pi2_dst'. No downshifting or clipping is done and the output is
+*    used as an input for weighted prediction
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_chroma_vert_w16out_ssse3(UWORD8 *pu1_src,
+                                               WORD16 *pi2_dst,
+                                               WORD32 src_strd,
+                                               WORD32 dst_strd,
+                                               WORD8 *pi1_coeff,
+                                               WORD32 ht,
+                                               WORD32 wd)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_copy;
+    WORD16 *pi2_dst_copy;
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b;
+    __m128i s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b;
+    __m128i control_mask_1_8x16b, control_mask_2_8x16b;
+    __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
+    __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
+    __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;
+
+
+    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+/* load the four 8-bit filter taps (only the low 4 bytes of the 8-byte load are used) */
+    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
+    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* (coeff[0], coeff[1]) pair in every 16-bit lane */
+    coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* (coeff[2], coeff[3]) pair in every 16-bit lane */
+
+
+
+/*  outer for loop starts from here */
+    if(wd % 8 == 0)
+    { /* wd = multiple of 8 case */
+
+        pu1_src_copy = pu1_src;
+        pi2_dst_copy = pi2_dst;
+
+        for(col = 0; col < 2 * wd; col += 16)
+        {
+
+            pu1_src = pu1_src_copy + col;
+            pi2_dst = pi2_dst_copy + col;
+
+
+            for(row = 0; row < ht; row += 2)
+            {
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                /*load 16 pixel values */
+                s21_8x16b  = _mm_loadu_si128((__m128i *)(pu1_src + (-1 * src_strd)));
+
+                /*load 16 pixel values */
+                s22_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (0 * src_strd)));
+
+
+                /*load 16 pixel values */
+                s23_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (1 * src_strd)));
+
+                /*load 16 pixel values */
+                s24_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (2 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+                s31_8x16b = _mm_unpackhi_epi8(s21_8x16b, s22_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+                s33_8x16b = _mm_unpackhi_epi8(s23_8x16b, s24_8x16b);
+
+                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
+
+                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp: sum of the two partial products */
+
+                s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b);
+
+/* store 8 16-bit output values  */
+                /* pi2_dst[col] = i2_tmp; */
+                _mm_storeu_si128((__m128i *)(pi2_dst), s8_8x16b);
+
+                _mm_storeu_si128((__m128i *)(pi2_dst + 8), s35_8x16b);
+
+
+                s25_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (3 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+
+                s31_8x16b = _mm_unpackhi_epi8(s22_8x16b, s23_8x16b);
+
+                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+
+                s33_8x16b = _mm_unpackhi_epi8(s24_8x16b, s25_8x16b);
+
+                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp: sum of the two partial products */
+
+                s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b);
+
+/* store 8 16-bit output values  */
+                /* pi2_dst[col] = i2_tmp; */
+                _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd), s8_8x16b);
+
+                _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + 8), s35_8x16b);
+
+
+                pu1_src += 2 * src_strd;
+                pi2_dst += 2 * dst_strd;
+
+
+            } /* inner for loop ends here (16 output values per row, two rows per iteration) */
+
+        }
+    }
+
+    else if(wd % 4 == 0)
+    { /* wd = multiple of 4 case */
+
+        for(row = 0; row < ht; row += 2)
+        {
+
+            pu1_src_copy = pu1_src;
+            pi2_dst_copy = pi2_dst;
+
+            for(col = 0; col < 2 * wd; col += 8)
+            {
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                /*load 8 pixel values */
+                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+                /*load 8 pixel values */
+                s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                /*load 8 pixel values */
+                s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+                /*load 8 pixel values */
+                s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp: sum of the two partial products */
+
+                _mm_storeu_si128((__m128i *)(pi2_dst), s8_8x16b);
+
+                s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp: sum of the two partial products */
+
+                _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd), s8_8x16b);
+
+                pu1_src += 8;    /* pointer update */
+                pi2_dst += 8;
+
+            } /* inner for loop ends here (8 output values per row, two rows per iteration) */
+
+            pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
+            pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
+        }
+    }
+
+    else
+    { /* wd = multiple of 2 case */
+
+        for(row = 0; row < ht; row += 2)
+        {
+            pu1_src_copy = pu1_src;
+            pi2_dst_copy = pi2_dst;
+            for(col = 0; col < 2 * wd; col += 4)
+            {
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+                /*load 8 pixel values */
+                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+                /*load 8 pixel values */
+                s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                /*load 8 pixel values */
+                s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+                /*load 8 pixel values */
+                s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp: sum of the two partial products */
+
+
+/* store 4 16-bit output values  */
+                /* pi2_dst[col] = i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);
+
+                s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp: sum of the two partial products */
+
+
+/* store 4 16-bit output values  */
+                /* pi2_dst[col] = i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s8_8x16b);
+
+                pu1_src += 4;   /* pointer update */
+                pi2_dst += 4;
+            } /* inner for loop ends here (4 output values per row, two rows per iteration) */
+
+            pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
+            pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
+        }
+    }
+}
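+
+/* Scalar view of the vertical 16-bit-output pass above (an illustrative
+ * sketch, not part of the decoder): the same 4-tap vertical filter as the
+ * 8-bit path, but the raw 16-bit sum is stored without rounding or clipping:
+ *
+ *   for(col = 0; col < 2 * wd; col++)
+ *   {
+ *       WORD32 i4_tmp = 0;
+ *       for(i = 0; i < 4; i++)
+ *           i4_tmp += pi1_coeff[i] * pu1_src[col + (i - 1) * src_strd];
+ *       pi2_dst[col] = (WORD16)i4_tmp;
+ *   }
+ */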
+
+/**
+*******************************************************************************
+*
+* @brief
+*     chroma interprediction filter for vertical 16bit input
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+*    the elements pointed to by 'pi2_src' and writes to the location pointed
+*    to by 'pu1_dst'. The input is 16 bits. The filter output is downshifted
+*    by 12 and clipped to lie between 0 and 255
+*
+* @param[in] pi2_src
+*  WORD16 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_chroma_vert_w16inp_ssse3(WORD16 *pi2_src,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 src_strd,
+                                               WORD32 dst_strd,
+                                               WORD8 *pi1_coeff,
+                                               WORD32 ht,
+                                               WORD32 wd)
+{
+    WORD32 row, col;
+    WORD16 *pi2_src_copy;
+    UWORD8 *pu1_dst_copy;
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b;
+    __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
+    __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
+    __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b, sign_reg;
+    __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
+    __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;
+
+
+/* load the four 8-bit filter taps and sign-extend them to 16 bits */
+    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+    zero_8x16b = _mm_setzero_si128();
+    sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
+    s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg);
+
+    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* (coeff[0], coeff[1]) 16-bit pair in every 32-bit lane */
+    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* (coeff[2], coeff[3]) 16-bit pair in every 32-bit lane */
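+    /* This pairing matches _mm_madd_epi16 below, which multiplies the 16-bit
+     * source samples by the 16-bit taps and adds adjacent products into one
+     * 32-bit partial sum per output pixel. */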
+
+/*  setting values in registers */
+    offset_8x16b = _mm_set1_epi32(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
+    mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
+    mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);
+
+/*  outer for loop starts from here */
+    if(wd % 4 == 0)
+    { /* wd = multiple of 4 case */
+
+        pi2_src_copy = pi2_src;
+        pu1_dst_copy = pu1_dst;
+
+        for(col = 0; col < 2 * wd; col += 8)
+        {
+
+            pi2_src = pi2_src_copy + col;
+            pu1_dst = pu1_dst_copy + col;
+
+
+            for(row = 0; row < ht; row += 2)
+            {
+
+                /* load 8 16-bit source values of row -1 */
+                s21_8x16b  = _mm_load_si128((__m128i *)(pi2_src + (-1 * src_strd)));
+
+                /* load 8 16-bit source values of row 0 (the current row) */
+                s22_8x16b = _mm_load_si128((__m128i *)(pi2_src + (0 * src_strd)));
+
+                /* load 8 16-bit source values of row 1 */
+                s23_8x16b = _mm_load_si128((__m128i *)(pi2_src + (1 * src_strd)));
+
+                /* load 8 16-bit source values of row 2 */
+                s24_8x16b = _mm_load_si128((__m128i *)(pi2_src + (2 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);
+
+                s31_8x16b = _mm_unpackhi_epi16(s21_8x16b, s22_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);
+
+                s33_8x16b = _mm_unpackhi_epi16(s23_8x16b, s24_8x16b);
+
+                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);
+
+                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+
+                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);
+
+                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+                s33_8x16b = _mm_add_epi32(s32_8x16b, offset_8x16b);
+
+                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s34_8x16b = _mm_srai_epi32(s33_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s35_8x16b = _mm_packs_epi32(s34_8x16b, zero_8x16b);
+
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);
+
+                s33_8x16b =  _mm_packus_epi16(s35_8x16b, zero_8x16b);
+
+                s7_8x16b = _mm_unpacklo_epi32(s7_8x16b, s33_8x16b);
+/* store 8 8-bit output values  */
+                /* pu1_dst[col] = (UWORD8)i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pu1_dst), s7_8x16b);
+
+
+                s25_8x16b = _mm_load_si128((__m128i *)(pi2_src + (3 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
+
+                s31_8x16b = _mm_unpackhi_epi16(s22_8x16b, s23_8x16b);
+
+                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
+
+                s33_8x16b = _mm_unpackhi_epi16(s24_8x16b, s25_8x16b);
+
+                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+
+                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);
+
+                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+                s33_8x16b = _mm_add_epi32(s32_8x16b, offset_8x16b);
+
+                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s34_8x16b = _mm_srai_epi32(s33_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s35_8x16b = _mm_packs_epi32(s34_8x16b, zero_8x16b);
+
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);
+
+                s33_8x16b =  _mm_packus_epi16(s35_8x16b, zero_8x16b);
+
+                s7_8x16b = _mm_unpacklo_epi32(s7_8x16b, s33_8x16b);
+/* store 8 8-bit output values  */
+                /* pu1_dst[col] = (UWORD8)i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s7_8x16b);
+
+                pi2_src += 2 * src_strd;
+                pu1_dst += 2 * dst_strd;
+
+
+            } /* inner for loop ends here (8 output values per row, two rows per iteration) */
+
+        }
+    }
+    else
+    { /* wd = multiple of 2 case */
+
+        for(row = 0; row < ht; row += 2)
+        {
+            pi2_src_copy = pi2_src;
+            pu1_dst_copy = pu1_dst;
+            for(col = 0; col < 2 * wd; col += 4)
+            {
+
+                /* load 4 16-bit source values of row -1 */
+                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));
+
+                /* load 4 16-bit source values of row 0 (the current row) */
+                s22_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);
+
+                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                /* load 4 16-bit source values of row 1 */
+                s23_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));
+
+                /* load 4 16-bit source values of row 2 */
+                s24_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);
+
+                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+
+                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);
+
+                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);
+
+                s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+                s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
+                s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
+                s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+/* store the merged 8 bytes (the low 4 bytes are the new output values)  */
+                /* pu1_dst[col] = (UWORD8)i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+                s25_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
+                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
+                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);
+
+                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+                /* i2_tmp = CLIP_U8(i2_tmp);*/
+                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);
+
+                s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+                s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
+                s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
+                s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+/* store the merged 8 bytes (the low 4 bytes are the new output values)  */
+                /* pu1_dst[col] = (UWORD8)i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s9_8x16b);
+
+                pi2_src += 4;   /* pointer update */
+                pu1_dst += 4;
+            } /* inner for loop ends here (4 output values per row, two rows per iteration) */
+
+            pi2_src = pi2_src_copy + 2 * src_strd; /* pointer update */
+            pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
+        }
+    }
+
+}
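+
+/* Scalar view of the 16-bit-input vertical pass above (an illustrative
+ * sketch, not part of the decoder), matching the two-stage shift in the SIMD
+ * code:
+ *
+ *   for(col = 0; col < 2 * wd; col++)
+ *   {
+ *       WORD32 i4_tmp = 0;
+ *       for(i = 0; i < 4; i++)
+ *           i4_tmp += pi1_coeff[i] * pi2_src[col + (i - 1) * src_strd];
+ *       i4_tmp = i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH;
+ *       i4_tmp = (i4_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+ *       pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
+ *   }
+ */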
+
+/**
+*******************************************************************************
+*
+* @brief
+*
+*      Chroma interprediction filter for 16bit vertical input and output.
+*
+* @par Description:
+*       Applies a vertical filter with coefficients pointed to by 'pi1_coeff'
+*       to the elements pointed to by 'pi2_src' and writes to the location
+*       pointed to by 'pi2_dst'. The input is 16 bits. The filter output is
+*       downshifted by 6 and 8192 is subtracted to store it as a 16-bit number.
+*       The output is used as an input to weighted prediction
+*
+* @param[in] pi2_src
+*  WORD16 pointer to the source
+*
+* @param[out] pi2_dst
+*  WORD16 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] pi1_coeff
+*  WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] wd
+*  integer width of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3(WORD16 *pi2_src,
+                                                      WORD16 *pi2_dst,
+                                                      WORD32 src_strd,
+                                                      WORD32 dst_strd,
+                                                      WORD8 *pi1_coeff,
+                                                      WORD32 ht,
+                                                      WORD32 wd)
+{
+    WORD32 row, col;
+    WORD16 *pi2_src_copy;
+    WORD16 *pi2_dst_copy;
+    __m128i coeff0_1_8x16b, coeff2_3_8x16b;
+    __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
+    __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
+    __m128i zero_8x16b, sign_reg;
+    __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
+    __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;
+
+
+/* load the four 8-bit filter taps and sign-extend them to 16 bits */
+    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+    zero_8x16b = _mm_setzero_si128();
+    sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
+    s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg);
+
+    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* (coeff[0], coeff[1]) 16-bit pair in every 32-bit lane */
+    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* (coeff[2], coeff[3]) 16-bit pair in every 32-bit lane */
+
+
+/*  outer for loop starts from here */
+    if(wd % 4 == 0)
+    { /* wd = multiple of 4 case */
+
+        pi2_src_copy = pi2_src;
+        pi2_dst_copy = pi2_dst;
+
+        for(col = 0; col < 2 * wd; col += 8)
+        {
+
+            pi2_src = pi2_src_copy + col;
+            pi2_dst = pi2_dst_copy + col;
+
+
+            for(row = 0; row < ht; row += 2)
+            {
+
+                /* load 8 16-bit source values of row -1 */
+                s21_8x16b  = _mm_load_si128((__m128i *)(pi2_src + (-1 * src_strd)));
+
+                /* load 8 16-bit source values of row 0 (the current row) */
+                s22_8x16b = _mm_load_si128((__m128i *)(pi2_src + (0 * src_strd)));
+
+                /* load 8 16-bit source values of row 1 */
+                s23_8x16b = _mm_load_si128((__m128i *)(pi2_src + (1 * src_strd)));
+
+                /* load 8 16-bit source values of row 2 */
+                s24_8x16b = _mm_load_si128((__m128i *)(pi2_src + (2 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);
+
+                s31_8x16b = _mm_unpackhi_epi16(s21_8x16b, s22_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);
+
+                s33_8x16b = _mm_unpackhi_epi16(s23_8x16b, s24_8x16b);
+
+                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);
+
+                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b);
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);
+
+                s35_8x16b = _mm_packs_epi32(s32_8x16b, zero_8x16b);
+
+                s7_8x16b = _mm_unpacklo_epi64(s9_8x16b, s35_8x16b);
+/* store 8 16-bit output values  */
+                /* pi2_dst[col] = (WORD16)i2_tmp; */
+                _mm_store_si128((__m128i *)(pi2_dst), s7_8x16b);
+
+
+                s25_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + (3 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
+
+                s31_8x16b = _mm_unpackhi_epi16(s22_8x16b, s23_8x16b);
+
+                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
+
+                s33_8x16b = _mm_unpackhi_epi16(s24_8x16b, s25_8x16b);
+
+                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);
+
+                s35_8x16b = _mm_packs_epi32(s32_8x16b, zero_8x16b);
+
+                s7_8x16b = _mm_unpacklo_epi64(s9_8x16b, s35_8x16b);
+/* store 8 16-bit output values  */
+                /* pi2_dst[col] = (WORD16)i2_tmp; */
+                _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s7_8x16b);
+
+                pi2_src += 2 * src_strd;
+                pi2_dst += 2 * dst_strd;
+
+
+            } /* inner row loop ends here (two rows of 8 output values per iteration) */
+
+        }
+    }
+    else
+    { /* 2 * wd is a multiple of 4: produce 4 output values per row per iteration */
+
+        for(row = 0; row < ht; row += 2)
+        {
+            pi2_src_copy = pi2_src;
+            pi2_dst_copy = pi2_dst;
+            for(col = 0; col < 2 * wd; col += 4)
+            {
+
+                /*load 4 pixel values */
+                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));
+
+                /*load 4 pixel values */
+                s22_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);
+
+                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                /*load 4 pixel values */
+                s23_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));
+
+                /*load 4 pixel values */
+                s24_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);
+
+                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);
+
+/* store 4 16-bit output values  */
+                /* pi2_dst[col] = (WORD16)i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pi2_dst), s9_8x16b);
+
+                s25_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
+                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
+                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
+
+                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);
+
+/* store 4 16-bit output values  */
+                /* pi2_dst[col] = (WORD16)i2_tmp; */
+                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s9_8x16b);
+
+                pi2_src += 4;   /* pointer update */
+                pi2_dst += 4;
+            } /* inner col loop ends here (8 output values per iteration) */
+
+            pi2_src = pi2_src_copy + 2 * src_strd; /* pointer update */
+            pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
+        }
+    }
+
+}
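+
+/* For reference, a minimal scalar sketch of what the SSSE3 path above
+   computes: a 4-tap vertical filter on 16-bit input, downshifted and then
+   saturated to 16 bits as _mm_packs_epi32 does. The _ref_c name is
+   illustrative only and is not part of the library API. */
+static void ihevc_inter_pred_chroma_vert_w16inp_w16out_ref_c(WORD16 *pi2_src,
+                                                             WORD16 *pi2_dst,
+                                                             WORD32 src_strd,
+                                                             WORD32 dst_strd,
+                                                             WORD8 *pi1_coeff,
+                                                             WORD32 ht,
+                                                             WORD32 wd)
+{
+    WORD32 row, col, tap;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col++)
+        {
+            WORD32 i4_tmp = 0;
+            /* the 4 taps cover rows -1, 0, 1 and 2 relative to the output row */
+            for(tap = 0; tap < 4; tap++)
+                i4_tmp += pi1_coeff[tap] * pi2_src[(tap - 1) * src_strd + col];
+            i4_tmp >>= SHIFT_14_MINUS_BIT_DEPTH;
+            /* saturate to the WORD16 range, matching _mm_packs_epi32 */
+            if(i4_tmp > 32767)  i4_tmp = 32767;
+            if(i4_tmp < -32768) i4_tmp = -32768;
+            pi2_dst[col] = (WORD16)i4_tmp;
+        }
+        pi2_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+}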
diff --git a/common/x86/ihevc_intra_pred_filters_sse42_intr.c b/common/x86/ihevc_intra_pred_filters_sse42_intr.c
new file mode 100644
index 0000000..6488de6
--- /dev/null
+++ b/common/x86/ihevc_intra_pred_filters_sse42_intr.c
@@ -0,0 +1,4201 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_intra_pred_filters_sse42_intr.c
+*
+* @brief
+*  Contains function Definition for intra prediction  interpolation filters
+*
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+*  - ihevc_intra_pred_ref_filtering_sse42()
+*  - ihevc_intra_pred_luma_dc_sse42()
+*  - ihevc_intra_pred_luma_horz_sse42()
+*  - ihevc_intra_pred_luma_ver_sse42()
+*  - ihevc_intra_pred_luma_mode_3_to_9_sse42()
+*  - ihevc_intra_pred_luma_mode_11_to_17_sse42()
+*  - ihevc_intra_pred_luma_mode_19_to_25_sse42()
+*  - ihevc_intra_pred_luma_mode_27_to_33_sse42()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdlib.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_defs.h"
+#include "ihevc_tables_x86_intr.h"
+
+#include <immintrin.h>
+
+/****************************************************************************/
+/* Constant Macros                                                          */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+
+
+/****************************************************************************/
+/* Function Macros                                                          */
+/****************************************************************************/
+#define GET_BITS(y,x) (((y) >> (x)) & 1)
+
+/* tables to shuffle 8-bit values */
+
+/*****************************************************************************/
+/* global tables Definition                                                  */
+/*****************************************************************************/
+
+
+
+/*****************************************************************************/
+/* Function Definition                                                      */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for ref_filtering
+*
+*
+* @par Description:
+*    Reference DC filtering for neighboring samples dependent on TU size and
+*    mode. Refer to section 8.4.4.2.3 in the standard.
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_ref_filtering_sse42(UWORD8 *pu1_src,
+                                          WORD32 nt,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 mode,
+                                          WORD32 strong_intra_smoothing_enable_flag)
+{
+    WORD32 filter_flag;
+    WORD32 i; /* Generic indexing variable */
+    WORD32 four_nt = 4 * nt;
+    UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1];
+    WORD32 bi_linear_int_flag = 0;
+    WORD32 abs_cond_left_flag = 0;
+    WORD32 abs_cond_top_flag = 0;
+    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
+    __m128i src_temp1, src_temp2, src_temp3, src_temp7;
+    __m128i src_temp4, src_temp5, src_temp6, src_temp8;
+
+
+    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
+    if(0 == filter_flag)
+    {
+        if(pu1_src == pu1_dst)
+        {
+            return;
+        }
+        else
+        {
+            if(nt == 4)
+            {
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+                pu1_dst[four_nt] = pu1_src[four_nt];
+
+            }
+
+            else if(nt == 8)
+            {
+
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+
+
+                pu1_dst[four_nt] = pu1_src[four_nt];
+            }
+            else if(nt == 16)
+            {
+
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
+                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+                pu1_dst[four_nt] = pu1_src[four_nt];
+            }
+            else if(nt == 32)
+            {
+
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
+                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));
+
+                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_src + 64));
+                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_src + 80));
+                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_src + 96));
+                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_src + 112));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);
+
+                pu1_dst[four_nt] = pu1_src[four_nt];
+            }
+
+        }
+    }
+
+    else
+    {
+        /* If strong intra smoothing is enabled and transform size is 32 */
+        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
+        {
+            /* Strong Intra Filtering */
+            abs_cond_top_flag = (abs(pu1_src[2 * nt] + pu1_src[4 * nt]
+                                     - (2 * pu1_src[3 * nt]))) < dc_val;
+            abs_cond_left_flag = (abs(pu1_src[2 * nt] + pu1_src[0]
+                                      - (2 * pu1_src[nt]))) < dc_val;
+
+            bi_linear_int_flag = ((1 == abs_cond_left_flag)
+                            && (1 == abs_cond_top_flag));
+        }
+        /* Extremities untouched */
+        au1_flt[0] = pu1_src[0];
+        au1_flt[4 * nt] = pu1_src[4 * nt];
+
+        /* Strong filtering of reference samples */
+        if(1 == bi_linear_int_flag)
+        {
+            au1_flt[2 * nt] = pu1_src[2 * nt];
+
+            for(i = 1; i < (2 * nt); i++)
+                au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6;
+
+            for(i = 1; i < (2 * nt); i++)
+                au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
+        }
+        else
+        {
+            __m128i const_value_8x16;
+
+            const_value_8x16 = _mm_set1_epi16(2);
+
+            au1_flt[0] = pu1_src[0];
+            au1_flt[4 * nt] = pu1_src[4 * nt];
+
+            /* [1 2 1] smoothing of the reference samples */
+            for(i = 0; i < (four_nt); i += 16)
+            {
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src + i));
+                src_temp2 = _mm_srli_si128(src_temp1, 1);
+                src_temp3 = _mm_srli_si128(src_temp2, 1);
+
+                src_temp1 =  _mm_cvtepu8_epi16(src_temp1);
+                src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
+                src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
+
+                src_temp2 = _mm_slli_epi16(src_temp2,  1);
+
+                src_temp1 = _mm_add_epi16(src_temp1, src_temp2);
+                src_temp1 = _mm_add_epi16(src_temp1, src_temp3);
+                src_temp1 = _mm_add_epi16(src_temp1, const_value_8x16);
+
+                src_temp1 = _mm_srai_epi16(src_temp1,  2);
+
+                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 8 + i));
+                src_temp5 = _mm_srli_si128(src_temp4, 1);
+                src_temp6 = _mm_srli_si128(src_temp5, 1);
+
+                src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
+                src_temp5 =  _mm_cvtepu8_epi16(src_temp5);
+                src_temp6 =  _mm_cvtepu8_epi16(src_temp6);
+
+                src_temp5 = _mm_slli_epi16(src_temp5,  1);
+
+                src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
+                src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+                src_temp4 = _mm_add_epi16(src_temp4, const_value_8x16);
+
+                src_temp4 = _mm_srai_epi16(src_temp4,  2);
+
+                /* converting 16 bit to 8 bit */
+                src_temp1 = _mm_packus_epi16(src_temp1, src_temp4);
+
+                _mm_storeu_si128((__m128i *)(au1_flt + 1 + i), src_temp1);
+            }
+            au1_flt[4 * nt] = pu1_src[4 * nt];
+        }
+
+        if(nt == 4)
+        {
+            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+            pu1_dst[four_nt] = au1_flt[four_nt];
+        }
+        else if(nt == 8)
+        {
+
+            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+
+            pu1_dst[four_nt] = au1_flt[four_nt];
+        }
+        else if(nt == 16)
+        {
+
+            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
+            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));
+
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+            pu1_dst[four_nt] = au1_flt[four_nt];
+        }
+
+        else if(nt == 32)
+        {
+
+            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
+            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));
+
+            src_temp5 = _mm_loadu_si128((__m128i *)(au1_flt + 64));
+            src_temp6 = _mm_loadu_si128((__m128i *)(au1_flt + 80));
+            src_temp7 = _mm_loadu_si128((__m128i *)(au1_flt + 96));
+            src_temp8 = _mm_loadu_si128((__m128i *)(au1_flt + 112));
+
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);
+
+            pu1_dst[four_nt] = au1_flt[four_nt];
+        }
+
+    }
+}
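+
+/* A minimal scalar sketch of the [1 2 1] reference-sample smoothing that the
+   vector loop above performs; the _ref_c helper name is illustrative only
+   and is not part of the library API. */
+static void ihevc_intra_pred_ref_filter_121_ref_c(UWORD8 *pu1_src,
+                                                  UWORD8 *au1_flt,
+                                                  WORD32 nt)
+{
+    WORD32 i;
+    /* extremities are copied untouched */
+    au1_flt[0] = pu1_src[0];
+    au1_flt[4 * nt] = pu1_src[4 * nt];
+
+    /* [1 2 1] / 4 smoothing with rounding for the interior samples */
+    for(i = 1; i < 4 * nt; i++)
+        au1_flt[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2;
+}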
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma dc
+*
+* @par Description:
+*   Intra prediction for DC mode with reference neighboring samples location
+*   pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
+*   Refer to section 8.4.4.2.5 in the standard.
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_dc_sse42(UWORD8 *pu1_ref,
+                                    WORD32 src_strd,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 dst_strd,
+                                    WORD32 nt,
+                                    WORD32 mode)
+{
+
+    WORD32 acc_dc;
+    WORD32 dc_val, two_dc_val, three_dc_val;
+    WORD32 row;
+    WORD32 log2nt = 5;
+    WORD32 two_nt, three_nt;
+    __m128i src_temp1, src_temp7, src_temp3, src_temp4, src_temp5, src_temp6;
+    __m128i src_temp8, src_temp9, src_temp10, src_temp2;
+    __m128i m_zero = _mm_set1_epi32(0);
+    __m128i sm = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK5[0]);
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+
+    switch(nt)
+    {
+        case 32:
+            log2nt = 5;
+            break;
+        case 16:
+            log2nt = 4;
+            break;
+        case 8:
+            log2nt = 3;
+            break;
+        case 4:
+            log2nt = 2;
+            break;
+        default:
+            break;
+    }
+    two_nt = 2 * nt;
+    three_nt = 3 * nt;
+
+    acc_dc = 0;
+    /* Calculate DC value for the transform block */
+
+
+
+    if(nt == 32)
+    {
+        __m128i temp;
+        WORD32 itr_count;
+
+        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
+        src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
+        src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));
+
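+        /* _mm_sad_epu8 against zero sums each group of 8 bytes into a 16-bit
+           lane, giving partial sums of the 2 * nt neighbour samples */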
+        src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
+        src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
+        src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
+        src_temp8 = _mm_sad_epu8(src_temp8, m_zero);
+
+        src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
+        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
+
+        src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+        acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+        acc_dc += pu1_ref[three_nt];
+        acc_dc -= pu1_ref[two_nt];
+
+        /* compute the DC value from the accumulated sum */
+        dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+        two_dc_val = 2 * dc_val;
+        three_dc_val = 3 * dc_val;
+
+        temp = _mm_set1_epi8(dc_val);
+
+        for(itr_count = 0; itr_count < 2; itr_count++)
+        {
+            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp);
+
+            pu1_dst += 16 * dst_strd;
+        }
+    }
+
+    else
+    {
+        __m128i  zero_8x16b;
+        __m128i sm1 = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);
+
+        /* DC filtering for the first top row and first left column */
+
+        zero_8x16b = _mm_set1_epi16(0);
+
+        if(nt == 4)
+        {
+            WORD32 temp1, temp2, temp3;
+
+            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+            src_temp2 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+            src_temp4 =  _mm_cvtepu8_epi16(src_temp3);
+            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
+
+            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+            acc_dc = _mm_cvtsi128_si32(src_temp4);
+            acc_dc += pu1_ref[three_nt];
+            acc_dc -= pu1_ref[two_nt];
+
+/* compute the DC value from the accumulated sum */
+
+            dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+            three_dc_val = 3 * dc_val;
+
+            /* replicate (three_dc_val + 2) across all 16-bit lanes */
+            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+            two_dc_val = 2 * dc_val;
+
+            /* (pu1_ref[two_nt + 1 + col] + three_dc_val + 2) */
+            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+
+            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2 */
+            src_temp2 = _mm_srli_epi16(src_temp2, 2);
+
+            src_temp2 = _mm_packus_epi16(src_temp2, zero_8x16b);
+
+            temp1 = _mm_cvtsi128_si32(src_temp2);
+
+            *(WORD32 *)(&pu1_dst[0]) = temp1;
+
+            /* restore the first value */
+            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+                            >> 2);
+
+            for(row = 1; row < nt; row++)
+                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+                                >> 2;
+
+            src_temp2 = _mm_insert_epi8(src_temp2, dc_val, 0);
+
+            src_temp2 =  _mm_shuffle_epi8(src_temp2, sm1);
+            src_temp3 =  _mm_shuffle_epi8(src_temp2, sm1);
+            src_temp4 =  _mm_shuffle_epi8(src_temp2, sm1);
+
+            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[(1 * dst_strd) + 0], 0);
+            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[(2 * dst_strd) + 0], 0);
+            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[(3 * dst_strd) + 0], 0);
+
+            temp1 = _mm_cvtsi128_si32(src_temp2);
+            temp2 = _mm_cvtsi128_si32(src_temp3);
+            temp3 = _mm_cvtsi128_si32(src_temp4);
+
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;
+
+        }
+        else if(nt == 8)
+        {
+
+            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+
+            src_temp4 = _mm_sad_epu8(src_temp3, m_zero);
+            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+            acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+            acc_dc += pu1_ref[three_nt];
+            acc_dc -= pu1_ref[two_nt];
+
+            /* compute the DC value from the accumulated sum */
+
+            dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+            three_dc_val = 3 * dc_val;
+            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+            two_dc_val = 2 * dc_val;
+
+            /* loading 16 8-bit pixels */
+            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
+
+            /* (pu1_ref[two_nt + 1 + col] + three_dc_val + 2) */
+            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+
+            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
+            src_temp2 = _mm_srli_epi16(src_temp2, 2);
+            src_temp2 = _mm_packus_epi16(src_temp2, zero_8x16b);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp2);
+
+            /* restore the first value */
+            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+                            >> 2);
+
+            for(row = 1; row < nt; row++)
+                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+                                >> 2;
+
+            /* Fill the remaining rows with DC value*/
+
+            src_temp1 = _mm_set1_epi8(dc_val);
+            src_temp2 = _mm_set1_epi8(dc_val);
+            src_temp3 = _mm_set1_epi8(dc_val);
+            src_temp4 = _mm_set1_epi8(dc_val);
+            src_temp5 = _mm_set1_epi8(dc_val);
+            src_temp6 = _mm_set1_epi8(dc_val);
+            src_temp7 = _mm_set1_epi8(dc_val);
+
+            src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
+            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
+            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
+            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
+            src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
+            src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
+            src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+
+        }
+        else if(nt == 16)
+        {
+
+            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
+
+            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+            src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
+
+            src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
+            src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
+
+            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
+            src_temp10 =  _mm_cvtepu8_epi16(src_temp10);
+
+            src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
+            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+            acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+            acc_dc += pu1_ref[three_nt];
+            acc_dc -= pu1_ref[two_nt];
+
+            /* compute the DC value from the accumulated sum */
+
+            dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+            three_dc_val = 3 * dc_val;
+            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+            two_dc_val = 2 * dc_val;
+
+            /* (pu1_ref[two_nt + 1 + col] + three_dc_val + 2) */
+            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+            src_temp10 = _mm_add_epi16(src_temp10, src_temp1);
+            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
+            src_temp2 = _mm_srli_epi16(src_temp2, 2);
+            src_temp10 = _mm_srli_epi16(src_temp10, 2);
+
+            src_temp2 = _mm_packus_epi16(src_temp2, src_temp10);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);
+
+            /* restore the first value */
+            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+                            >> 2);
+
+            for(row = 1; row < nt; row++)
+                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+                                >> 2;
+            /* Fill the remaining rows with DC value*/
+            src_temp1 =  _mm_set1_epi8(dc_val);
+            src_temp2 =  _mm_set1_epi8(dc_val);
+            src_temp3 =  _mm_set1_epi8(dc_val);
+            src_temp4 =  _mm_set1_epi8(dc_val);
+            src_temp5 =  _mm_set1_epi8(dc_val);
+            src_temp6 =  _mm_set1_epi8(dc_val);
+            src_temp7 =  _mm_set1_epi8(dc_val);
+
+            for(row = 1; row < nt; row += 8)
+            {
+
+                src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
+                src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
+                src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
+                src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
+                src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
+                src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
+                src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+
+                src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((8) * dst_strd)], 0);
+                src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((9) * dst_strd)], 0);
+                src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((10) * dst_strd)], 0);
+                src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((11) * dst_strd)], 0);
+                src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((12) * dst_strd)], 0);
+                src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((13) * dst_strd)], 0);
+                src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((14) * dst_strd)], 0);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp3);
+
+                src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((15) * dst_strd)], 0);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp7);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp1);
+
+            }
+
+        }
+        else if(nt == 32) /* unreachable: nt == 32 is handled before this else block */
+        {
+
+            __m128i src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16, src_temp17;
+
+            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
+            src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
+            src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));
+
+            /* loading 8-bit pixels (widened to 16-bit below) */
+            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
+            src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
+            src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 24));
+
+            src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
+            src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
+            src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
+            src_temp8 = _mm_sad_epu8(src_temp8, m_zero);
+
+            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
+            src_temp6 =  _mm_cvtepu8_epi16(src_temp6);
+            src_temp9 =  _mm_cvtepu8_epi16(src_temp9);
+            src_temp10 =  _mm_cvtepu8_epi16(src_temp10);
+
+            src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
+            src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
+            src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
+
+            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+            acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+            acc_dc += pu1_ref[three_nt];
+            acc_dc -= pu1_ref[two_nt];
+
+            /* compute the DC value from the accumulated sum */
+
+            dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+            three_dc_val = 3 * dc_val;
+            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+            two_dc_val = 2 * dc_val;
+
+            /* (pu1_ref[two_nt + 1 + col] + three_dc_val + 2) */
+            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+            src_temp6 = _mm_add_epi16(src_temp6, src_temp1);
+            src_temp9 = _mm_add_epi16(src_temp9, src_temp1);
+            src_temp10 = _mm_add_epi16(src_temp10, src_temp1);
+
+            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
+            src_temp2 = _mm_srli_epi16(src_temp2, 2);
+            src_temp6 = _mm_srli_epi16(src_temp6, 2);
+            src_temp9 = _mm_srli_epi16(src_temp9, 2);
+            src_temp10 = _mm_srli_epi16(src_temp10, 2);
+
+            src_temp2 = _mm_packus_epi16(src_temp2, src_temp6);
+            src_temp10 = _mm_packus_epi16(src_temp9, src_temp10);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp10);
+
+            /* restore the first value */
+            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+                            >> 2);
+
+            for(row = 1; row < nt; row++)
+                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+                                >> 2;
+            /* Fill the remaining rows with DC value*/
+            src_temp1 = _mm_set1_epi8(dc_val);
+
+            src_temp2 =  src_temp1;
+            src_temp3 = src_temp1;
+            src_temp4 =  src_temp1;
+            src_temp5 =  src_temp1;
+            src_temp6 =  src_temp1;
+            src_temp7 =  src_temp1;
+
+            src_temp12 = src_temp1;
+            src_temp13 = src_temp1;
+            src_temp14 = src_temp1;
+            src_temp15 = src_temp1;
+            src_temp16 = src_temp1;
+            src_temp17 = src_temp1;
+            src_temp11 = src_temp1;
+
+            for(row = 1; row < nt; row++)
+            {
+                src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
+                src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
+                src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
+                src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
+                src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
+                src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
+                src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd) + 16), src_temp11);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 16), src_temp12);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 16), src_temp13);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 16), src_temp14);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd) + 16), src_temp15);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd) + 16), src_temp16);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd) + 16), src_temp17);
+
+
+            }
+
+        }
+    }
+}
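+
+/* A minimal scalar sketch of the DC derivation that the SAD/HADD sequences
+   above compute; the _ref_c helper name is illustrative only and is not
+   part of the library API. */
+static WORD32 ihevc_intra_pred_luma_dc_val_ref_c(UWORD8 *pu1_ref,
+                                                 WORD32 nt,
+                                                 WORD32 log2nt)
+{
+    WORD32 acc_dc = 0;
+    WORD32 i;
+    /* sum the nt left neighbours (pu1_ref[nt .. 2nt-1]) and the nt top
+       neighbours (pu1_ref[2nt+1 .. 3nt]), skipping the top-left corner */
+    for(i = nt; i < 3 * nt; i++)
+        acc_dc += pu1_ref[i];
+    acc_dc += pu1_ref[3 * nt];
+    acc_dc -= pu1_ref[2 * nt];
+    return (acc_dc + nt) >> (log2nt + 1);
+}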
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Intra prediction interpolation filter for horizontal luma mode.
+*
+* @par Description:
+*      Horizontal intra prediction (mode 10) with reference samples location
+*      pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
+*      Refer to section 8.4.4.2.6 in the standard (special case).
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_horz_sse42(UWORD8 *pu1_ref,
+                                      WORD32 src_strd,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 dst_strd,
+                                      WORD32 nt,
+                                      WORD32 mode)
+{
+
+    WORD32 row;
+    WORD32 two_nt;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    two_nt = 2 * nt;
+
+
+    if(nt == 32)
+    {
+        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
+        __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+        __m128i sm = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);
+
+        for(row = 0; row < nt; row += 16)
+        {
+            {
+                src_temp1 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 - row - 15));
+
+                src_temp2 =  _mm_srli_si128(src_temp1, 1);
+                src_temp3 =  _mm_srli_si128(src_temp1, 2);
+                src_temp4 =  _mm_srli_si128(src_temp1, 3);
+                src_temp5 =  _mm_srli_si128(src_temp1, 4);
+                src_temp6 =  _mm_srli_si128(src_temp1, 5);
+                src_temp7 =  _mm_srli_si128(src_temp1, 6);
+                src_temp8 =  _mm_srli_si128(src_temp1, 7);
+
+                src_temp9 =  _mm_srli_si128(src_temp1, 8);
+                src_temp10 =  _mm_srli_si128(src_temp1, 9);
+                src_temp11 =  _mm_srli_si128(src_temp1, 10);
+                src_temp12 =  _mm_srli_si128(src_temp1, 11);
+                src_temp13 =  _mm_srli_si128(src_temp1, 12);
+                src_temp14 =  _mm_srli_si128(src_temp1, 13);
+                src_temp15 =  _mm_srli_si128(src_temp1, 14);
+                src_temp16 =  _mm_srli_si128(src_temp1, 15);
+
+                src_temp8 =  _mm_shuffle_epi8(src_temp8, sm);
+                src_temp7 =  _mm_shuffle_epi8(src_temp7, sm);
+                src_temp6 =  _mm_shuffle_epi8(src_temp6, sm);
+                src_temp5 =  _mm_shuffle_epi8(src_temp5, sm);
+                src_temp4 =  _mm_shuffle_epi8(src_temp4, sm);
+                src_temp3 =  _mm_shuffle_epi8(src_temp3, sm);
+                src_temp2 =  _mm_shuffle_epi8(src_temp2, sm);
+                src_temp1 =  _mm_shuffle_epi8(src_temp1, sm);
+
+                src_temp16 =  _mm_shuffle_epi8(src_temp16, sm);
+                src_temp15 =  _mm_shuffle_epi8(src_temp15, sm);
+                src_temp14 =  _mm_shuffle_epi8(src_temp14, sm);
+                src_temp13 =  _mm_shuffle_epi8(src_temp13, sm);
+                src_temp12 =  _mm_shuffle_epi8(src_temp12, sm);
+                src_temp11 =  _mm_shuffle_epi8(src_temp11, sm);
+                src_temp10 =  _mm_shuffle_epi8(src_temp10, sm);
+                src_temp9 =  _mm_shuffle_epi8(src_temp9, sm);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp16);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp15);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp14);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp13);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp12);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp11);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp10);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp9);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 8) * dst_strd)), src_temp8);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 9) * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 10) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 11) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 12) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 13) * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 14) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 15) * dst_strd)), src_temp1);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 0) * dst_strd)), src_temp16);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 1) * dst_strd)), src_temp15);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 2) * dst_strd)), src_temp14);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 3) * dst_strd)), src_temp13);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 4) * dst_strd)), src_temp12);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 5) * dst_strd)), src_temp11);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 6) * dst_strd)), src_temp10);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 7) * dst_strd)), src_temp9);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 8) * dst_strd)), src_temp8);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 9) * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 10) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 11) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 12) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 13) * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 14) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 15) * dst_strd)), src_temp1);
+
+            }
+
+        }
+
+    }
+    else
+    {
+        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6;
+        __m128i src_temp10, zero_8x16b, src_temp7;
+
+        zero_8x16b = _mm_set1_epi16(0);
+
+        /* Gradient filtering for the first row */
+
+        src_temp2 =  _mm_set1_epi16(pu1_ref[two_nt - 1]);
+        src_temp10 =  _mm_set1_epi16(pu1_ref[two_nt]);
+
+        /* loading 16 8-bit pixels */
+        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
+
+        /*(pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt])*/
+        src_temp3 = _mm_sub_epi16(src_temp4, src_temp10);
+
+        /* ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
+        src_temp3 = _mm_srai_epi16(src_temp3, 1);
+
+        /* pu1_ref[two_nt - 1]+((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
+        src_temp3 = _mm_add_epi16(src_temp2, src_temp3);
+
+        if(nt == 4)
+        {
+            WORD32 temp1, temp2, temp3;
+            src_temp3 = _mm_packus_epi16(src_temp3, zero_8x16b);
+            temp1 = _mm_cvtsi128_si32(src_temp3);
+
+            *(WORD32 *)(&pu1_dst[0]) = temp1;
+
+            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
+            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
+            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
+
+            temp1 = _mm_cvtsi128_si32(src_temp2);
+            temp2 = _mm_cvtsi128_si32(src_temp3);
+            temp3 = _mm_cvtsi128_si32(src_temp4);
+
+            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;
+
+        }
+        else if(nt == 8)
+        {
+            src_temp10 = _mm_packus_epi16(src_temp3, zero_8x16b);
+
+
+            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
+            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
+            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
+            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 5]);
+            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 6]);
+            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 7]);
+            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 8]);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp10);
+
+            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp3);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp5);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp6);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp7);
+
+        }
+        else if(nt == 16)
+        {
+            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
+            src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
+
+            src_temp10 = _mm_sub_epi16(src_temp4, src_temp10);
+            src_temp10 = _mm_srai_epi16(src_temp10, 1);
+            src_temp10 = _mm_add_epi16(src_temp2, src_temp10);
+
+            src_temp3 = _mm_packus_epi16(src_temp3, src_temp10);
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp3);
+
+            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
+            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
+            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
+            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 5]);
+            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 6]);
+            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 7]);
+            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 8]);
+            src_temp10 =  _mm_set1_epi8(pu1_ref[two_nt - 9]);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp10);
+
+            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 10]);
+            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 11]);
+            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 12]);
+            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 13]);
+            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 14]);
+            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 15]);
+            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 16]);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp3);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp4);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp5);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp6);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp7);
+
+        }
+    }
+}
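+
+/* A minimal scalar sketch of the horizontal (mode 10) prediction vectorized
+   above, assuming CLIP_U8 from ihevc_macros.h; the _ref_c helper name is
+   illustrative only and is not part of the library API. */
+static void ihevc_intra_pred_luma_horz_ref_c(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                             WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 two_nt = 2 * nt;
+    WORD32 row, col;
+    /* first row: left reference plus half the top-row gradient
+       (skipped for nt == 32, which replicates only) */
+    for(col = 0; col < nt; col++)
+    {
+        if(nt < 32)
+            pu1_dst[col] = CLIP_U8(pu1_ref[two_nt - 1]
+                            + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1));
+        else
+            pu1_dst[col] = pu1_ref[two_nt - 1];
+    }
+    /* remaining rows: replicate the corresponding left neighbour */
+    for(row = 1; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row];
+}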
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Intra prediction interpolation filter for vertical luma mode.
+*
+* @par Description:
+*    Vertical intra prediction with reference neighboring samples location
+*    pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
+*    Refer to section 8.4.4.2.6 in the standard (special case).
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_ver_sse42(UWORD8 *pu1_ref,
+                                     WORD32 src_strd,
+                                     UWORD8 *pu1_dst,
+                                     WORD32 dst_strd,
+                                     WORD32 nt,
+                                     WORD32 mode)
+{
+    WORD32 row;
+    WORD16 s2_predpixel;
+    WORD32 two_nt = 2 * nt;
+    __m128i src_temp0, src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7;
+
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    if(nt == 32)
+    {
+        __m128i temp1, temp2;
+        WORD32 itr_count;
+
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+        temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
+
+        for(itr_count = 0; itr_count < 2; itr_count++)
+        {
+            /* pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col]; */
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
+
+            pu1_dst += 16 * dst_strd;
+        }
+    }
+    else
+    {
+        /*Filtering done for the 1st column */
+        for(row = nt - 1; row >= 0; row--)
+        {
+            s2_predpixel = pu1_ref[two_nt + 1]
+                            + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
+            pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel);
+        }
+
+        /* Replication to next columns*/
+
+        if(nt == 4)
+        {
+            WORD32 temp1, temp2, temp3, temp4;
+
+            src_temp2 =   _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+            src_temp3 =  src_temp2;
+            src_temp4 =  src_temp2;
+            src_temp5 =  src_temp2;
+
+            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[(0 * dst_strd)], 0);
+            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[(1 * dst_strd)], 0);
+            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[(2 * dst_strd)], 0);
+            src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[(3 * dst_strd)], 0);
+
+            temp1 = _mm_cvtsi128_si32(src_temp2);
+            temp2 = _mm_cvtsi128_si32(src_temp3);
+            temp3 = _mm_cvtsi128_si32(src_temp4);
+            temp4 = _mm_cvtsi128_si32(src_temp5);
+
+            /* storing four 8-bit pixel values per row */
+            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;
+
+        }
+        else if(nt == 8)
+        {
+
+            src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+            src_temp1 = src_temp0;
+            src_temp2 = src_temp0;
+            src_temp3 = src_temp0;
+            src_temp4 = src_temp0;
+            src_temp5 = src_temp0;
+            src_temp6 = src_temp0;
+            src_temp7 = src_temp0;
+
+            src_temp0 = _mm_insert_epi8(src_temp0, pu1_dst[((0) * dst_strd)], 0);
+            src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
+            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
+            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
+            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
+            src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
+            src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
+            src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+
+
+        }
+        else if(nt == 16)
+        {
+            for(row = 0; row < nt; row += 8)
+            {
+
+                src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+                src_temp1 = src_temp0;
+                src_temp2 = src_temp0;
+                src_temp3 = src_temp0;
+                src_temp4 = src_temp0;
+                src_temp5 = src_temp0;
+                src_temp6 = src_temp0;
+                src_temp7 = src_temp0;
+
+                src_temp0 = _mm_insert_epi8(src_temp0, pu1_dst[((row + 0) * dst_strd)], 0);
+                src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((row + 1) * dst_strd)], 0);
+                src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((row + 2) * dst_strd)], 0);
+                src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((row + 3) * dst_strd)], 0);
+                src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((row + 4) * dst_strd)], 0);
+                src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((row + 5) * dst_strd)], 0);
+                src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((row + 6) * dst_strd)], 0);
+                src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((row + 7) * dst_strd)], 0);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp0);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp7);
+
+            }
+
+        }
+
+
+    }
+}
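+
+/* A minimal scalar sketch (illustration only, not called by the decoder) of
+*  the vertical prediction that ihevc_intra_pred_luma_ver_sse42 vectorizes,
+*  assuming the reference layout used above: pu1_ref[2 * nt] holds the
+*  top-left sample, pu1_ref[2 * nt + 1 + col] the top row and
+*  pu1_ref[2 * nt - 1 - row] the left column. */
+static void ihevc_intra_pred_luma_ver_sketch(UWORD8 *pu1_ref,
+                                             UWORD8 *pu1_dst,
+                                             WORD32 dst_strd,
+                                             WORD32 nt)
+{
+    WORD32 row, col;
+    WORD32 two_nt = 2 * nt;
+
+    for(row = 0; row < nt; row++)
+    {
+        /* Copy the row of reference samples above the block straight down */
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col];
+
+        /* For nt < 32 the first column is additionally filtered against */
+        /* the left edge, exactly as in the scalar loop of the function above */
+        if(nt < 32)
+            pu1_dst[row * dst_strd] = CLIP_U8(pu1_ref[two_nt + 1]
+                + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1));
+    }
+}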
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma mode 3 to mode 9
+*
+* @par Description:
+*    Intra prediction for modes 3 to 9 (positive angle, horizontal modes)
+*    using the neighboring reference samples pointed to by 'pu1_ref' to
+*    predict the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the source (reference samples)
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_3_to_9_sse42(UWORD8 *pu1_ref,
+                                             WORD32 src_strd,
+                                             UWORD8 *pu1_dst,
+                                             WORD32 dst_strd,
+                                             WORD32 nt,
+                                             WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 two_nt = 2 * nt;
+    WORD32 intra_pred_ang;
+
+
+    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+    __m128i fract_4x32b, intra_pred_ang_4x32b;
+    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
+    UNUSED(src_strd);
+
+
+    /* Intra Pred Angle according to the mode */
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by distance, to obtain the destination sample */
+
+    const_temp_4x32b  = _mm_set1_epi16(16);
+    const_temp2_4x32b = _mm_set1_epi32(31);
+    const_temp3_4x32b = _mm_set1_epi32(32);
+    const_temp4_4x32b = _mm_set1_epi32(4);
+
+    two_nt_4x32b = _mm_set1_epi32(two_nt - nt);
+
+
+    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+    row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+    if(nt == 4)
+    {
+
+        WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+        WORD32 temp11, temp21, temp31, temp41;
+        // WORD8  ai1_fract_temp_val[16], ai1_row_temp_val[16];
+
+        __m128i fract1_8x16b, fract2_8x16b;
+        __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+        __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+        __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b; //, src_temp8_8x16b;
+        __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+        /* pos = ((row + 1) * intra_pred_ang); */
+        res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
+
+        /* fract = pos & (31); */
+        fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+        /* idx = pos >> 5; ref_main_idx = two_nt - idx */
+        ref_main_idx_4x32b = _mm_sub_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
+
+        /*(32 - fract) */
+        row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
+
+        fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+        fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
+
+        fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+        row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
+
+        fract2_8x16b = _mm_unpackhi_epi8(row_4x32b, fract_4x32b);
+        fract1_8x16b = _mm_unpacklo_epi8(row_4x32b, fract_4x32b);
+
+        temp1_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+        temp2_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+        temp3_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+        temp4_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+        ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4);  /* next 32 bit values */
+        ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8);  /* next 32 bit values */
+        ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
+        ref_main_idx1  = _mm_cvtsi128_si32(ref_main_idx_4x32b);    /* col=0*/
+        ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* col=1*/
+        ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* col=2*/
+        ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* col=3*/
+
+        /* loading 16 8-bit pixels */
+        src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 - 1)); /* col=0*/
+        src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2 - 1)); /* col=1*/
+        src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3 - 1)); /* col=2*/
+        src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4 - 1)); /* col=3*/
+
+        src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+        src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+        src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+        src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+        /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+        src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+        src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+        src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+        src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+        src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+        src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+        src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+        src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+        src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+        src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+        /* converting 16 bit to 8 bit */
+        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+        src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+
+        src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+        src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+        src_temp3_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+        src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 4);
+        src_temp1_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+        src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 12);
+
+        temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
+        temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
+        temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
+        temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
+
+        /* storing four 8-bit pixel values per row */
+        *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+        *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+        *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+        *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+    }
+
+    else if(nt == 16 || nt == 32)
+    {
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        two_nt_4x32b = _mm_set1_epi16(two_nt);
+
+        for(col = 0; col < nt; col += 8)
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+            //WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+            fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
+            fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
+
+            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
+            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
+            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
+            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
+            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
+            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+            /* idx = pos >> 5; ref_main_idx = two_nt - idx */
+            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
+
+            for(row = 0; row < nt; row += 8)
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1 - (8 + row))); /* col=0*/
+                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1 - (8 + row))); /* col=1*/
+                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1 - (8 + row))); /* col=2*/
+                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1 - (8 + row))); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1 - (8 + row))); /* col=4*/
+                src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1 - (8 + row))); /* col=5*/
+                src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1 - (8 + row))); /* col=6*/
+                src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1 - (8 + row))); /* col=7*/
+
+                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=4*/
+                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=5*/
+                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=6*/
+                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=7*/
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp1_8x16b);          /* row=7*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp5_8x16b);       /* row=6*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp2_8x16b);       /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp6_8x16b);       /* row=4*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp3_8x16b);       /* row=3*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp7_8x16b);       /* row=2*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp4_8x16b);       /* row=1*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 0))), src_temp8_8x16b);       /* row=0*/
+
+            }
+        }
+    }
+    else
+    {
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        two_nt_4x32b = _mm_set1_epi16(two_nt - nt);
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* idx = pos >> 5; ref_main_idx = two_nt - idx */
+            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /*(32 - fract) */
+            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+            fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
+            fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
+
+            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
+            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
+            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
+            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
+            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
+            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
+
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1)); /* col=0*/
+                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1)); /* col=1*/
+                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1)); /* col=2*/
+                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1)); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1)); /* col=4*/
+                src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1)); /* col=5*/
+                src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1)); /* col=6*/
+                src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1)); /* col=7*/
+
+                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=4*/
+                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=5*/
+                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=6*/
+                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=7*/
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst), src_temp8_8x16b);       /* row=0*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 1)), src_temp4_8x16b);       /* row=1*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 2)), src_temp7_8x16b);       /* row=2*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 3)), src_temp3_8x16b);       /* row=3*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 4)), src_temp6_8x16b);       /* row=4*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 5)), src_temp2_8x16b);       /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 6)), src_temp5_8x16b);       /* row=6*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 7)), src_temp1_8x16b);          /* row=7*/
+
+            }
+        }
+    }
+
+}
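+
+/* A minimal scalar sketch of the two-tap interpolation that the SIMD code in
+*  ihevc_intra_pred_luma_mode_3_to_9_sse42 evaluates eight pixels at a time.
+*  'ref_main' and the helper name are illustrative: ref_main abstracts the
+*  main reference array (for these horizontal modes the left column, which
+*  the code above addresses through pu1_ref via the reversal shuffles), and
+*  'line' is the zero-based index of the predicted line. Illustration only,
+*  not called by the decoder. */
+static UWORD8 angular_interp_sketch(const UWORD8 *ref_main,
+                                    WORD32 intra_pred_ang,
+                                    WORD32 line)
+{
+    WORD32 pos   = (line + 1) * intra_pred_ang; /* displacement in 1/32 pel */
+    WORD32 idx   = pos >> 5;                    /* integer part             */
+    WORD32 fract = pos & 31;                    /* fractional part          */
+
+    /* same expression as quoted in the SIMD comments:                      */
+    /* ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5           */
+    return (UWORD8)(((32 - fract) * ref_main[idx]
+                     + fract * ref_main[idx + 1] + 16) >> 5);
+}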
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Intra prediction interpolation filter for luma mode 11 to mode 17
+*
+* @par Description:
+*    Intra prediction for modes 11 to 17 (negative angle, horizontal modes)
+*    using the neighboring reference samples pointed to by 'pu1_ref' to
+*    predict the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the source (reference samples)
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_11_to_17_sse42(UWORD8 *pu1_ref,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 dst_strd,
+                                               WORD32 nt,
+                                               WORD32 mode)
+{
+
+    /* This function and ihevc_intra_pred_luma_mode_19_to_25 are the same */
+    /* except for the ref main & side sample assignment; they can be */
+    /* combined for optimization */
+
+    WORD32 row, col, k;
+    WORD32 two_nt;
+    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+    WORD32 ref_idx;
+
+    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+    __m128i fract_4x32b,  intra_pred_ang_4x32b;
+    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
+
+
+    UWORD8 ref_tmp[2 * MAX_CU_SIZE + 2];
+    UWORD8 *ref_main;
+    UWORD8 *ref_temp;
+    UNUSED(src_strd);
+
+    inv_ang_sum = 128;
+    two_nt    = 2 * nt;
+    ref_temp = ref_tmp + 1;
+    ref_main = ref_temp + nt - 1;
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by distance, to obtain the destination sample */
+    const_temp_4x32b  = _mm_set1_epi16(16);
+    const_temp2_4x32b = _mm_set1_epi32(31);
+    const_temp3_4x32b = _mm_set1_epi32(32);
+    const_temp4_4x32b = _mm_set1_epi32(4);
+
+    two_nt_4x32b = _mm_set1_epi32(1);
+
+
+    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+    row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+    if(nt == 4)
+    {
+
+        WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+        WORD32 temp11, temp21, temp31, temp41;
+//        WORD8  ai1_fract_temp_val[16], ai1_row_temp_val[16];
+
+        __m128i fract1_8x16b, fract2_8x16b;
+        __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+        __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+        __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+        __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+        /* Intermediate reference samples for negative angle modes */
+        /* This has to be removed during optimization */
+        /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+        inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+
+        ref_main = ref_temp + nt - 1;
+        for(k = 0; k < nt + 1; k++)
+            ref_temp[k + nt - 1] = pu1_ref[two_nt - k];
+
+        ref_main = ref_temp + nt - 1;
+        ref_idx = (nt * intra_pred_ang) >> 5;
+
+        /* SIMD optimization can be done using a look-up table for the loop */
+        /* For negative angles, derive the main reference samples from the */
+        /* side reference samples; refer to section 8.4.4.2.6 */
+        for(k = -1; k > ref_idx; k--)
+        {
+            inv_ang_sum += inv_ang;
+            ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
+        }
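+
+        /* Worked example of the projection above, assuming the standard */
+        /* angle tables: for mode 17 and nt = 4, intra_pred_ang = -26 and */
+        /* inv_ang = 315, so ref_idx = (4 * -26) >> 5 = -4 and the loop */
+        /* fills ref_main[-1], ref_main[-2], ref_main[-3] from */
+        /* pu1_ref[two_nt + 1], pu1_ref[two_nt + 2], pu1_ref[two_nt + 4] */
+        /* (inv_ang_sum = 443, 758, 1073; inv_ang_sum >> 8 = 1, 2, 4). */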
+
+
+        /* pos = ((row + 1) * intra_pred_ang); */
+        res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
+
+        /* fract = pos & (31); */
+        fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+        /* idx = pos >> 5; ref_main_idx = idx + 1 */
+        ref_main_idx_4x32b = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
+
+        /*(32 - fract) */
+        row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
+
+        fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+        fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
+
+        fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+        row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
+
+        fract2_8x16b = _mm_unpackhi_epi8(fract_4x32b, row_4x32b);
+        fract1_8x16b = _mm_unpacklo_epi8(fract_4x32b, row_4x32b);
+
+        temp1_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+        temp2_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+        temp3_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+        temp4_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+        ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4);  /* next 32 bit values */
+        ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8);  /* next 32 bit values */
+        ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
+        ref_main_idx1  = _mm_cvtsi128_si32(ref_main_idx_4x32b);    /* col=0*/
+        ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* col=1*/
+        ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* col=2*/
+        ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* col=3*/
+
+        /* loading 16 8-bit pixels */
+        src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col=0*/
+        src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col=1*/
+        src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col=2*/
+        src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col=3*/
+
+        src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+        src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+        src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+        src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+        /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+        src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+        src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+        src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+        src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+        src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+        src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+        src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+        src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+        src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+        src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+        /* converting 16 bit to 8 bit */
+        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+        src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+
+        src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+        src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+        src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+        src_temp1_8x16b = _mm_srli_si128(src_temp7_8x16b, 4);
+        src_temp2_8x16b = _mm_srli_si128(src_temp7_8x16b, 8);
+        src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 12);
+
+        temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
+        temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
+        temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
+        temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
+
+        /* storing four 8-bit pixel values per row */
+        *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+        *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+        *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+        *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+    }
+
+    else if(nt == 32)
+    {
+
+
+        __m128i temp1, temp2, temp3, temp11, temp12;
+        __m128i src_values0, src_values1;
+        /* Intermediate reference samples for negative angle modes */
+
+        ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
+        temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 17));
+        temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+
+        /* For negative angles, derive the main reference samples from the side */
+
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+        src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); /*(nt+16)-(two_nt-1)*/
+
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode]));
+        temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+        src_values1 = _mm_shuffle_epi8(src_values1, temp2);
+        src_values0 = _mm_shuffle_epi8(src_values0, temp12);
+        src_values1 = _mm_shuffle_epi8(src_values1, temp11);
+
+        temp1 = _mm_shuffle_epi8(temp1, temp2);
+        temp3 = _mm_shuffle_epi8(temp3, temp2);
+
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp3);
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp1);
+        _mm_storeu_si128((__m128i *)(ref_main - 16), src_values0);
+        _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[17 - mode][0]), src_values1);
+
+
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+        for(col = 0; col < nt; col += 8)
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+            // WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* idx = pos >> 5; ref_main_idx = idx + 1 */
+            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+            /*(32 - fract) */
+            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+            fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+            fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
+
+            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
+            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
+            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
+            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
+            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
+            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
+
+            for(row = 0; row < nt; row += 8)
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
+                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
+                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
+                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
+
+                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
+                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
+                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
+                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=4*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=5*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=6*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=7*/
+
+                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
+                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
+                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
+                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b);          /* row=0*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b);       /* row=1*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b);       /* row=2*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b);       /* row=3*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b);       /* row=4*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b);       /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b);       /* row=6*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b);       /* row=7*/
+
+            }
+        }
+    }
+    else if(nt == 16)
+    {
+
+        __m128i temp1, temp2, temp11, src_values0;
+        /* Intermediate reference samples for negative angle modes */
+        /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+        ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
+        temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+        temp1 = _mm_shuffle_epi8(temp1, temp2);
+        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+
+        _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        two_nt_4x32b = _mm_set1_epi16(1);
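+        /* note: despite its name this register holds 1, the +1 folded into
+           ref_main_idx (ref_main_idx = idx + 1), not 2 * nt */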
+
+        for(col = 0; col < nt; col += 8)
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+            // WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* ref_main_idx = (pos >> 5) + 1; */
+            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+            /*(32 - fract) */
+            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
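+            /* duplicate each 16-bit fract (and 32 - fract) into both bytes of
+               its lane; the byte interleave below then forms the packed 8-bit
+               weight pairs consumed by _mm_maddubs_epi16 */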
+            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+            fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+            fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
+
+            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
+            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
+            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
+            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
+            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
+            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
+
+            for(row = 0; row < nt; row += 8)
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
+                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
+                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
+                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
+
+                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
+                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
+                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
+                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=4*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=5*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=6*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=7*/
+
+                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
+                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
+                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
+                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0,2*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1,3*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4,6*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5,7*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b);          /* row=0*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b);       /* row=1*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b);       /* row=2*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b);       /* row=3*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b);       /* row=4*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b);       /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b);       /* row=6*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b);       /* row=7*/
+
+            }
+        }
+    }
+    else
+    {
+
+
+        __m128i temp1, temp2, temp11, src_values0;
+        /* Intermediate reference samples for negative angle modes */
+        /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+        ref_temp[two_nt - 1] = pu1_ref[nt];
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 1));
+
+        /* For negative angles, derive the main reference samples from the side */
+
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+        temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+        temp1 = _mm_shuffle_epi8(temp1, temp2);
+        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+        src_values0 = _mm_srli_si128(src_values0, 8);
+
+        _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
+        _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
+
+
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+            //WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* ref_main_idx = (pos >> 5) + 1; */
+            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /*(32 - fract) */
+            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+            fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+            fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
+
+            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
+            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
+            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
+            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
+            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
+            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
+
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
+                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
+                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
+                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5)); /* col=4*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6)); /* col=5*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7)); /* col=6*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8)); /* col=7*/
+
+                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
+                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
+                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
+                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0,2*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1,3*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4,6*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5,7*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp1_8x16b);       /* row=0*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp5_8x16b);       /* row=1*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp2_8x16b);       /* row=2*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp6_8x16b);       /* row=3*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (4))), src_temp3_8x16b);       /* row=4*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (5))), src_temp7_8x16b);       /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (6))), src_temp4_8x16b);       /* row=6*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (7))), src_temp8_8x16b);       /* row=7*/
+
+            }
+        }
+    }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Intra prediction interpolation filter for luma modes 19 to 25
+*
+* @par Description:
+*    Intra prediction for modes 19 to 25 (negative angle, vertical modes) with
+*    reference neighboring samples located at 'pu1_ref' predicting the TU
+*    block located at 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the source reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
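+
+/* A minimal scalar sketch (illustrative only, kept as a comment) of the
+ * per-pixel computation this routine vectorizes; local names (pos, idx,
+ * fract) mirror the inline comments below:
+ *
+ *     for(row = 0; row < nt; row++)
+ *     {
+ *         WORD32 pos   = (row + 1) * intra_pred_ang;
+ *         WORD32 idx   = pos >> 5;      // integer step along ref_main
+ *         WORD32 fract = pos & 31;      // 1/32-pel fractional position
+ *         for(col = 0; col < nt; col++)
+ *         {
+ *             pu1_dst[row * dst_strd + col] =
+ *                 ((32 - fract) * ref_main[col + idx + 1]
+ *                  + fract      * ref_main[col + idx + 2] + 16) >> 5;
+ *         }
+ *     }
+ *
+ * The SSE4.2 paths below evaluate eight rows of (pos, idx, fract) at once
+ * and fold the two-tap filter into _mm_maddubs_epi16 with packed
+ * fract / (32 - fract) byte pairs.
+ */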
+
+
+void ihevc_intra_pred_luma_mode_19_to_25_sse42(UWORD8 *pu1_ref,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 dst_strd,
+                                               WORD32 nt,
+                                               WORD32 mode)
+{
+
+    WORD32 row, k;
+    WORD32 two_nt, intra_pred_ang;
+    WORD32 inv_ang, inv_ang_sum;
+    //WORD32 ref_main_idx, pos, fract, idx;
+    WORD32 ref_idx;
+    UWORD8 ref_tmp[(2 * MAX_CU_SIZE) + 2];
+    UWORD8 *ref_main, *ref_temp;
+
+    __m128i  /*fract_8x16b,*/ const_temp_8x16b, sm3;
+    __m128i temp1, temp2, temp3, temp4;
+    __m128i temp11, temp12, temp13, temp14;
+    UNUSED(src_strd);
+
+    two_nt = 2 * nt;
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+    inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
+
+    /* Intermediate reference samples for negative angle modes */
+    /* This has to be removed during optimization*/
+    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+    ref_temp = ref_tmp + 1;
+    ref_main = ref_temp + nt - 1;
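+
+    /* Buffer layout (sketch of intent): ref_main[0 .. nt] holds the above
+       (main) reference row copied from pu1_ref[two_nt ..], while
+       ref_main[-nt .. -1] receives left samples projected through the
+       inverse angle, so negative idx values index ref_main directly. */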
+
+
+    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+
+
+    const_temp_8x16b = _mm_set1_epi16(16);
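+    /* rounding offset for the >> 5 normalization: 16 == 1 << (5 - 1) */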
+
+    if(nt == 32)
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+        __m128i src_values0, src_values1, src_values2, src_values3;
+        __m128i  src_values4, src_values5, src_values6, src_values7;
+        WORD32 col = 0;
+
+        /* Intermediate reference samples for negative angle modes */
+        /* This has to be removed during optimization*/
+        /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+        ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
+        temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 16));
+
+        /* SIMD optimization can be done using a look-up table for the loop */
+        /* For negative angles, derive the main reference samples from the side */
+        /* reference samples; refer to section 8.4.4.2.6 */
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
+        src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); /*(nt+16)-(two_nt-1)*/
+
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19]));
+        temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+        src_values1 = _mm_shuffle_epi8(src_values1, temp12);
+
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp3);
+        _mm_storeu_si128((__m128i *)(ref_main - 16), src_values1);
+        _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[mode - 19][0]), src_values0);
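+
+        /* inv_angle_shuffle[] is taken to be a table of precomputed per-mode
+           pshufb control masks that realize the inverse-angle projection of
+           the side samples, replacing the scalar loop used in the nt == 4
+           path below */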
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        const_temp8_4x32b = _mm_set1_epi16(8);
+
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        for(row = 0; row < nt; row += 8)
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (pos >> 5) + 1; */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+            for(col = 0; col < nt; col += 16)
+            {
+                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + col));
+                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + col));
+                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + col));
+                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + col));
+                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8 + col));
+                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8 + col));
+                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8 + col));
+                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8 + col));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* storing 8-bit pixel values */
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0);       /* row=0*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1);   /* row=1*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2);   /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3);   /* row=3*/
+
+
+                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + col));
+                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + col));
+                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + col));
+                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + col));
+                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8 + col));
+                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8 + col));
+                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8 + col));
+                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8 + col));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* storing 8-bit pixel values */
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0);   /* row=4*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1);   /* row=5*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2);   /* row=6*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3);   /* row=7*/
+
+            }
+            pu1_dst += 8 * dst_strd;
+        }
+
+    }
+    else if(nt == 16) /* for nt = 16 case */
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+        __m128i src_values0, src_values1, src_values2, src_values3;
+        __m128i  src_values4, src_values5, src_values6, src_values7;
+
+
+        /* Intermediate reference samples for negative angle modes */
+        /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+        ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
+
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
+
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+
+        _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        const_temp8_4x32b = _mm_set1_epi16(8);
+
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        for(row = 0; row < nt; row += 8)
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (pos >> 5) + 1; */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+            {
+                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));
+                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));
+                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));
+                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));
+                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8));
+                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8));
+                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8));
+                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* storing 8-bit pixel values */
+                _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0);       /* row=0*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1);   /* row=1*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2);   /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3);   /* row=3*/
+
+
+                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));
+                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));
+                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));
+                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));
+                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8));
+                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8));
+                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8));
+                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* storing 8-bit pixel values */
+                _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0);   /* row=4*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1);   /* row=5*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2);   /* row=6*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3);   /* row=7*/
+
+            }
+            pu1_dst += 8 * dst_strd;
+        }
+    }
+    else if(nt == 8)
+    {
+
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+        __m128i src_values0, src_values1, src_values2, src_values3;
+        __m128i  src_values4, src_values5, src_values6, src_values7;
+
+
+        /* Intermediate reference samples for negative angle modes */
+        /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+        ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+        temp1 = _mm_loadl_epi64((__m128i *)(pu1_ref + two_nt));
+
+        /* For negative angles, derive the main reference samples from the side */
+
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref)); /*nt-(nt+15)*/
+
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+        src_values0 = _mm_srli_si128(src_values0, 8);
+        _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
+        _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
+
+
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+
+
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (pos >> 5) + 1; */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));  /* row = 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));  /* row = 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));  /* row = 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));  /* row = 3 */
+            src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));  /* row = 4 */
+            src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));  /* row = 5 */
+            src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));  /* row = 6 */
+            src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));  /* row = 7 */
+
+            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+            src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+            src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+            src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+            src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+            src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+            src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+            src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+            src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+            src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+            src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+            src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+            src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+            src_values4 = _mm_srai_epi16(src_values4,  5);
+            src_values5 = _mm_srai_epi16(src_values5,  5);
+            src_values6 = _mm_srai_epi16(src_values6,  5);
+            src_values7 = _mm_srai_epi16(src_values7,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, src_values1);
+            src_values2 = _mm_packus_epi16(src_values2, src_values3);
+            src_values1 = _mm_srli_si128(src_values0, 8);
+            src_values3 = _mm_srli_si128(src_values2, 8);
+            src_values4 = _mm_packus_epi16(src_values4, src_values5);
+            src_values6 = _mm_packus_epi16(src_values6, src_values7);
+            src_values5 = _mm_srli_si128(src_values4, 8);
+            src_values7 = _mm_srli_si128(src_values6, 8);
+
+            /* store 8 pixels (8-bit) per row */
+            _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0);       /* row=0*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1);   /* row=1*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2);   /* row=2*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3);   /* row=3*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4);   /* row=4*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5);   /* row=5*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6);   /* row=6*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7);   /* row=7*/
+        }
+    }
+    else /* nt == 4 */
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+        for(k = 0; k < (nt + 1); k++)
+            ref_temp[k + nt - 1] = pu1_ref[two_nt + k];
+        ref_idx = (nt * intra_pred_ang) >> 5;
+        inv_ang_sum = 128;
+
+        for(k = -1; k > ref_idx; k--)
+        {
+            inv_ang_sum += inv_ang;
+            ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
+        }
+
+
+        const_temp2_4x32b = _mm_set1_epi32(31);
+        const_temp3_4x32b = _mm_set1_epi32(32);
+
+        two_nt_4x32b = _mm_set1_epi32(1);
+
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+        {
+            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+            WORD32 temp11, temp21, temp31, temp41;
+
+
+            __m128i fract1_8x16b, fract2_8x16b,  res_temp5_4x32b;
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = 1 + (pos >> 5) */
+            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
+
+            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
+            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
+            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
+            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
+            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
+            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
+
+            /* fract = pos & 31 */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
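+            /* The shifts/ORs above duplicate fract and (32 - fract) into both
+             * bytes of each lane, and the unpacks interleave them into per-row
+             * (fract, 32 - fract) byte pairs; after the per-row broadcasts
+             * below, one _mm_maddubs_epi16 per row evaluates
+             * (32 - fract) * ref[idx] + fract * ref[idx + 1] directly on the
+             * 8-bit reference samples. */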
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp3 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp4 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));  /* row 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2));  /* row 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3));  /* row 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4));  /* row 3 */
+
+            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, src_values1);
+            src_values2 = _mm_packus_epi16(src_values2, src_values3);
+            src_values1 = _mm_srli_si128(src_values0, 8);
+            src_values3 = _mm_srli_si128(src_values2, 8);
+
+            temp11 = _mm_cvtsi128_si32(src_values0);
+            temp21 = _mm_cvtsi128_si32(src_values1);
+            temp31 = _mm_cvtsi128_si32(src_values2);
+            temp41 = _mm_cvtsi128_si32(src_values3);
+
+            /* store 4 pixels (8-bit) per row as a 32-bit write */
+            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+        }
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma mode 27 to mode 33
+*
+* @par Description:
+*    Intra prediction for modes 27 to 33 (positive angle, vertical modes) with
+*    reference neighboring samples located at 'pu1_ref', writing to the TU
+*    block located at 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the source (reference samples)
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
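+/*
+ * For reference, a scalar sketch of the computation the SIMD paths below
+ * vectorize (illustrative only; the loop variables here are local to this
+ * comment, not part of the implementation):
+ *
+ *     for(row = 0; row < nt; row++)
+ *     {
+ *         WORD32 pos   = (row + 1) * intra_pred_ang;
+ *         WORD32 idx   = pos >> 5;
+ *         WORD32 fract = pos & 31;
+ *         for(col = 0; col < nt; col++)
+ *             pu1_dst[row * dst_strd + col] =
+ *                 ((32 - fract) * pu1_ref[two_nt + 1 + idx + col]
+ *                  + fract * pu1_ref[two_nt + 1 + idx + col + 1] + 16) >> 5;
+ *     }
+ */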
+void ihevc_intra_pred_luma_mode_27_to_33_sse42(UWORD8 *pu1_ref,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 dst_strd,
+                                               WORD32 nt,
+                                               WORD32 mode)
+{
+    WORD32 row;
+    WORD32 two_nt;
+    WORD32 intra_pred_ang;
+
+    __m128i temp11, temp12, temp13, temp14;
+
+    __m128i     const_temp_8x16b;
+    __m128i temp1, temp2, temp3, temp4, sm3;
+    UNUSED(src_strd);
+
+    two_nt = 2 * nt;
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    const_temp_8x16b = _mm_set1_epi16(16);
+    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+    if(nt == 32)
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+        int col = 0;
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        const_temp8_4x32b = _mm_set1_epi16(8);
+
+        two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        for(row = 0; row < nt; row += 8)
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i  src_values4, src_values5, src_values6, src_values7;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (two_nt + 1) + (pos >> 5) */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & 31 */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+            for(col = 0; col < nt; col += 16)
+            {
+                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + col));
+                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + col));
+                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + col));
+                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + col));
+                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8 + col));
+                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8 + col));
+                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8 + col));
+                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8 + col));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* store 16 pixels (8-bit) per row */
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0);       /* row=0*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1);   /* row=1*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2);   /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3);   /* row=3*/
+
+
+                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + col));
+                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + col));
+                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + col));
+                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + col));
+                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8 + col));
+                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8 + col));
+                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8 + col));
+                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8 + col));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* store 16 pixels (8-bit) per row */
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0);   /* row=4*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1);   /* row=5*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2);   /* row=6*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3);   /* row=7*/
+
+            }
+            pu1_dst += 8 * dst_strd;
+        }
+
+    }
+    else if(nt == 16) /* for nt = 16 case */
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        const_temp8_4x32b = _mm_set1_epi16(8);
+
+        two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        for(row = 0; row < nt; row += 8)
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i  src_values4, src_values5, src_values6, src_values7;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (two_nt + 1) + (pos >> 5) */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & 31 */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+            {
+                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));
+                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));
+                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));
+                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));
+                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8));
+                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8));
+                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8));
+                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* store 16 pixels (8-bit) per row */
+                _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0);       /* row=0*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1);   /* row=1*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2);   /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3);   /* row=3*/
+
+
+                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));
+                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));
+                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));
+                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));
+                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8));
+                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8));
+                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8));
+                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* store 16 pixels (8-bit) per row */
+                _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0);   /* row=4*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1);   /* row=5*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2);   /* row=6*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3);   /* row=7*/
+
+            }
+            pu1_dst += 8 * dst_strd;
+        }
+
+    }
+    else if(nt == 8)
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+
+        two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        /* nt == 8: all 8 rows are computed in a single pass, so no row loop is needed */
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i  src_values4, src_values5, src_values6, src_values7;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (two_nt + 1) + (pos >> 5) */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & 31 */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));  /* row 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));  /* row 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));  /* row 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));  /* row 3 */
+            src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));  /* row 4 */
+            src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));  /* row 5 */
+            src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));  /* row 6 */
+            src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));  /* row 7 */
+
+            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+            src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+            src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+            src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+            src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+            src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+            src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+            src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+            src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+            src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+            src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+            src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+            src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+            src_values4 = _mm_srai_epi16(src_values4,  5);
+            src_values5 = _mm_srai_epi16(src_values5,  5);
+            src_values6 = _mm_srai_epi16(src_values6,  5);
+            src_values7 = _mm_srai_epi16(src_values7,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, src_values1);
+            src_values2 = _mm_packus_epi16(src_values2, src_values3);
+            src_values1 = _mm_srli_si128(src_values0, 8);
+            src_values3 = _mm_srli_si128(src_values2, 8);
+            src_values4 = _mm_packus_epi16(src_values4, src_values5);
+            src_values6 = _mm_packus_epi16(src_values6, src_values7);
+            src_values5 = _mm_srli_si128(src_values4, 8);
+            src_values7 = _mm_srli_si128(src_values6, 8);
+
+            /* store 8 pixels (8-bit) per row */
+            _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0);       /* row=0*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1);   /* row=1*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2);   /* row=2*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3);   /* row=3*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4);   /* row=4*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5);   /* row=5*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6);   /* row=6*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7);   /* row=7*/
+        }
+
+    }
+    else /* nt == 4 */
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+        const_temp2_4x32b = _mm_set1_epi32(31);
+        const_temp3_4x32b = _mm_set1_epi32(32);
+
+        two_nt_4x32b = _mm_set1_epi32(two_nt + 1);
+
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+        {
+            WORD32 temp11, temp21, temp31, temp41;
+
+            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+
+            __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b;
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (two_nt + 1) + (pos >> 5) */
+            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
+
+            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
+            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
+            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
+            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
+            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
+            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
+
+            /* fract = pos & 31 */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp3 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp4 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));  /* row 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2));  /* row 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3));  /* row 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4));  /* row 3 */
+
+            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, src_values1);
+            src_values2 = _mm_packus_epi16(src_values2, src_values3);
+            src_values1 = _mm_srli_si128(src_values0, 8);
+            src_values3 = _mm_srli_si128(src_values2, 8);
+
+            temp11 = _mm_cvtsi128_si32(src_values0);
+            temp21 = _mm_cvtsi128_si32(src_values1);
+            temp31 = _mm_cvtsi128_si32(src_values2);
+            temp41 = _mm_cvtsi128_si32(src_values3);
+
+            /* store 4 pixels (8-bit) per row as a 32-bit write */
+            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+        }
+    }
+}
+
diff --git a/common/x86/ihevc_intra_pred_filters_ssse3_intr.c b/common/x86/ihevc_intra_pred_filters_ssse3_intr.c
new file mode 100644
index 0000000..dbab80a
--- /dev/null
+++ b/common/x86/ihevc_intra_pred_filters_ssse3_intr.c
@@ -0,0 +1,5127 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_intra_pred_filters_ssse3_intr.c
+*
+* @brief
+*  Contains function definitions for intra prediction interpolation filters
+*
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+*  - ihevc_intra_pred_luma_planar_ssse3()
+*  - ihevc_intra_pred_luma_dc_ssse3()
+*  - ihevc_intra_pred_luma_horz_ssse3()
+*  - ihevc_intra_pred_luma_ver_ssse3()
+*  - ihevc_intra_pred_luma_mode2_ssse3()
+*  - ihevc_intra_pred_luma_mode_18_34_ssse3()
+*  - ihevc_intra_pred_luma_mode_3_to_9_ssse3()
+*  - ihevc_intra_pred_luma_mode_11_to_17_ssse3()
+*  - ihevc_intra_pred_luma_mode_19_to_25_ssse3()
+*  - ihevc_intra_pred_luma_mode_27_to_33_ssse3()
+*  - ihevc_intra_pred_luma_ref_substitution_ssse3()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdlib.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_defs.h"
+#include "ihevc_tables_x86_intr.h"
+
+#include <immintrin.h>
+
+/****************************************************************************/
+/* Constant Macros                                                          */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+
+
+/****************************************************************************/
+/* Function Macros                                                          */
+/****************************************************************************/
+#define GET_BITS(y,x) (((y) >> (x)) & 1)
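+/* e.g. GET_BITS(0x5, 2) == 1 and GET_BITS(0x5, 1) == 0 */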
+
+/* tables to shuffle 8-bit values */
+
+
+/*****************************************************************************/
+/* global tables Definition                                                  */
+/*****************************************************************************/
+
+
+/*****************************************************************************/
+/* Function Definition                                                      */
+/*****************************************************************************/
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for pu1_ref substitution
+*
+*
+* @par Description:
+*    Reference substitution process for samples unavailable for prediction.
+*    Refer to section 8.4.4.2.2
+*
+* @param[in] pu1_top_left
+*  UWORD8 pointer to the top-left
+*
+* @param[in] pu1_top
+*  UWORD8 pointer to the top
+*
+* @param[in] pu1_left
+*  UWORD8 pointer to the left
+*
+* @param[in] src_strd
+*  WORD32 Source stride
+*
+* @param[in] nbr_flags
+*  WORD32 neighbor availability flags
+*
+* @param[in] nt
+*  WORD32 transform Block size
+*
+* @param[in] dst_strd
+*  WORD32 Destination stride
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
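+/*
+ * Layout of the assembled reference array pu1_dst ((4 * nt + 1) samples),
+ * as built by the copies at the start of the available-neighbor path:
+ *
+ *     pu1_dst[0 .. two_nt - 1]        left column, bottom to top
+ *     pu1_dst[two_nt]                 top-left sample
+ *     pu1_dst[two_nt + 1 .. 4 * nt]   top row, left to right
+ *
+ * Unavailable stretches are then back-filled from the nearest available
+ * sample (or the whole array is set to dc_val when no neighbor is available).
+ */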
+void ihevc_intra_pred_luma_ref_substitution_ssse3(UWORD8 *pu1_top_left,
+                                                  UWORD8 *pu1_top,
+                                                  UWORD8 *pu1_left,
+                                                  WORD32 src_strd,
+                                                  WORD32 nt,
+                                                  WORD32 nbr_flags,
+                                                  UWORD8 *pu1_dst,
+                                                  WORD32 dst_strd)
+{
+    UWORD8 pu1_ref;
+    WORD32 dc_val, i;
+    WORD32 total_samples = (4 * nt) + 1;
+    WORD32 two_nt = 2 * nt;
+
+    WORD32 three_nt = 3 * nt;
+    WORD32 get_bits;
+    WORD32 next;
+    WORD32 bot_left, left, top, tp_right, tp_left;
+
+    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+    UNUSED(dst_strd);
+
+    dc_val = 1 << (BIT_DEPTH - 1);
+
+
+    /* Neighbor Flag Structure*/
+    /* MSB ---> LSB */
+    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
+              1         4         4     4         4
+     */
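+    /* e.g. nbr_flags == 0x10180 marks top-left (bit 16), top (bit 8) and
+       left (bit 7) available, with top-right and bottom-left unavailable,
+       in the one-bit-per-side checks used on the nt <= 8 path below */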
+    /* If no neighbor flags are present, fill the neighbor samples with DC value */
+    if(nbr_flags == 0)
+    {
+        for(i = 0; i < total_samples; i++)
+        {
+            pu1_dst[i] = dc_val;
+        }
+    }
+    else
+    {
+        /* Else fill the corresponding samples */
+        pu1_dst[two_nt] = *pu1_top_left;
+        for(i = 0; i < two_nt; i++)
+            pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+        for(i = 0; i < two_nt; i++)
+            pu1_dst[two_nt + 1 + i] = pu1_top[i];
+
+        if(nt <= 8)
+        {
+            /* 1 bit extraction for all the neighboring blocks */
+            tp_left = (nbr_flags & 0x10000) >> 16;
+            bot_left = (nbr_flags & 0x8) >> 3;
+            left = (nbr_flags & 0x80) >> 7;
+            top = (nbr_flags & 0x100) >> 8;
+            tp_right = (nbr_flags & 0x1000) >> 12;
+
+            next = 1;
+
+            /* If bottom-left is not available, apply the reverse substitution process */
+            if(bot_left == 0)
+            {
+                WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };
+
+                /* Check for the 1st available sample from bottom-left*/
+                while(!a_nbr_flag[next])
+                    next++;
+
+                /* If Left, top-left are available*/
+                if(next <= 2)
+                {
+                    idx = nt * next;
+                    pu1_ref = pu1_dst[idx];
+                    for(i = 0; i < idx; i++)
+                        pu1_dst[i] = pu1_ref;
+                }
+                else /* If top, top-right are available */
+                {
+                    /* idx is changed to copy 1 pixel value for the top-left, if top-left is not available */
+                    idx = (nt * (next - 1)) + 1;
+                    pu1_ref = pu1_dst[idx];
+                    for(i = 0; i < idx; i++)
+                        pu1_dst[i] = pu1_ref;
+                }
+            }
+
+            /* Forward Substitution Process */
+            /* If left is unavailable, copy the last bottom-left value */
+            if(left == 0)
+            {
+                for(i = 0; i < nt; i++)
+                    pu1_dst[nt + i] = pu1_dst[nt - 1];
+            }
+            /* If top-left is unavailable, copy the last left value */
+            if(tp_left == 0)
+                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
+            /* If top is unavailable, copy the last top-left value */
+            if(top == 0)
+            {
+                for(i = 0; i < nt; i++)
+                    pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
+            }
+            /* If top-right is unavailable, copy the last top value */
+            if(tp_right == 0)
+            {
+                for(i = 0; i < nt; i++)
+                    pu1_dst[three_nt + 1 + i] = pu1_dst[three_nt];
+            }
+        }
+
+        if(nt == 16)
+        {
+            WORD32 nbr_flags_temp = 0;
+            nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
+                            + ((nbr_flags & 0x300) >> 4)
+                            + ((nbr_flags & 0x3000) >> 6)
+                            + ((nbr_flags & 0x10000) >> 8);
+
+            /* compute trailing zeros in nbr_flags for the reverse substitution process starting from below-left */
+            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
+
+                if(nbr_id_from_bl == 64)
+                    nbr_id_from_bl = 32;
+
+                if(nbr_id_from_bl == 32)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags_temp >> 8) & 0x1))
+                    {
+                        nbr_id_from_bl++;
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right;  8 pels per nbr bit */
+                        //nbr_id_from_bl += idx * 8;
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref = pu1_dst[nbr_id_from_bl];
+                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+                    {
+                        pu1_dst[i] = pu1_ref;
+                    }
+                }
+            }
+
+            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+            while(nbr_id_from_bl < ((T16_4NT) + 1))
+            {
+                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
+                /* Divide by 8 to obtain the original index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T16_4NT / 2))
+                {
+                    get_bits = GET_BITS(nbr_flags_temp, 8);
+
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+                }
+                else
+                {
+                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        /* 8 pel substitution (other than TL) */
+                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+                        for(i = 0; i < 8; i++)
+                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
+                    }
+
+                }
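+                /* step past the single top-left sample by 1; every other flag group covers 8 samples */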
+                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
+            }
+
+
+        }
+
+        if(nt == 32)
+        {
+            /* compute trailing zeros in nbr_flags for the reverse substitution process starting from below-left */
+            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
+            {
+                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
+
+                if(nbr_id_from_bl == 64)
+                {
+                    /* for top left : 1 pel per nbr bit */
+                    if(!((nbr_flags >> 16) & 0x1))
+                    {
+                        /* top left not available */
+                        nbr_id_from_bl++;
+                        /* top and top right;  8 pels per nbr bit */
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
+                    }
+                }
+                /* Reverse Substitution Process*/
+                if(nbr_id_from_bl)
+                {
+                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+                    pu1_ref = pu1_dst[nbr_id_from_bl];
+                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+                        pu1_dst[i] = pu1_ref;
+                }
+            }
+
+            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+            while(nbr_id_from_bl < ((T32_4NT) + 1))
+            {
+                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
+                /* Divide by 8 to obtain the original index */
+                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+                /* The Top-left flag is at the last bit location of nbr_flags*/
+                if(nbr_id_from_bl == (T32_4NT / 2))
+                {
+                    get_bits = GET_BITS(nbr_flags, 16);
+                    /* only pel substitution for TL */
+                    if(!get_bits)
+                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+                }
+                else
+                {
+                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
+                    if(!get_bits)
+                    {
+                        /* 8 pel substitution (other than TL) */
+                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+                        for(i = 0; i < 8; i++)
+                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
+                    }
+
+                }
+                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
+            }
+        }
+
+    }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for ref_filtering
+*
+*
+* @par Description:
+*    Reference DC filtering for neighboring samples, dependent on TU size and
+*    mode. Refer to section 8.4.4.2.3 in the standard
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @param[in] strong_intra_smoothing_enable_flag
+*  integer flag that enables strong intra smoothing for 32x32 blocks
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_ref_filtering_ssse3(UWORD8 *pu1_src,
+                                          WORD32 nt,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 mode,
+                                          WORD32 strong_intra_smoothing_enable_flag)
+{
+    WORD32 filter_flag;
+    WORD32 i; /* Generic indexing variable */
+    WORD32 four_nt = 4 * nt;
+    UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1];
+    WORD32 bi_linear_int_flag = 0;
+    WORD32 abs_cond_left_flag = 0;
+    WORD32 abs_cond_top_flag = 0;
+    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
+    __m128i src_temp1, src_temp2, src_temp3, src_temp7;
+    __m128i src_temp4, src_temp5, src_temp6, src_temp8;
+
+    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
+    if(0 == filter_flag)
+    {
+        if(pu1_src == pu1_dst)
+        {
+            return;
+        }
+        else
+        {
+            if(nt == 4)
+            {
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+                pu1_dst[four_nt] = pu1_src[four_nt];
+
+            }
+
+            else if(nt == 8)
+            {
+
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+
+
+                pu1_dst[four_nt] = pu1_src[four_nt];
+            }
+            else if(nt == 16)
+            {
+
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
+                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+                pu1_dst[four_nt] = pu1_src[four_nt];
+            }
+            else if(nt == 32)
+            {
+
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
+                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));
+
+                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_src + 64));
+                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_src + 80));
+                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_src + 96));
+                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_src + 112));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);
+
+                pu1_dst[four_nt] = pu1_src[four_nt];
+            }
+
+        }
+    }
+
+    else
+    {
+        /* If strong intra smoothing is enabled and transform size is 32 */
+        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
+        {
+            /* Strong Intra Filtering */
+            abs_cond_top_flag = (abs(pu1_src[2 * nt] + pu1_src[4 * nt]
+                            - (2 * pu1_src[3 * nt]))) < dc_val;
+            abs_cond_left_flag = (abs(pu1_src[2 * nt] + pu1_src[0]
+                            - (2 * pu1_src[nt]))) < dc_val;
+
+            bi_linear_int_flag = ((1 == abs_cond_left_flag)
+                            && (1 == abs_cond_top_flag));
+        }
+        /* Extremities untouched */
+        au1_flt[0] = pu1_src[0];
+        au1_flt[4 * nt] = pu1_src[4 * nt];
+
+        /* Strong filtering of reference samples */
+        if(1 == bi_linear_int_flag)
+        {
+            au1_flt[2 * nt] = pu1_src[2 * nt];
+
+            for(i = 1; i < (2 * nt); i++)
+                au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6;
+
+            for(i = 1; i < (2 * nt); i++)
+                au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
+        }
+        else
+        {
+            __m128i const_value_8x16, zero_8x16b;
+
+            const_value_8x16 = _mm_set1_epi16(2);
+
+            au1_flt[0] = pu1_src[0];
+            au1_flt[4 * nt] = pu1_src[4 * nt];
+
+            zero_8x16b = _mm_setzero_si128();
+
+            /* [1 2 1] three-tap smoothing of the reference samples */
+            for(i = 0; i < (four_nt); i += 16)
+            {
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src + i));
+                src_temp2 = _mm_srli_si128(src_temp1, 1);
+                src_temp3 = _mm_srli_si128(src_temp2, 1);
+
+                src_temp1 = _mm_unpacklo_epi8(src_temp1, zero_8x16b);
+                src_temp2 = _mm_unpacklo_epi8(src_temp2, zero_8x16b);
+                src_temp3 = _mm_unpacklo_epi8(src_temp3, zero_8x16b);
+
+                src_temp2 = _mm_slli_epi16(src_temp2,  1);
+
+                src_temp1 = _mm_add_epi16(src_temp1, src_temp2);
+                src_temp1 = _mm_add_epi16(src_temp1, src_temp3);
+                src_temp1 = _mm_add_epi16(src_temp1, const_value_8x16);
+
+                src_temp1 = _mm_srai_epi16(src_temp1,  2);
+
+                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 8 + i));
+                src_temp5 = _mm_srli_si128(src_temp4, 1);
+                src_temp6 = _mm_srli_si128(src_temp5, 1);
+
+                src_temp4 = _mm_unpacklo_epi8(src_temp4, zero_8x16b);
+                src_temp5 = _mm_unpacklo_epi8(src_temp5, zero_8x16b);
+                src_temp6 = _mm_unpacklo_epi8(src_temp6, zero_8x16b);
+
+                src_temp5 = _mm_slli_epi16(src_temp5,  1);
+
+                src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
+                src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+                src_temp4 = _mm_add_epi16(src_temp4, const_value_8x16);
+
+                src_temp4 = _mm_srai_epi16(src_temp4,  2);
+
+                /* converting 16 bit to 8 bit */
+                src_temp1 = _mm_packus_epi16(src_temp1, src_temp4);
+
+                _mm_storeu_si128((__m128i *)(au1_flt + 1 + i), src_temp1);
+            }
+            au1_flt[4 * nt] = pu1_src[4 * nt];
+        }
+
+        if(nt == 4)
+        {
+            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+            pu1_dst[four_nt] = au1_flt[four_nt];
+        }
+        else if(nt == 8)
+        {
+
+            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+
+            pu1_dst[four_nt] = au1_flt[four_nt];
+        }
+        else if(nt == 16)
+        {
+
+            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
+            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));
+
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+            pu1_dst[four_nt] = au1_flt[four_nt];
+        }
+
+        else if(nt == 32)
+        {
+
+            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
+            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));
+
+            src_temp5 = _mm_loadu_si128((__m128i *)(au1_flt + 64));
+            src_temp6 = _mm_loadu_si128((__m128i *)(au1_flt + 80));
+            src_temp7 = _mm_loadu_si128((__m128i *)(au1_flt + 96));
+            src_temp8 = _mm_loadu_si128((__m128i *)(au1_flt + 112));
+
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);
+
+            pu1_dst[four_nt] = au1_flt[four_nt];
+        }
+
+    }
+}
+
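+/* A minimal scalar sketch of the two filters applied above, assuming 8-bit
+   samples in pu1_src[0 .. 4*nt], pu1_dst != pu1_src (the function above uses
+   the temporary au1_flt to allow in-place operation), and nt == 32 whenever
+   bi_linear_int_flag is set, the only size for which strong smoothing is
+   selected, so the >> 6 matches 2 * nt == 64. scalar_ref_filtering is a
+   hypothetical name and the sketch is not part of the decoder. */
+#if 0
+static void scalar_ref_filtering(UWORD8 *pu1_src, UWORD8 *pu1_dst,
+                                 WORD32 nt, WORD32 bi_linear_int_flag)
+{
+    WORD32 i;
+    /* extremities untouched */
+    pu1_dst[0] = pu1_src[0];
+    pu1_dst[4 * nt] = pu1_src[4 * nt];
+    if(bi_linear_int_flag)
+    {
+        /* strong smoothing: bilinear interpolation between the corner samples */
+        pu1_dst[2 * nt] = pu1_src[2 * nt];
+        for(i = 1; i < (2 * nt); i++)
+        {
+            pu1_dst[i] = (((2 * nt) - i) * pu1_src[0]
+                            + i * pu1_src[2 * nt] + 32) >> 6;
+            pu1_dst[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt]
+                            + i * pu1_src[4 * nt] + 32) >> 6;
+        }
+    }
+    else
+    {
+        /* [1 2 1] three-tap smoothing */
+        for(i = 1; i < (4 * nt); i++)
+            pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i]
+                            + pu1_src[i + 1] + 2) >> 2;
+    }
+}
+#endif
+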
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma planar
+*
+* @par Description:
+*    Planar intra prediction using the reference neighboring samples located
+*    at 'pu1_ref' to predict the TU block located at 'pu1_dst'. Refer to
+*    section 8.4.4.2.4 in the standard
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_planar_ssse3(UWORD8 *pu1_ref,
+                                        WORD32 src_strd,
+                                        UWORD8 *pu1_dst,
+                                        WORD32 dst_strd,
+                                        WORD32 nt,
+                                        WORD32 mode)
+{
+
+
+    WORD32 row, col;
+    WORD32 two_nt, three_nt;
+    UWORD16 temp;
+
+    __m128i pu1_ref_16x8b, const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+    __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b, const_temp8_4x32b;
+    __m128i nt_row_16x8b, nt_row1_16x8b, nt_row2_16x8b, nt_row3_16x8b; //nt-1-row
+    __m128i row_16x8b, row1_16x8b, row2_16x8b, row3_16x8b; //row+1
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    two_nt = 2 * nt;
+    three_nt = 3 * nt;
+
+    /* Planar filtering */
+    temp = pu1_ref[nt - 1];
+    temp = (temp << 8) | ((UWORD16)pu1_ref[three_nt + 1]);
+    /* setting values in registers */
+    pu1_ref_16x8b  = _mm_set1_epi16(temp);
+    const_temp6_4x32b = _mm_set1_epi16(nt);
+
+
+
+    if(nt == 32)
+    {
+
+
+        const_temp4_4x32b = _mm_set1_epi16(0x0400);
+        const_temp1_4x32b = _mm_set1_epi16(0x0100);
+        const_temp8_4x32b = _mm_set1_epi16(0x0008);
+        //(nt-1-y) (nt-1-x) ; x= 0..15 , y = row
+        //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
+        nt_row_16x8b = _mm_set_epi16(0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x1f1e, 0x1f1f);
+        //(y+1) (x+1) ; x= 0..15 , y = row
+        //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
+        row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);
+
+        for(row = 0; row < nt; row += 1)
+        {
+            __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
+            __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
+
+            __m128i src_temp_8x16b, src_temp1_8x16b;
+
+
+            res_temp1_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
+
+            nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b,  const_temp8_4x32b);
+            row1_16x8b    = _mm_add_epi16(row_16x8b,     const_temp8_4x32b);
+            nt_row2_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp8_4x32b);
+            row2_16x8b    = _mm_add_epi16(row1_16x8b,    const_temp8_4x32b);
+            nt_row3_16x8b = _mm_sub_epi16(nt_row2_16x8b, const_temp8_4x32b);
+            row3_16x8b    = _mm_add_epi16(row2_16x8b,    const_temp8_4x32b);
+            /* loading 16 8-bit pixels */
+            src_temp_8x16b  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+            src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17));
+
+            res_temp4_8x16b =  _mm_unpacklo_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=0*/
+            res_temp5_8x16b =  _mm_unpackhi_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=1*/
+            res_temp6_8x16b =  _mm_unpacklo_epi8(res_temp1_8x16b, src_temp1_8x16b); /* row=2*/
+            res_temp7_8x16b =  _mm_unpackhi_epi8(res_temp1_8x16b, src_temp1_8x16b); /* row=3*/
+
+            /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
+            res_temp_8x16b  = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
+            res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
+            res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
+            res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
+            /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
+            res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
+            res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
+            res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
+            res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);
+
+            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
+            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
+            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
+            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
+            /* res_temp + nt */
+            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
+            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
+            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);
+
+            res_temp_8x16b  = _mm_srli_epi16(res_temp_8x16b, 6); //log2(32)+1
+            res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 6);
+            res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 6);
+            res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 6);
+
+            res_temp_8x16b  = _mm_packus_epi16(res_temp_8x16b, res_temp1_8x16b);
+            res_temp1_8x16b = _mm_packus_epi16(res_temp2_8x16b, res_temp3_8x16b);
+
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd) + 16), res_temp1_8x16b);
+
+
+            nt_row_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp1_4x32b);
+            row_16x8b    = _mm_add_epi16(row_16x8b,    const_temp1_4x32b);
+        }
+    }
+    else if(nt == 16)
+    {
+
+        const_temp4_4x32b = _mm_set1_epi16(0x0400);
+        const_temp1_4x32b = _mm_set1_epi16(0x0100);
+        const_temp8_4x32b = _mm_set1_epi16(0x0008);
+        //(nt-1-y) (nt-1-x) ; x= 0..15 , y = row
+        //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
+        nt_row_16x8b = _mm_set_epi16(0x0f08, 0x0f09, 0x0f0a, 0x0f0b, 0x0f0c, 0x0f0d, 0x0f0e, 0x0f0f);
+        //(y+1) (x+1) ; x= 0..15 , y = row
+        //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
+        row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);
+
+        for(row = 0; row < nt; row += 2)
+        {
+            __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
+            __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
+
+            __m128i src_temp_8x16b;
+
+
+            res_temp1_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
+            res_temp2_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]);
+
+
+            nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b,  const_temp1_4x32b);
+            row1_16x8b    = _mm_add_epi16(row_16x8b,     const_temp1_4x32b);
+            nt_row2_16x8b = _mm_sub_epi16(nt_row_16x8b,  const_temp8_4x32b);
+            row2_16x8b    = _mm_add_epi16(row_16x8b,     const_temp8_4x32b);
+            nt_row3_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp8_4x32b);
+            row3_16x8b    = _mm_add_epi16(row1_16x8b,    const_temp8_4x32b);
+            /* loading 16 8-bit pixels */
+            src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+
+            res_temp4_8x16b =  _mm_unpacklo_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=0*/
+            res_temp5_8x16b =  _mm_unpacklo_epi8(res_temp2_8x16b, src_temp_8x16b); /* row=1*/
+            res_temp6_8x16b =  _mm_unpackhi_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=2*/
+            res_temp7_8x16b =  _mm_unpackhi_epi8(res_temp2_8x16b, src_temp_8x16b); /* row=3*/
+
+            /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
+            res_temp_8x16b  = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
+            res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
+            res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
+            res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
+            /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
+            res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
+            res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
+            res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
+            res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);
+
+            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
+            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
+            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
+            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
+            /* res_temp + nt */
+            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
+            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
+            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);
+
+            res_temp_8x16b  = _mm_srli_epi16(res_temp_8x16b, 5); //log2(16)+1
+            res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 5);
+            res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 5);
+            res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 5);
+
+            res_temp_8x16b  = _mm_packus_epi16(res_temp_8x16b, res_temp2_8x16b);
+            res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, res_temp3_8x16b);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), res_temp1_8x16b);
+
+            nt_row_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp1_4x32b);
+            row_16x8b    = _mm_add_epi16(row1_16x8b,    const_temp1_4x32b);
+        }
+    }
+    else if(nt == 8)
+    {
+
+
+        const_temp4_4x32b = _mm_set1_epi16(0x0400);
+        const_temp1_4x32b = _mm_set1_epi16(0x0100);
+        zero_8x16b = _mm_set1_epi32(0);
+
+        //(nt-1-y) (nt-1-x) ; x= 0..7 , y = row
+        //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
+        nt_row_16x8b = _mm_set_epi16(0x0700, 0x0701, 0x0702, 0x0703, 0x0704, 0x0705, 0x0706, 0x0707);
+        //(y+1) (x+1) ; x= 0..7 , y = row
+        //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
+        row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);
+
+        for(row = 0; row < nt; row += 4)
+        {
+            __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
+            __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
+
+            __m128i src_temp_8x16b;
+
+
+            res_temp4_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
+            res_temp5_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]);
+            res_temp6_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 3 - row]);
+            res_temp7_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 4 - row]);
+
+            nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b,  const_temp1_4x32b);
+            row1_16x8b    = _mm_add_epi16(row_16x8b,     const_temp1_4x32b);
+            nt_row2_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp1_4x32b);
+            row2_16x8b    = _mm_add_epi16(row1_16x8b,    const_temp1_4x32b);
+            nt_row3_16x8b = _mm_sub_epi16(nt_row2_16x8b, const_temp1_4x32b);
+            row3_16x8b    = _mm_add_epi16(row2_16x8b,    const_temp1_4x32b);
+            /* loading 16 8-bit pixels */
+            src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+            res_temp4_8x16b =  _mm_unpacklo_epi8(res_temp4_8x16b, src_temp_8x16b); /* row=0*/
+            res_temp5_8x16b =  _mm_unpacklo_epi8(res_temp5_8x16b, src_temp_8x16b); /* row=1*/
+            res_temp6_8x16b =  _mm_unpacklo_epi8(res_temp6_8x16b, src_temp_8x16b); /* row=2*/
+            res_temp7_8x16b =  _mm_unpacklo_epi8(res_temp7_8x16b, src_temp_8x16b); /* row=3*/
+
+            /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
+            res_temp_8x16b  = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
+            res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
+            res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
+            res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
+            /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
+            res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
+            res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
+            res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
+            res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);
+
+            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
+            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
+            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
+            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
+            /* res_temp + nt */
+            res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+            res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
+            res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
+            res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);
+
+            res_temp_8x16b  = _mm_srli_epi16(res_temp_8x16b, 4); //log2(8)+1
+            res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 4);
+            res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 4);
+            res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 4);
+
+            res_temp_8x16b  = _mm_packus_epi16(res_temp_8x16b, zero_8x16b);
+            res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
+            res_temp2_8x16b = _mm_packus_epi16(res_temp2_8x16b, zero_8x16b);
+            res_temp3_8x16b = _mm_packus_epi16(res_temp3_8x16b, zero_8x16b);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), res_temp1_8x16b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), res_temp2_8x16b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), res_temp3_8x16b);
+
+            nt_row_16x8b = _mm_sub_epi16(nt_row3_16x8b, const_temp1_4x32b);
+            row_16x8b    = _mm_add_epi16(row3_16x8b,    const_temp1_4x32b);
+        }
+    }
+    else
+    {
+
+        /* for nt multiple of 4 */
+        const_temp7_4x32b = _mm_set1_epi16(4);
+        const_temp4_4x32b = _mm_set1_epi16(nt - 1);
+        const_temp_4x32b  = _mm_set1_epi16(pu1_ref[three_nt + 1]);
+        const_temp1_4x32b = _mm_set1_epi16(pu1_ref[nt - 1]);
+        zero_8x16b = _mm_set1_epi32(0);
+
+        for(row = 0; row < nt; row++)
+        {
+            __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
+            __m128i res_temp3_8x16b;
+
+            const_temp2_4x32b  = _mm_set1_epi16(pu1_ref[two_nt - 1 - row]);
+            const_temp3_4x32b  = _mm_set1_epi16((row + 1));
+
+
+            row_8x16b = _mm_set1_epi16((nt - 1 - row));
+
+            const_temp5_4x32b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+            col_8x16b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+            const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);
+
+            /*(row + 1) * pu1_ref[nt - 1]*/
+            res_temp_8x16b  = _mm_mullo_epi16(const_temp3_4x32b,  const_temp1_4x32b);
+
+            /*(row + 1) * pu1_ref[nt - 1] + nt)*/
+            res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+
+            for(col = 0; col < nt; col += 4)
+            {
+                __m128i src_temp_8x16b;
+                int temp1;
+
+                /* loading 16 8-bit pixels */
+                src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + col));
+
+                src_temp_8x16b =  _mm_unpacklo_epi8(src_temp_8x16b, zero_8x16b); /* row=0*/
+
+                /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
+                res_temp1_8x16b  = _mm_mullo_epi16(src_temp_8x16b,  row_8x16b);
+
+                /*(col + 1) * pu1_ref[three_nt + 1]*/
+                res_temp2_8x16b  = _mm_mullo_epi16(const_temp_4x32b,  col_8x16b);
+
+                /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
+                res_temp3_8x16b  = _mm_mullo_epi16(const_temp2_4x32b,  const_temp5_4x32b);
+
+                res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
+                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);
+
+                res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 3); //log2(4)+1
+                res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
+
+                temp1 = _mm_cvtsi128_si32(res_temp1_8x16b);
+
+                *(WORD32 *)(&pu1_dst[(row * dst_strd) + col]) = temp1;
+
+                const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
+                col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
+            } /* inner loop ends here */
+        }
+    }
+
+
+}
+
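+/* A minimal scalar sketch of the planar equation the vector loops above
+   evaluate, matching the operand comments in those loops. scalar_planar is a
+   hypothetical name and the sketch is not part of the decoder. */
+#if 0
+static void scalar_planar(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                          WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col;
+    WORD32 two_nt = 2 * nt, three_nt = 3 * nt;
+    WORD32 shift = CTZ(nt) + 1; /* log2(nt) + 1 */
+    for(row = 0; row < nt; row++)
+    {
+        for(col = 0; col < nt; col++)
+        {
+            pu1_dst[(row * dst_strd) + col] = (UWORD8)(
+                ((nt - 1 - col) * pu1_ref[two_nt - 1 - row]   /* left        */
+               + (nt - 1 - row) * pu1_ref[two_nt + 1 + col]   /* top         */
+               + (row + 1) * pu1_ref[nt - 1]                  /* bottom-left */
+               + (col + 1) * pu1_ref[three_nt + 1]            /* top-right   */
+               + nt) >> shift);
+        }
+    }
+}
+#endif
+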
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma dc
+*
+* @par Description:
+*   Intra prediction for DC mode using the reference neighboring samples
+*   located at 'pu1_ref' to predict the TU block located at 'pu1_dst'. Refer
+*   to section 8.4.4.2.5 in the standard
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_dc_ssse3(UWORD8 *pu1_ref,
+                                    WORD32 src_strd,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 dst_strd,
+                                    WORD32 nt,
+                                    WORD32 mode)
+{
+
+    WORD32 acc_dc;
+    WORD32 dc_val, two_dc_val, three_dc_val;
+    WORD32 row;
+    WORD32 log2nt = 5;
+    WORD32 two_nt, three_nt;
+    __m128i src_temp1, src_temp7, src_temp3, src_temp4, src_temp5, src_temp6;
+    __m128i src_temp8, src_temp10, src_temp2;
+    __m128i m_zero = _mm_setzero_si128();
+    __m128i sm = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK5[0]);
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+
+    switch(nt)
+    {
+        case 32:
+            log2nt = 5;
+            break;
+        case 16:
+            log2nt = 4;
+            break;
+        case 8:
+            log2nt = 3;
+            break;
+        case 4:
+            log2nt = 2;
+            break;
+        default:
+            break;
+    }
+    two_nt = 2 * nt;
+    three_nt = 3 * nt;
+
+    acc_dc = 0;
+    /* Calculate DC value for the transform block */
+
+
+
+    if(nt == 32)
+    {
+        __m128i temp;
+        WORD32 itr_count;
+
+        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
+        src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
+        src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));
+
+        src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
+        src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
+        src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
+        src_temp8 = _mm_sad_epu8(src_temp8, m_zero);
+
+        src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
+        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
+        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
+
+        src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+        acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+        acc_dc += pu1_ref[three_nt];
+        acc_dc -= pu1_ref[two_nt];
+
+        /* compute dc_val from acc_dc */
+        dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+        two_dc_val = 2 * dc_val;
+        three_dc_val = 3 * dc_val;
+
+        temp = _mm_set1_epi8(dc_val);
+
+        for(itr_count = 0; itr_count < 2; itr_count++)
+        {
+            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp);
+
+            pu1_dst += 16 * dst_strd;
+        }
+    }
+    else
+
+    {
+        __m128i sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);
+
+        /* DC filtering for the first top row and first left column */
+
+
+
+        if(nt == 4)
+        {
+            WORD32 temp1, temp2, temp3;
+
+            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+            src_temp2 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+            src_temp4 =  _mm_unpacklo_epi8(src_temp3, m_zero);
+            src_temp2 =  _mm_unpacklo_epi8(src_temp2, m_zero);
+
+            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+            acc_dc = _mm_cvtsi128_si32(src_temp4);
+            acc_dc += pu1_ref[three_nt];
+            acc_dc -= pu1_ref[two_nt];
+
+            /* compute dc_val from acc_dc */
+
+            dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+            three_dc_val = 3 * dc_val;
+
+            /* broadcast three_dc_val + 2 to all lanes */
+            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+            two_dc_val = 2 * dc_val;
+
+            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
+            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+
+            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2 */
+            src_temp2 = _mm_srli_epi16(src_temp2, 2);
+
+            src_temp2 = _mm_packus_epi16(src_temp2, m_zero);
+
+            temp1 = _mm_cvtsi128_si32(src_temp2);
+
+            *(WORD32 *)(&pu1_dst[0]) = temp1;
+
+            src_temp2 = _mm_insert_epi16(src_temp2, dc_val, 0);
+
+            src_temp2 =  _mm_shuffle_epi8(src_temp2, sm1);
+            src_temp3 =  _mm_shuffle_epi8(src_temp2, sm1);
+            src_temp4 =  _mm_shuffle_epi8(src_temp2, sm1);
+
+            temp1 = _mm_cvtsi128_si32(src_temp2);
+            temp2 = _mm_cvtsi128_si32(src_temp3);
+            temp3 = _mm_cvtsi128_si32(src_temp4);
+
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;
+
+            /* restore first value */
+            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+                            >> 2);
+
+            for(row = 1; row < nt; row++)
+                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+                                >> 2;
+
+        }
+        else if(nt == 8)
+        {
+
+            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+
+            src_temp4 = _mm_sad_epu8(src_temp3, m_zero);
+            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+            acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+            acc_dc += pu1_ref[three_nt];
+            acc_dc -= pu1_ref[two_nt];
+
+            /* compute dc_val from acc_dc */
+
+            dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+            three_dc_val = 3 * dc_val;
+            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+            two_dc_val = 2 * dc_val;
+
+            /* loading 16 8-bit pixels */
+            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+            src_temp2 =  _mm_unpacklo_epi8(src_temp2, m_zero);
+
+            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
+            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+
+            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
+            src_temp2 = _mm_srli_epi16(src_temp2, 2);
+            src_temp2 = _mm_packus_epi16(src_temp2, m_zero);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp2);
+
+            /* Fill the remaining rows with DC value*/
+
+            src_temp1 = _mm_set1_epi8(dc_val);
+            src_temp2 = _mm_set1_epi8(dc_val);
+            src_temp3 = _mm_set1_epi8(dc_val);
+            src_temp4 = _mm_set1_epi8(dc_val);
+            src_temp5 = _mm_set1_epi8(dc_val);
+            src_temp6 = _mm_set1_epi8(dc_val);
+            src_temp7 = _mm_set1_epi8(dc_val);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+
+            /* restore first value */
+            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+                            >> 2);
+
+            for(row = 1; row < nt; row++)
+                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+                                >> 2;
+
+        }
+        else /* nt == 16 */
+        {
+
+            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
+
+            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+            src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
+
+            src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
+            src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
+
+            src_temp2  =  _mm_unpacklo_epi8(src_temp2, m_zero);
+            src_temp10 =  _mm_unpacklo_epi8(src_temp10, m_zero);
+
+            src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
+            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+            acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+            acc_dc += pu1_ref[three_nt];
+            acc_dc -= pu1_ref[two_nt];
+
+            /* compute dc_val from acc_dc */
+
+            dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+            three_dc_val = 3 * dc_val;
+            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+            two_dc_val = 2 * dc_val;
+
+            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
+            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+            src_temp10 = _mm_add_epi16(src_temp10, src_temp1);
+            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
+            src_temp2 = _mm_srli_epi16(src_temp2, 2);
+            src_temp10 = _mm_srli_epi16(src_temp10, 2);
+
+            src_temp2 = _mm_packus_epi16(src_temp2, src_temp10);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);
+
+            /* Fill the remaining rows with DC value*/
+            src_temp1 =  _mm_set1_epi8(dc_val);
+            src_temp2 =  _mm_set1_epi8(dc_val);
+            src_temp3 =  _mm_set1_epi8(dc_val);
+            src_temp4 =  _mm_set1_epi8(dc_val);
+            src_temp5 =  _mm_set1_epi8(dc_val);
+            src_temp6 =  _mm_set1_epi8(dc_val);
+            src_temp7 =  _mm_set1_epi8(dc_val);
+
+            for(row = 1; row < nt; row += 8)
+            {
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp3);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp7);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp1);
+
+            }
+
+            /* restore first value */
+            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+                            >> 2);
+
+            for(row = 1; row < nt; row++)
+                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+                                >> 2;
+
+        }
+    }
+}
+
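+/* A minimal scalar sketch of the DC derivation and boundary filtering done
+   above; assumes nt < 32, since the nt == 32 path above skips the first
+   row/column filtering. scalar_dc is a hypothetical name and the sketch is
+   not part of the decoder. */
+#if 0
+static void scalar_dc(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                      WORD32 dst_strd, WORD32 nt, WORD32 log2nt)
+{
+    WORD32 row, col, acc_dc = 0, dc_val;
+    WORD32 two_nt = 2 * nt, three_nt = 3 * nt;
+    /* sum pu1_ref[nt .. three_nt - 1], then swap pu1_ref[three_nt] in for
+       pu1_ref[two_nt], exactly as the SAD-based accumulation above does */
+    for(col = nt; col < three_nt; col++)
+        acc_dc += pu1_ref[col];
+    acc_dc += pu1_ref[three_nt] - pu1_ref[two_nt];
+    dc_val = (acc_dc + nt) >> (log2nt + 1);
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[(row * dst_strd) + col] = (UWORD8)dc_val;
+    /* DC filtering of the first row and first column */
+    pu1_dst[0] = (UWORD8)((pu1_ref[two_nt - 1] + 2 * dc_val
+                    + pu1_ref[two_nt + 1] + 2) >> 2);
+    for(col = 1; col < nt; col++)
+        pu1_dst[col] = (UWORD8)((pu1_ref[two_nt + 1 + col]
+                        + 3 * dc_val + 2) >> 2);
+    for(row = 1; row < nt; row++)
+        pu1_dst[row * dst_strd] = (UWORD8)((pu1_ref[two_nt - 1 - row]
+                        + 3 * dc_val + 2) >> 2);
+}
+#endif
+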
+/**
+*******************************************************************************
+*
+* @brief
+*     Intra prediction interpolation filter for luma horizontal mode.
+*
+* @par Description:
+*      Horizontal intra prediction (mode 10) using the reference samples
+*      located at 'pu1_ref' to predict the TU block located at 'pu1_dst'.
+*      Refer to section 8.4.4.2.6 in the standard (special case)
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_horz_ssse3(UWORD8 *pu1_ref,
+                                      WORD32 src_strd,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 dst_strd,
+                                      WORD32 nt,
+                                      WORD32 mode)
+{
+
+    WORD32 row;
+    WORD32 two_nt;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+    two_nt = 2 * nt;
+
+
+    if(nt == 32)
+    {
+        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
+        __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+        __m128i sm = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);
+
+        for(row = 0; row < nt; row += 16)
+        {
+            {
+                src_temp1 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 - row - 15));
+
+                src_temp2 =  _mm_srli_si128(src_temp1, 1);
+                src_temp3 =  _mm_srli_si128(src_temp1, 2);
+                src_temp4 =  _mm_srli_si128(src_temp1, 3);
+                src_temp5 =  _mm_srli_si128(src_temp1, 4);
+                src_temp6 =  _mm_srli_si128(src_temp1, 5);
+                src_temp7 =  _mm_srli_si128(src_temp1, 6);
+                src_temp8 =  _mm_srli_si128(src_temp1, 7);
+
+                src_temp9 =  _mm_srli_si128(src_temp1, 8);
+                src_temp10 =  _mm_srli_si128(src_temp1, 9);
+                src_temp11 =  _mm_srli_si128(src_temp1, 10);
+                src_temp12 =  _mm_srli_si128(src_temp1, 11);
+                src_temp13 =  _mm_srli_si128(src_temp1, 12);
+                src_temp14 =  _mm_srli_si128(src_temp1, 13);
+                src_temp15 =  _mm_srli_si128(src_temp1, 14);
+                src_temp16 =  _mm_srli_si128(src_temp1, 15);
+
+                src_temp8 =  _mm_shuffle_epi8(src_temp8, sm);
+                src_temp7 =  _mm_shuffle_epi8(src_temp7, sm);
+                src_temp6 =  _mm_shuffle_epi8(src_temp6, sm);
+                src_temp5 =  _mm_shuffle_epi8(src_temp5, sm);
+                src_temp4 =  _mm_shuffle_epi8(src_temp4, sm);
+                src_temp3 =  _mm_shuffle_epi8(src_temp3, sm);
+                src_temp2 =  _mm_shuffle_epi8(src_temp2, sm);
+                src_temp1 =  _mm_shuffle_epi8(src_temp1, sm);
+
+                src_temp16 =  _mm_shuffle_epi8(src_temp16, sm);
+                src_temp15 =  _mm_shuffle_epi8(src_temp15, sm);
+                src_temp14 =  _mm_shuffle_epi8(src_temp14, sm);
+                src_temp13 =  _mm_shuffle_epi8(src_temp13, sm);
+                src_temp12 =  _mm_shuffle_epi8(src_temp12, sm);
+                src_temp11 =  _mm_shuffle_epi8(src_temp11, sm);
+                src_temp10 =  _mm_shuffle_epi8(src_temp10, sm);
+                src_temp9 =  _mm_shuffle_epi8(src_temp9, sm);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp16);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp15);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp14);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp13);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp12);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp11);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp10);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp9);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 8) * dst_strd)), src_temp8);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 9) * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 10) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 11) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 12) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 13) * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 14) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 15) * dst_strd)), src_temp1);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 0) * dst_strd)), src_temp16);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 1) * dst_strd)), src_temp15);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 2) * dst_strd)), src_temp14);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 3) * dst_strd)), src_temp13);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 4) * dst_strd)), src_temp12);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 5) * dst_strd)), src_temp11);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 6) * dst_strd)), src_temp10);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 7) * dst_strd)), src_temp9);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 8) * dst_strd)), src_temp8);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 9) * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 10) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 11) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 12) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 13) * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 14) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 15) * dst_strd)), src_temp1);
+
+            }
+
+        }
+
+    }
+    else
+
+    {
+        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6;
+        __m128i src_temp10, zero_8x16b, src_temp7;
+
+        zero_8x16b = _mm_set1_epi16(0);
+
+        /* Gradient filtering for the 1st row */
+
+        src_temp2 =  _mm_set1_epi16(pu1_ref[two_nt - 1]);
+        src_temp10 =  _mm_set1_epi16(pu1_ref[two_nt]);
+
+        /* loading 16 8-bit pixels */
+        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+        src_temp4 =  _mm_unpacklo_epi8(src_temp4, zero_8x16b);
+
+        /*(pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt])*/
+        src_temp3 = _mm_sub_epi16(src_temp4, src_temp10);
+
+        /* ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
+        src_temp3 = _mm_srai_epi16(src_temp3, 1);
+
+        /* pu1_ref[two_nt - 1]+((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
+        src_temp3 = _mm_add_epi16(src_temp2, src_temp3);
+
+        if(nt == 4)
+        {
+            int temp1, temp2, temp3;
+            src_temp3 = _mm_packus_epi16(src_temp3, zero_8x16b);
+            temp1 = _mm_cvtsi128_si32(src_temp3);
+
+            *(WORD32 *)(&pu1_dst[0]) = temp1;
+
+            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
+            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
+            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
+
+            temp1 = _mm_cvtsi128_si32(src_temp2);
+            temp2 = _mm_cvtsi128_si32(src_temp3);
+            temp3 = _mm_cvtsi128_si32(src_temp4);
+
+            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;
+
+        }
+        else if(nt == 8)
+        {
+            src_temp10 = _mm_packus_epi16(src_temp3, zero_8x16b);
+
+
+            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
+            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
+            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
+            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 5]);
+            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 6]);
+            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 7]);
+            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 8]);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp10);
+
+            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp3);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp5);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp6);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp7);
+
+        }
+        else if(nt == 16)
+        {
+            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
+            src_temp4 =  _mm_unpacklo_epi8(src_temp4, zero_8x16b);
+            //src_temp4 =  _mm_cvtepu8_epi16 (src_temp4);
+
+            src_temp10 = _mm_sub_epi16(src_temp4, src_temp10);
+            src_temp10 = _mm_srai_epi16(src_temp10, 1);
+            src_temp10 = _mm_add_epi16(src_temp2, src_temp10);
+
+            src_temp3 = _mm_packus_epi16(src_temp3, src_temp10);
+            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp3);
+
+            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
+            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
+            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
+            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 5]);
+            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 6]);
+            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 7]);
+            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 8]);
+            src_temp10 =  _mm_set1_epi8(pu1_ref[two_nt - 9]);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp10);
+
+            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 10]);
+            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 11]);
+            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 12]);
+            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 13]);
+            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 14]);
+            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 15]);
+            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 16]);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp3);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp4);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp5);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp6);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp7);
+
+        }
+    }
+}
+
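+/* A minimal scalar sketch of horizontal prediction as implemented above:
+   each row is a replication of one left reference sample, and for nt < 32
+   the first row additionally gets a gradient filter. scalar_horz is a
+   hypothetical name and the sketch is not part of the decoder. */
+#if 0
+static void scalar_horz(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                        WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col, two_nt = 2 * nt;
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];
+    if(nt < 32)
+    {
+        /* first-row gradient filter; the >> 1 plus clamp matches the
+           _mm_srai/_mm_packus sequence above */
+        for(col = 0; col < nt; col++)
+        {
+            WORD32 val = pu1_ref[two_nt - 1]
+                       + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1);
+            if(val < 0)
+                val = 0;
+            else if(val > 255)
+                val = 255;
+            pu1_dst[col] = (UWORD8)val;
+        }
+    }
+}
+#endif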
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Intra prediction interpolation filter for luma vertical mode.
+*
+* @par Description:
+*    Vertical intra prediction using the reference neighboring samples located
+*    at 'pu1_ref' to predict the TU block located at 'pu1_dst'. Refer to
+*    section 8.4.4.2.6 in the standard (special case)
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
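+
+/* A scalar, non-normative sketch of what this SIMD path computes (names
+ * follow this file's conventions):
+ *
+ *     for(row = 0; row < nt; row++)
+ *         for(col = 0; col < nt; col++)
+ *             pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col];
+ *
+ * followed by the first-column filtering done at the end of the function.
+ */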
+
+
+void ihevc_intra_pred_luma_ver_ssse3(UWORD8 *pu1_ref,
+                                     WORD32 src_strd,
+                                     UWORD8 *pu1_dst,
+                                     WORD32 dst_strd,
+                                     WORD32 nt,
+                                     WORD32 mode)
+{
+    WORD32 row;
+    WORD16 s2_predpixel;
+    WORD32 two_nt = 2 * nt;
+    __m128i src_temp0, src_temp2;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+
+    if(nt == 32)
+    {
+        __m128i temp1, temp2;
+        WORD32 itr_count;
+
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+        temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
+
+        for(itr_count = 0; itr_count < 2; itr_count++)
+        {
+            /* pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col]; */
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
+            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
+
+            pu1_dst += 16 * dst_strd;
+        }
+    }
+    else
+    {
+        /* Replication to next columns*/
+
+        if(nt == 4)
+        {
+            int temp1;
+
+            src_temp2 =   _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+            temp1 = _mm_cvtsi128_si32(src_temp2);
+
+            /* storing four 8-bit pixels per row */
+            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp1;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp1;
+
+        }
+        else if(nt == 8)
+        {
+
+            src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp0);
+            _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp0);
+
+
+        }
+        else if(nt == 16)
+        {
+            for(row = 0; row < nt; row += 8)
+            {
+
+                src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp0);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp0);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp0);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp0);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp0);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp0);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp0);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp0);
+
+            }
+
+        }
+
+        /*Filtering done for the 1st column */
+        for(row = nt - 1; row >= 0; row--)
+        {
+            s2_predpixel = pu1_ref[two_nt + 1]
+                            + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
+            pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel);
+        }
+
+
+    }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*     Intra prediction interpolation filter for luma mode 2.
+*
+* @par Description:
+*    Intra prediction for mode 2 (SW angle) with reference neighboring
+*    samples, located at 'pu1_ref', applied to the TU block located at
+*    'pu1_dst'. Refer to section 8.4.4.2.6 in the standard
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the source (reference samples)
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
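+
+/* A scalar, non-normative sketch of mode 2 (the 45 degree, bottom-left
+ * diagonal) that the shuffles below reproduce:
+ *
+ *     for(row = 0; row < nt; row++)
+ *         for(col = 0; col < nt; col++)
+ *             pu1_dst[(row * dst_strd) + col] =
+ *                 pu1_ref[two_nt - row - (col + 1) - 1];
+ */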
+
+void ihevc_intra_pred_luma_mode2_ssse3(UWORD8 *pu1_ref,
+                                       WORD32 src_strd,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 dst_strd,
+                                       WORD32 nt,
+                                       WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 two_nt = 2 * nt;
+
+    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
+    __m128i   sm1, sm2, sm3;
+    UNUSED(src_strd);
+    UNUSED(mode);
+
+
+    sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY1[0]);
+    sm2 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY2[0]);
+    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY3[0]);
+
+    /* For the 45 degree angle (mode 2), samples are replicated along the */
+    /* corresponding diagonal */
+    /* intra_pred_ang = tan(angle) in Q5 format */
+
+    if(nt == 4)
+    {
+        int temp1, temp2, temp3, temp4;
+
+        /*pu1_ref[two_nt - row - (col+1) - 1]*/
+        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 8));
+        src_temp2 = _mm_srli_si128(src_temp1, 1);
+        src_temp3 = _mm_srli_si128(src_temp1, 2);
+        src_temp4 = _mm_srli_si128(src_temp1, 3);
+
+        src_temp4 = _mm_shuffle_epi8(src_temp4, sm1);
+        src_temp3 = _mm_shuffle_epi8(src_temp3, sm1);
+        src_temp2 = _mm_shuffle_epi8(src_temp2, sm1);
+        src_temp1 = _mm_shuffle_epi8(src_temp1, sm1);
+
+        temp1 = _mm_cvtsi128_si32(src_temp4);
+        temp2 = _mm_cvtsi128_si32(src_temp3);
+        temp3 = _mm_cvtsi128_si32(src_temp2);
+        temp4 = _mm_cvtsi128_si32(src_temp1);
+
+        /* pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - row - (col + 1) - 1]; */
+        *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
+        *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
+        *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
+        *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;
+
+
+    }
+    else if(nt == 8)
+    {
+        /*pu1_ref[two_nt - row - (col+1) - 1]*/
+        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16));
+        src_temp2 = _mm_srli_si128(src_temp1, 1);
+        src_temp3 = _mm_srli_si128(src_temp1, 2);
+        src_temp4 = _mm_srli_si128(src_temp1, 3);
+        src_temp5 = _mm_srli_si128(src_temp1, 4);
+        src_temp6 = _mm_srli_si128(src_temp1, 5);
+        src_temp7 = _mm_srli_si128(src_temp1, 6);
+        src_temp8 = _mm_srli_si128(src_temp1, 7);
+
+        src_temp1 = _mm_shuffle_epi8(src_temp1, sm2);
+        src_temp2 = _mm_shuffle_epi8(src_temp2, sm2);
+        src_temp3 = _mm_shuffle_epi8(src_temp3, sm2);
+        src_temp4 = _mm_shuffle_epi8(src_temp4, sm2);
+        src_temp5 = _mm_shuffle_epi8(src_temp5, sm2);
+        src_temp6 = _mm_shuffle_epi8(src_temp6, sm2);
+        src_temp7 = _mm_shuffle_epi8(src_temp7, sm2);
+        src_temp8 = _mm_shuffle_epi8(src_temp8, sm2);
+
+        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp8);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp7);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp6);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp5);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp3);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp2);
+        _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+
+    }
+    else
+    {
+        for(row = 0; row < nt; row += 8)
+        {
+            for(col = 0; col < nt; col += 16)
+            {   /*pu1_ref[two_nt - row - (col+1) - 1]*/
+
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0) - (col + 16) - 1));
+                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1) - (col + 16) - 1));
+                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2) - (col + 16) - 1));
+                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3) - (col + 16) - 1));
+                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4) - (col + 16) - 1));
+                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5) - (col + 16) - 1));
+                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6) - (col + 16) - 1));
+                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7) - (col + 16) - 1));
+
+                src_temp1 = _mm_shuffle_epi8(src_temp1, sm3);
+                src_temp2 = _mm_shuffle_epi8(src_temp2, sm3);
+                src_temp3 = _mm_shuffle_epi8(src_temp3, sm3);
+                src_temp4 = _mm_shuffle_epi8(src_temp4, sm3);
+                src_temp5 = _mm_shuffle_epi8(src_temp5, sm3);
+                src_temp6 = _mm_shuffle_epi8(src_temp6, sm3);
+                src_temp7 = _mm_shuffle_epi8(src_temp7, sm3);
+                src_temp8 = _mm_shuffle_epi8(src_temp8, sm3);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 0) * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 1) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 2) * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 3) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 4) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 5) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 6) * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 7) * dst_strd)), src_temp8);
+            }
+        }
+    }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma mode 18 & mode 34.
+*
+* @par Description:
+*    Intra prediction for mode 34 (NE angle) and mode 18 (NW angle) with
+*    reference neighboring samples, located at 'pu1_ref', applied to the TU
+*    block located at 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the source (reference samples)
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
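+
+/* Both modes copy the reference along a +/-45 degree diagonal. A scalar
+ * sketch consistent with the loads below (illustrative only):
+ *
+ *     mode 34: pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + row + col + 2];
+ *     mode 18: pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - row + col];
+ */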
+
+void ihevc_intra_pred_luma_mode_18_34_ssse3(UWORD8 *pu1_ref,
+                                            WORD32 src_strd,
+                                            UWORD8 *pu1_dst,
+                                            WORD32 dst_strd,
+                                            WORD32 nt,
+                                            WORD32 mode)
+{
+    WORD32 row;
+    WORD32 two_nt = 2 * nt;
+    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
+    UNUSED(src_strd);
+    if(mode == 34)
+    {
+        if(nt == 4)
+        {
+
+            int temp1, temp2, temp3, temp4;
+
+            /*pu1_ref[two_nt + col + idx + 1]*/
+            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
+            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
+            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
+            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));
+
+            temp1 = _mm_cvtsi128_si32(src_temp1);
+            temp2 = _mm_cvtsi128_si32(src_temp2);
+            temp3 = _mm_cvtsi128_si32(src_temp3);
+            temp4 = _mm_cvtsi128_si32(src_temp4);
+
+            /* pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + row + col + 2]; */
+            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;
+
+        }
+        else if(nt == 8)
+        {
+            /*pu1_ref[two_nt + col + idx + 1]*/
+            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
+            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
+            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
+            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));
+            src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 6));
+            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 7));
+            src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 8));
+            src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 9));
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
+
+        }
+        else if(nt == 16)
+        {
+            for(row = 0; row < nt; row += 8)
+            {
+                /*pu1_ref[two_nt + col + idx + 1]*/
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 0) + 2));
+                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 1) + 2));
+                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 2) + 2));
+                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 3) + 2));
+                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 4) + 2));
+                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 5) + 2));
+                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 6) + 2));
+                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 7) + 2));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp8);
+
+
+            }
+        }
+        else
+        {
+            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+            for(row = 0; row < nt; row += 8)
+            {
+                /*pu1_ref[two_nt + col + idx + 1]*/
+                src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 0) + 2));
+                src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 16) + 2));
+                src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 0) + 2));
+                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 16) + 2));
+                src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 0) + 2));
+                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 16) + 2));
+                src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 0) + 2));
+                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 16) + 2));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
+
+                src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 0) + 2));
+                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 16) + 2));
+                src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 0) + 2));
+                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 16) + 2));
+                src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 0) + 2));
+                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 16) + 2));
+                src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 0) + 2));
+                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 16) + 2));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
+
+                pu1_ref += 8;
+                pu1_dst += 8 * dst_strd;
+            }
+        }
+    }
+    else
+    {
+        if(nt == 4)
+        {
+            int temp1, temp2, temp3, temp4;
+
+            /*pu1_ref[two_nt + col + idx + 1]*/
+            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3));
+            src_temp2 = _mm_srli_si128(src_temp1, 1);
+            src_temp3 = _mm_srli_si128(src_temp1, 2);
+            src_temp4 = _mm_srli_si128(src_temp1, 3);
+
+            temp1 = _mm_cvtsi128_si32(src_temp4);
+            temp2 = _mm_cvtsi128_si32(src_temp3);
+            temp3 = _mm_cvtsi128_si32(src_temp2);
+            temp4 = _mm_cvtsi128_si32(src_temp1);
+
+            /* pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - row + col]; */
+            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;
+
+        }
+        else if(nt == 8)
+        {
+            /*pu1_ref[two_nt + col + idx + 1]*/
+            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7));
+            src_temp2 = _mm_srli_si128(src_temp1, 1);
+            src_temp3 = _mm_srli_si128(src_temp1, 2);
+            src_temp4 = _mm_srli_si128(src_temp1, 3);
+            src_temp5 = _mm_srli_si128(src_temp1, 4);
+            src_temp6 = _mm_srli_si128(src_temp1, 5);
+            src_temp7 = _mm_srli_si128(src_temp1, 6);
+            src_temp8 = _mm_srli_si128(src_temp1, 7);
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp8);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp7);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp6);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp5);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp3);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp2);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+
+
+        }
+        else if(nt == 16)
+        {
+            for(row = 0; row < nt; row += 8)
+            {
+                /*pu1_ref[two_nt + col + idx + 1]*/
+                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0)));
+                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1)));
+                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2)));
+                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3)));
+                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4)));
+                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5)));
+                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6)));
+                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7)));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp8);
+
+            }
+
+        }
+        else
+        {
+            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+            for(row = 0; row < nt; row += 8)
+            {
+                /*pu1_ref[two_nt + col + idx + 1]*/
+                src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 0));
+                src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 16));
+                src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 0));
+                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 16));
+                src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 0));
+                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 16));
+                src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 0));
+                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 16));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
+
+                src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 0));
+                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 16));
+                src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 0));
+                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 16));
+                src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 0));
+                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 16));
+                src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 0));
+                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 16));
+
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
+                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
+                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
+
+                pu1_ref -= 8;
+                pu1_dst += 8 * dst_strd;
+            }
+        }
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma mode 3 to mode 9
+*
+* @par Description:
+*    Intra prediction for modes 3 to 9 (positive angle, horizontal modes) with
+*    reference neighboring samples, located at 'pu1_ref', applied to the TU
+*    block located at 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the source (reference samples)
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
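+
+/* A scalar, non-normative sketch of the positive-angle horizontal modes
+ * (3 to 9); the main reference runs along the left column, so row and
+ * column roles are swapped relative to the vertical modes. The exact
+ * offsets in the SIMD body below are authoritative:
+ *
+ *     for(col = 0; col < nt; col++)
+ *     {
+ *         pos   = (col + 1) * intra_pred_ang;
+ *         idx   = pos >> 5;
+ *         fract = pos & 31;
+ *         for(row = 0; row < nt; row++)
+ *             pu1_dst[(row * dst_strd) + col] =
+ *                 ((32 - fract) * pu1_ref[two_nt - row - idx - 1]
+ *                  + fract * pu1_ref[two_nt - row - idx - 2] + 16) >> 5;
+ *     }
+ */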
+
+void ihevc_intra_pred_luma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
+                                             WORD32 src_strd,
+                                             UWORD8 *pu1_dst,
+                                             WORD32 dst_strd,
+                                             WORD32 nt,
+                                             WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 two_nt = 2 * nt;
+    WORD32 intra_pred_ang;
+
+
+    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b, zero_8x16b;
+    __m128i fract_4x32b, intra_pred_ang_4x32b;
+    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
+    UNUSED(src_strd);
+
+    /* Intra Pred Angle according to the mode */
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, interpolation between two neighboring */
+    /* samples, weighted by distance, gives the destination sample */
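+
+    /* Note: the weights (32 - fract) and fract are packed into adjacent
+     * bytes below so that a single _mm_maddubs_epi16 evaluates
+     * (32 - fract) * ref[i] + fract * ref[i'] in one 16-bit lane, where
+     * i' is the neighboring reference sample; the rounding add of 16 and
+     * the >> 5 are applied afterwards. */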
+
+    const_temp_4x32b  = _mm_set1_epi16(16);
+    const_temp2_4x32b = _mm_set1_epi32(31);
+    const_temp3_4x32b = _mm_set1_epi32(32);
+    const_temp4_4x32b = _mm_set1_epi32(4);
+
+    two_nt_4x32b = _mm_set1_epi32(two_nt - nt);
+
+
+    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+    row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+    if(nt == 4)
+    {
+
+        WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+        int temp11, temp21, temp31, temp41;
+        // WORD8  ai1_fract_temp_val[16], ai1_row_temp_val[16];
+
+        __m128i fract1_8x16b, fract2_8x16b, sign_8x16b;
+        __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+        __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+        __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b; //, src_temp8_8x16b;
+        __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        /* pos = ((row + 1) * intra_pred_ang); */
+        res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+        zero_8x16b      = _mm_setzero_si128();
+        sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+        res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
+
+        /* fract = pos & (31); */
+        fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+        /* idx = pos >> 5; */
+        ref_main_idx_4x32b = _mm_sub_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
+
+        /*(32 - fract) */
+        row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
+
+        fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+        fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
+
+        fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+        row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
+
+        fract2_8x16b = _mm_unpackhi_epi8(row_4x32b, fract_4x32b);
+        fract1_8x16b = _mm_unpacklo_epi8(row_4x32b, fract_4x32b);
+
+        temp1_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+        temp2_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+        temp3_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+        temp4_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+        ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4);  /* next 32 bit values */
+        ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8);  /* next 32 bit values */
+        ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
+        ref_main_idx1  = _mm_cvtsi128_si32(ref_main_idx_4x32b);    /* col=0*/
+        ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* col=1*/
+        ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* col=2*/
+        ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* col=3*/
+
+        /* loading 16 8-bit pixels */
+        src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 - 1)); /* col=0*/
+        src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2 - 1)); /* col=1*/
+        src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3 - 1)); /* col=2*/
+        src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4 - 1)); /* col=3*/
+
+        src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+        src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+        src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+        src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+        /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+        src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+        src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+        src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+        src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+        src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+        src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+        src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+        src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+        src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+        src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+        /* converting 16 bit to 8 bit */
+        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+        src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+
+        src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+        src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+        src_temp3_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+        src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 4);
+        src_temp1_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+        src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 12);
+
+        temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
+        temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
+        temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
+        temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
+
+        /* storing four 8-bit pixels per row */
+        *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+        *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+        *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+        *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+    }
+
+    else if(nt == 16 || nt == 32)
+    {
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        two_nt_4x32b = _mm_set1_epi16(two_nt);
+
+        for(col = 0; col < nt; col += 8)
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+            //WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+            fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
+            fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
+
+            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
+            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
+            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
+            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
+            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
+            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+            /* idx = pos >> 5; */
+            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
+
+            for(row = 0; row < nt; row += 8)
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1 - (8 + row))); /* col=0*/
+                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1 - (8 + row))); /* col=1*/
+                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1 - (8 + row))); /* col=2*/
+                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1 - (8 + row))); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1 - (8 + row))); /* col=4*/
+                src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1 - (8 + row))); /* col=5*/
+                src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1 - (8 + row))); /* col=6*/
+                src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1 - (8 + row))); /* col=7*/
+
+                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=4*/
+                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=5*/
+                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=6*/
+                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=7*/
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp1_8x16b);       /* row=7*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp5_8x16b);       /* row=6*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp2_8x16b);       /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp6_8x16b);       /* row=4*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp3_8x16b);       /* row=3*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp7_8x16b);       /* row=2*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp4_8x16b);       /* row=1*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 0))), src_temp8_8x16b);       /* row=0*/
+
+            }
+        }
+    }
+    else
+    {
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        two_nt_4x32b = _mm_set1_epi16(two_nt - nt);
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* idx = pos >> 5; */
+            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /*(32 - fract) */
+            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+            fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
+            fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
+
+            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
+            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
+            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
+            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
+            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
+            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
+
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1)); /* col=0*/
+                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1)); /* col=1*/
+                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1)); /* col=2*/
+                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1)); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1)); /* col=4*/
+                src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1)); /* col=5*/
+                src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1)); /* col=6*/
+                src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1)); /* col=7*/
+
+                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=4*/
+                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=5*/
+                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=6*/
+                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=7*/
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst), src_temp8_8x16b);       /* row=0*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 1)), src_temp4_8x16b);       /* row=1*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 2)), src_temp7_8x16b);       /* row=2*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 3)), src_temp3_8x16b);       /* row=3*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 4)), src_temp6_8x16b);       /* row=4*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 5)), src_temp2_8x16b);       /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 6)), src_temp5_8x16b);       /* row=6*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 7)), src_temp1_8x16b);       /* row=7*/
+
+            }
+        }
+    }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Intra prediction interpolation filter for luma mode 11 to mode 17
+*
+* @par Description:
+*    Intra prediction for modes 11 to 17 (negative angle, horizontal modes)
+*    with reference neighboring samples located by 'pu1_ref' to the
+*    TU block located by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the source reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
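+
+/*
+* A rough scalar sketch of the per-sample arithmetic that the SSSE3 routine
+* below vectorizes (sketch only: the actual horizontal modes also swap the
+* roles of rows and columns, and ref_main must first be extended from the
+* side samples as per section 8.4.4.2.6):
+*
+*    pos   = (col + 1) * intra_pred_ang;
+*    idx   = pos >> 5;                      // integer sample offset
+*    fract = pos & 31;                      // 1/32-pel fractional part
+*    dst   = ((32 - fract) * ref_main[idx]
+*             + fract * ref_main[idx + 1] + 16) >> 5;
+*/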
+
+
+void ihevc_intra_pred_luma_mode_11_to_17_ssse3(UWORD8 *pu1_ref,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 dst_strd,
+                                               WORD32 nt,
+                                               WORD32 mode)
+{
+
+    /* This function and ihevc_intra_pred_luma_mode_19_to_25 are the same */
+    /* except for the ref main & side sample assignment; the two can be   */
+    /* combined for optimization                                          */
+
+    WORD32 row, col, k;
+    WORD32 two_nt;
+    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+    WORD32 ref_idx;
+
+    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+    __m128i fract_4x32b,  intra_pred_ang_4x32b;
+    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
+
+
+    UWORD8 ref_tmp[2 * MAX_CU_SIZE + 2];
+    UWORD8 *ref_main;
+    UWORD8 *ref_temp;
+    UNUSED(src_strd);
+    inv_ang_sum = 128;
+    two_nt    = 2 * nt;
+    ref_temp = ref_tmp + 1;
+    ref_main = ref_temp + nt - 1;
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, each destination sample is         */
+    /* interpolated between 2 neighboring samples, weighted by the distance */
+    const_temp_4x32b  = _mm_set1_epi16(16);
+    const_temp2_4x32b = _mm_set1_epi32(31);
+    const_temp3_4x32b = _mm_set1_epi32(32);
+    const_temp4_4x32b = _mm_set1_epi32(4);
+
+    two_nt_4x32b = _mm_set1_epi32(1);
+
+
+    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+    row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+    if(nt == 4)
+    {
+
+        WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+        int temp11, temp21, temp31, temp41;
+//        WORD8  ai1_fract_temp_val[16], ai1_row_temp_val[16];
+
+        __m128i fract1_8x16b, fract2_8x16b;
+        __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+        __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+        __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+        __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, zero_8x16b, sign_8x16b;
+
+        /* Intermediate reference samples for negative angle modes */
+        /* This has to be removed during optimization */
+        /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+        inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+
+        ref_main = ref_temp + nt - 1;
+        for(k = 0; k < nt + 1; k++)
+            ref_temp[k + nt - 1] = pu1_ref[two_nt - k];
+
+        ref_main = ref_temp + nt - 1;
+        ref_idx = (nt * intra_pred_ang) >> 5;
+        zero_8x16b = _mm_setzero_si128();
+
+        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        /* SIMD optimization of this loop can be done using a look-up table */
+        /* For negative angles, derive the main reference samples from the  */
+        /* side reference samples; refer to section 8.4.4.2.6               */
+        for(k = -1; k > ref_idx; k--)
+        {
+            inv_ang_sum += inv_ang;
+            ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
+        }
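+        /* inv_ang is a fixed-point reciprocal of the angle (roughly        */
+        /* 8192 / |intra_pred_ang|), so each inv_ang_sum >> 8 step projects */
+        /* one extended main-reference position back onto the side samples. */
+        /* E.g. mode 15 (angle -17) gives inv_ang = 482, so the first       */
+        /* projected sample uses side offset (128 + 482) >> 8 = 2.          */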
+
+
+        /* pos = ((row + 1) * intra_pred_ang); */
+        res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+        sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+        res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
+
+        /* fract = pos & (31); */
+        fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+        /* idx = pos >> 5; */
+        ref_main_idx_4x32b = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
+
+        /*(32 - fract) */
+        row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
+
+        fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+        fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
+
+        fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+        row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
+
+        fract2_8x16b = _mm_unpackhi_epi8(fract_4x32b, row_4x32b);
+        fract1_8x16b = _mm_unpacklo_epi8(fract_4x32b, row_4x32b);
+
+        temp1_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+        temp2_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+        temp3_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+        temp4_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+        ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4);  /* next 32 bit values */
+        ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8);  /* next 32 bit values */
+        ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
+        ref_main_idx1  = _mm_cvtsi128_si32(ref_main_idx_4x32b);    /* col=0*/
+        ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* col=1*/
+        ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* col=2*/
+        ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* col=3*/
+
+        /* loading 16 8-bit pixels */
+        src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col=0*/
+        src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col=1*/
+        src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col=2*/
+        src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col=3*/
+
+        src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+        src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+        src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+        src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+        /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+        src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+        src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+        src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+        src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
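+        /* Note: _mm_maddubs_epi16 multiplies unsigned pixel bytes by       */
+        /* signed weight bytes and sums adjacent pairs, so with the pixels  */
+        /* and the interleaved fract / (32 - fract) bytes arranged in       */
+        /* matching pairs, each 16-bit lane holds the complete weighted     */
+        /* sum in a single instruction.                                     */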
+
+        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+        src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+        src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+        src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+        src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+        src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+        src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+        /* converting 16 bit to 8 bit */
+        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+        src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+
+        src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+        src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+        src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+        src_temp1_8x16b = _mm_srli_si128(src_temp7_8x16b, 4);
+        src_temp2_8x16b = _mm_srli_si128(src_temp7_8x16b, 8);
+        src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 12);
+
+        temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
+        temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
+        temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
+        temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
+
+        /* storing 4 8-bit pixel values per row */
+        *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+        *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+        *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+        *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+    }
+
+    else if(nt == 32)
+    {
+
+
+        __m128i temp1, temp2, temp3, temp11, temp12;
+        __m128i src_values0, src_values1;
+        /* Intermediate reference samples for negative angle modes */
+
+        ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
+        temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 17));
+        temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+
+        /* For negative angles, derive the main reference samples from the side */
+
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+        src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); /*(nt+16)-(two_nt-1)*/
+
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode]));
+        temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+        src_values1 = _mm_shuffle_epi8(src_values1, temp2);
+        src_values0 = _mm_shuffle_epi8(src_values0, temp12);
+        src_values1 = _mm_shuffle_epi8(src_values1, temp11);
+
+        temp1 = _mm_shuffle_epi8(temp1, temp2);
+        temp3 = _mm_shuffle_epi8(temp3, temp2);
+
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp3);
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp1);
+        _mm_storeu_si128((__m128i *)(ref_main - 16), src_values0);
+        _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[17 - mode][0]), src_values1);
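+
+        /* The inv_angle_shuffle[] row acts as a per-mode look-up table:    */
+        /* after IHEVCE_SHUFFLEMASKY3 byte-reverses the loaded samples, one */
+        /* pshufb selects the inverse-angle-projected positions, replacing  */
+        /* the scalar inv_ang_sum >> 8 loop used in the nt == 4 path above. */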
+
+
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+        for(col = 0; col < nt; col += 8)
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+            // WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* idx = pos >> 5; */
+            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+            /*(32 - fract) */
+            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+            fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+            fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
+
+            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
+            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
+            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
+            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
+            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
+            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
+
+            for(row = 0; row < nt; row += 8)
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
+                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
+                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
+                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
+
+                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
+                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
+                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
+                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/
+
+                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=5*/
+                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=6*/
+                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=7*/
+                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=8*/
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b);          /* row=0*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b);       /* row=1*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b);       /* row=2*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b);       /* row=3*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b);       /* row=4*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b);       /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b);       /* row=6*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b);       /* row=7*/
+
+            }
+        }
+    }
+    else if(nt == 16)
+    {
+
+        __m128i temp1, temp2, temp11, src_values0;
+        /* Intermediate reference samples for negative angle modes */
+        /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+        ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
+        temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+        temp1 = _mm_shuffle_epi8(temp1, temp2);
+        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+
+        _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+        for(col = 0; col < nt; col += 8)
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+            // WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* idx = pos >> 5; */
+            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+            /*(32 - fract) */
+            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+            fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+            fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
+
+            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
+            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
+            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
+            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
+            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
+            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
+
+            for(row = 0; row < nt; row += 8)
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
+                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
+                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
+                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
+
+                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
+                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
+                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
+                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/
+
+                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=5*/
+                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=6*/
+                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=7*/
+                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=8*/
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b);          /* row=0*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b);       /* row=1*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b);       /* row=2*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b);       /* row=3*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b);       /* row=4*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b);       /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b);       /* row=6*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b);       /* row=7*/
+
+            }
+        }
+    }
+    else
+    {
+
+
+        __m128i temp1, temp2, temp11, src_values0;
+        /* Intermediate reference samples for negative angle modes */
+        /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+        ref_temp[two_nt - 1] = pu1_ref[nt];
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 1));
+
+        /* For negative angles, derive the main reference samples from the side */
+
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+        temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+        temp1 = _mm_shuffle_epi8(temp1, temp2);
+        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+        src_values0 = _mm_srli_si128(src_values0, 8);
+
+        _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
+        _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
+
+
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp4_4x32b = _mm_set1_epi16(8);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+        {
+            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+            //WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* idx = pos >> 5; */
+            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /*(32 - fract) */
+            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+            fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+            fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
+
+            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
+            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
+            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
+            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
+            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
+            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
+            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
+            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
+            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
+
+            {
+                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
+                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
+                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
+                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5)); /* col=5*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6)); /* col=6*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7)); /* col=7*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8)); /* col=8*/
+
+                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=5*/
+                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=6*/
+                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=7*/
+                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=8*/
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+                /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+                /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
+
+                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp1_8x16b);       /* row=0*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp5_8x16b);       /* row=1*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp2_8x16b);       /* row=2*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp6_8x16b);       /* row=3*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (4))), src_temp3_8x16b);       /* row=4*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (5))), src_temp7_8x16b);       /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (6))), src_temp4_8x16b);       /* row=6*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (7))), src_temp8_8x16b);       /* row=7*/
+
+            }
+        }
+    }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Intra prediction interpolation filter for luma mode 19 to mode 25
+*
+* @par Description:
+*    Intra prediction for modes 19 to 25 (negative angle, vertical modes)
+*    with reference neighboring samples located by 'pu1_ref' to the TU
+*    block located by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the source reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
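+
+/*
+* The per-sample arithmetic is the same rough scalar sketch as for modes 11
+* to 17, but vertical modes write whole rows directly (no row/column swap),
+* with ref_main built from the above neighbours (sketch only):
+*
+*    pos   = (row + 1) * intra_pred_ang;
+*    idx   = pos >> 5;
+*    fract = pos & 31;
+*    dst[row][col] = ((32 - fract) * ref_main[col + idx + 1]
+*                     + fract * ref_main[col + idx + 2] + 16) >> 5;
+*/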
+
+void ihevc_intra_pred_luma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 dst_strd,
+                                               WORD32 nt,
+                                               WORD32 mode)
+{
+
+    WORD32 row, k;
+    WORD32 two_nt, intra_pred_ang;
+    WORD32 inv_ang, inv_ang_sum;
+    //WORD32 ref_main_idx, pos, fract, idx;
+    WORD32 ref_idx;
+    UWORD8 ref_tmp[(2 * MAX_CU_SIZE) + 2];
+    UWORD8 *ref_main, *ref_temp;
+
+    __m128i  /*fract_8x16b,*/ const_temp_8x16b, sm3;
+    __m128i temp1, temp2, temp3, temp4;
+    __m128i temp11, temp12, temp13, temp14;
+    UNUSED(src_strd);
+    two_nt = 2 * nt;
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+    inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
+
+    /* Intermediate reference samples for negative angle modes */
+    /* This has to be removed during optimization */
+    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+    ref_temp = ref_tmp + 1;
+    ref_main = ref_temp + nt - 1;
+
+
+    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+
+
+    const_temp_8x16b = _mm_set1_epi16(16);
+
+    if(nt == 32)
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+        __m128i src_values0, src_values1, src_values2, src_values3;
+        __m128i  src_values4, src_values5, src_values6, src_values7;
+        WORD32 col = 0;
+
+        /* Intermediate reference samples for negative angle modes */
+        /* This has to be removed during optimization */
+        /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+        ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
+        temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 16));
+
+        /* SIMD optimization of this loop can be done using a look-up table */
+        /* For negative angles, derive the main reference samples from the  */
+        /* side reference samples; refer to section 8.4.4.2.6               */
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
+        src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); /*(nt+16)-(two_nt-1)*/
+
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19]));
+        temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+        src_values1 = _mm_shuffle_epi8(src_values1, temp12);
+
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp3);
+        _mm_storeu_si128((__m128i *)(ref_main - 16), src_values1);
+        _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[mode - 19][0]), src_values0);
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        const_temp8_4x32b = _mm_set1_epi16(8);
+
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        for(row = 0; row < nt; row += 8)
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* idx = pos >> 5; */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
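+            /* Spilling the eight 16-bit indices into ref_main_idx[] allows */
+            /* plain scalar addressing in the loads below; SSE offers no    */
+            /* gather, so per-lane unaligned loads are the practical choice.*/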
+            for(col = 0; col < nt; col += 16)
+            {
+                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + col));
+                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + col));
+                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + col));
+                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + col));
+                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8 + col));
+                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8 + col));
+                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8 + col));
+                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8 + col));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* storing 16 8-bit pixel values per row */
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0);       /* row=0*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1);   /* row=1*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2);   /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3);   /* row=3*/
+
+
+                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + col));
+                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + col));
+                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + col));
+                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + col));
+                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8 + col));
+                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8 + col));
+                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8 + col));
+                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8 + col));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* store 16 8-bit pixels each for rows 4-7 */
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0);   /* row=4*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1);   /* row=5*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2);   /* row=6*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3);   /* row=7*/
+
+            }
+            pu1_dst += 8 * dst_strd;
+        }
+
+    }
+    else if(nt == 16) /* for nt = 16 case */
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+        __m128i src_values0, src_values1, src_values2, src_values3;
+        __m128i  src_values4, src_values5, src_values6, src_values7;
+
+
+        /* Intermediate reference samples for negative angle modes */
+        /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+        ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
+
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /* side reference samples pu1_ref[nt] .. pu1_ref[nt + 15] */
+
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+
+        _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
+        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        const_temp8_4x32b = _mm_set1_epi16(8);
+
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        for(row = 0; row < nt; row += 8)
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* idx = pos >> 5;  ref_main_idx = idx + 1 */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
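+            /* Each tempN register now holds one row's byte pair (fract, 32 - fract)
+             * replicated across all lanes, so a single _mm_maddubs_epi16 per 16
+             * reference bytes evaluates the two-tap sum
+             * (32 - fract) * ref[idx] + fract * ref[idx + 1] in 16-bit lanes. */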
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
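+            /* Spill the eight per-row indices to a small array so the loads
+             * below can address ref_main row by row. */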
+
+            {
+                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));
+                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));
+                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));
+                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));
+                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8));
+                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8));
+                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8));
+                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* store 16 8-bit pixels each for rows 0-3 */
+                _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0);       /* row=0*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1);   /* row=1*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2);   /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3);   /* row=3*/
+
+
+                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));
+                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));
+                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));
+                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));
+                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8));
+                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8));
+                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8));
+                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* store 16 8-bit pixels each for rows 4-7 */
+                _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0);   /* row=4*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1);   /* row=5*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2);   /* row=6*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3);   /* row=7*/
+
+            }
+            pu1_dst += 8 * dst_strd;
+        }
+    }
+    else if(nt == 8)
+    {
+
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+        __m128i src_values0, src_values1, src_values2, src_values3;
+        __m128i  src_values4, src_values5, src_values6, src_values7;
+
+
+        /* Intermediate reference samples for negative angle modes */
+        /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+        ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+        temp1 = _mm_loadl_epi64((__m128i *)(pu1_ref + two_nt));
+
+        /* For negative angles, derive the main reference samples from the side reference */
+
+        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref)); /* side reference samples, reordered below via the inverse-angle shuffle */
+
+        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+        src_values0 = _mm_srli_si128(src_values0, 8);
+        _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
+        _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
+
+
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+
+
+        two_nt_4x32b = _mm_set1_epi16(1);
+
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* idx = pos >> 5;  ref_main_idx = idx + 1 */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));  /* row 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));  /* row 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));  /* row 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));  /* row 3 */
+            src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));  /* row 4 */
+            src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));  /* row 5 */
+            src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));  /* row 6 */
+            src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));  /* row 7 */
+
+            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+            src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+            src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+            src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+            src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+            src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+            src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+            src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+            src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+            src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+            src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+            src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+            src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+            src_values4 = _mm_srai_epi16(src_values4,  5);
+            src_values5 = _mm_srai_epi16(src_values5,  5);
+            src_values6 = _mm_srai_epi16(src_values6,  5);
+            src_values7 = _mm_srai_epi16(src_values7,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, src_values1);
+            src_values2 = _mm_packus_epi16(src_values2, src_values3);
+            src_values1 = _mm_srli_si128(src_values0, 8);
+            src_values3 = _mm_srli_si128(src_values2, 8);
+            src_values4 = _mm_packus_epi16(src_values4, src_values5);
+            src_values6 = _mm_packus_epi16(src_values6, src_values7);
+            src_values5 = _mm_srli_si128(src_values4, 8);
+            src_values7 = _mm_srli_si128(src_values6, 8);
+
+            /* store 8 8-bit pixels per row (rows 0-7) */
+            _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0);       /* row=0*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1);   /* row=1*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2);   /* row=2*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3);   /* row=3*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4);   /* row=4*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5);   /* row=5*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6);   /* row=6*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7);   /* row=7*/
+        }
+    }
+    else /* nt == 4 */
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b, zero_8x16b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b, sign_8x16b;
+
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+        for(k = 0; k < (nt + 1); k++)
+            ref_temp[k + nt - 1] = pu1_ref[two_nt + k];
+        ref_idx = (nt * intra_pred_ang) >> 5;
+        inv_ang_sum = 128;
+
+        for(k = -1; k > ref_idx; k--)
+        {
+            inv_ang_sum += inv_ang;
+            ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
+        }
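+        /* The loop above projects side reference samples onto the main reference
+         * for negative indices: inv_ang is the inverse angle in 8.8 fixed point,
+         * so (inv_ang_sum >> 8) steps through the side reference just as the
+         * scalar HEVC reference code does. */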
+
+
+        const_temp2_4x32b = _mm_set1_epi32(31);
+        const_temp3_4x32b = _mm_set1_epi32(32);
+        zero_8x16b = _mm_setzero_si128();
+        two_nt_4x32b = _mm_set1_epi32(1);
+
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        {
+            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+            int temp11, temp21, temp31, temp41;
+
+
+            __m128i fract1_8x16b, fract2_8x16b,  res_temp5_4x32b;
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+            sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+            res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
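+            /* _mm_cmpgt_epi16 against zero yields each lane's sign mask, so the
+             * unpack above is a manual 16->32 bit sign extension (an SSSE3-level
+             * stand-in for the SSE4.1 _mm_cvtepi16_epi32). */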
+
+            /* idx = pos >> 5;  ref_main_idx = idx + 1 */
+            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
+
+            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
+            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
+            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
+            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
+            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
+            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp3 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp4 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));  /* row 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2));  /* row 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3));  /* row 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4));  /* row 3 */
+
+            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, src_values1);
+            src_values2 = _mm_packus_epi16(src_values2, src_values3);
+            src_values1 = _mm_srli_si128(src_values0, 8);
+            src_values3 = _mm_srli_si128(src_values2, 8);
+
+            temp11 = _mm_cvtsi128_si32(src_values0);
+            temp21 = _mm_cvtsi128_si32(src_values1);
+            temp31 = _mm_cvtsi128_si32(src_values2);
+            temp41 = _mm_cvtsi128_si32(src_values3);
+
+            /* store 4 8-bit pixels per row (rows 0-3) */
+            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+        }
+    }
+}
+
+
+
+
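+/* Illustrative scalar model of the two-tap angular interpolation that the SIMD
+ * paths in this file vectorize.  A sketch only, kept out of the build with
+ * #if 0: the helper name is hypothetical and 'ref' is assumed to point at the
+ * first main reference sample (pu1_ref + two_nt + 1 for the positive vertical
+ * modes below). */
+#if 0
+static void scalar_angular_pred_sketch(UWORD8 *ref, UWORD8 *dst,
+                                       WORD32 dst_strd, WORD32 nt,
+                                       WORD32 intra_pred_ang)
+{
+    WORD32 row, col;
+    for(row = 0; row < nt; row++)
+    {
+        WORD32 pos   = (row + 1) * intra_pred_ang;
+        WORD32 idx   = pos >> 5;   /* integer part of the displacement */
+        WORD32 fract = pos & 31;   /* 1/32-pel fractional part         */
+
+        for(col = 0; col < nt; col++)
+        {
+            dst[row * dst_strd + col] =
+                (UWORD8)(((32 - fract) * ref[idx + col] +
+                          fract * ref[idx + col + 1] + 16) >> 5);
+        }
+    }
+}
+#endif
+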
+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction interpolation filter for luma mode 27 to mode 33
+*
+* @par Description:
+*    Intra prediction for modes 27 to 33 (positive angle, vertical modes), with
+*    the reference neighboring samples pointed to by 'pu1_ref' predicting the
+*    TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] dst_strd
+*  integer destination stride
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_27_to_33_ssse3(UWORD8 *pu1_ref,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 dst_strd,
+                                               WORD32 nt,
+                                               WORD32 mode)
+{
+    WORD32 row;
+    WORD32 two_nt;
+    WORD32 intra_pred_ang;
+
+    __m128i temp11, temp12, temp13, temp14;
+
+    __m128i     const_temp_8x16b;
+    __m128i temp1, temp2, temp3, temp4, sm3;
+    UNUSED(src_strd);
+    two_nt = 2 * nt;
+    intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    const_temp_8x16b = _mm_set1_epi16(16);
+    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
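+    /* sm3 (IHEVCE_SHUFFLEMASKY11) is assumed to pair each loaded reference byte
+     * with its neighbour so that _mm_maddubs_epi16 can apply the interleaved
+     * (fract, 32 - fract) weights in one step. */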
+    if(nt == 32)
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+        int col = 0;
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        const_temp8_4x32b = _mm_set1_epi16(8);
+
+        two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        for(row = 0; row < nt; row += 8)
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i  src_values4, src_values5, src_values6, src_values7;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* idx = pos >> 5;  ref_main_idx = idx + two_nt + 1 */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+            for(col = 0; col < nt; col += 16)
+            {
+                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + col));
+                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + col));
+                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + col));
+                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + col));
+                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8 + col));
+                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8 + col));
+                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8 + col));
+                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8 + col));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* store 16 8-bit pixels each for rows 0-3 */
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0);       /* row=0*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1);   /* row=1*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2);   /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3);   /* row=3*/
+
+
+                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + col));
+                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + col));
+                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + col));
+                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + col));
+                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8 + col));
+                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8 + col));
+                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8 + col));
+                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8 + col));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* store 16 8-bit pixels each for rows 4-7 */
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0);   /* row=4*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1);   /* row=5*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2);   /* row=6*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3);   /* row=7*/
+
+            }
+            pu1_dst += 8 * dst_strd;
+        }
+
+    }
+    else if(nt == 16) /* for nt = 16 case */
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+        const_temp8_4x32b = _mm_set1_epi16(8);
+
+        two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        for(row = 0; row < nt; row += 8)
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i  src_values4, src_values5, src_values6, src_values7;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* idx = pos >> 5;  ref_main_idx = idx + two_nt + 1 */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+            {
+                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));
+                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));
+                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));
+                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));
+                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8));
+                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8));
+                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8));
+                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* store 16 8-bit pixels each for rows 0-3 */
+                _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0);       /* row=0*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1);   /* row=1*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2);   /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3);   /* row=3*/
+
+
+                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));
+                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));
+                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));
+                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));
+                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8));
+                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8));
+                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8));
+                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8));
+
+                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_values0 = _mm_srai_epi16(src_values0,  5);
+                src_values1 = _mm_srai_epi16(src_values1,  5);
+                src_values2 = _mm_srai_epi16(src_values2,  5);
+                src_values3 = _mm_srai_epi16(src_values3,  5);
+                src_values4 = _mm_srai_epi16(src_values4,  5);
+                src_values5 = _mm_srai_epi16(src_values5,  5);
+                src_values6 = _mm_srai_epi16(src_values6,  5);
+                src_values7 = _mm_srai_epi16(src_values7,  5);
+
+                /* converting 16 bit to 8 bit */
+                src_values0 = _mm_packus_epi16(src_values0, src_values4);
+                src_values1 = _mm_packus_epi16(src_values1, src_values5);
+                src_values2 = _mm_packus_epi16(src_values2, src_values6);
+                src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* store 16 8-bit pixels each for rows 4-7 */
+                _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0);   /* row=4*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1);   /* row=5*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2);   /* row=6*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3);   /* row=7*/
+
+            }
+            pu1_dst += 8 * dst_strd;
+        }
+
+    }
+    else if(nt == 8)
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+        __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+        const_temp2_4x32b = _mm_set1_epi16(31);
+        const_temp3_4x32b = _mm_set1_epi16(32);
+
+        two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        /* nt == 8: all eight rows are produced in one pass below */
+        {
+
+            WORD16 ref_main_idx[9];
+
+            __m128i res_temp5_4x32b;
+            __m128i fract1_8x16b, fract2_8x16b;
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i  src_values4, src_values5, src_values6, src_values7;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* idx = pos >> 5;  ref_main_idx = idx + two_nt + 1 */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
+            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
+            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));  /* row 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));  /* row 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));  /* row 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));  /* row 3 */
+            src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));  /* row 4 */
+            src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));  /* row 5 */
+            src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));  /* row 6 */
+            src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));  /* row 7 */
+
+            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+            src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
+            src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
+            src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
+            src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
+
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+            src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+            src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+            src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+            src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+            src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+            src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+            src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+            src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+            /* ((32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] + 16) >> 5 */
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+            src_values4 = _mm_srai_epi16(src_values4,  5);
+            src_values5 = _mm_srai_epi16(src_values5,  5);
+            src_values6 = _mm_srai_epi16(src_values6,  5);
+            src_values7 = _mm_srai_epi16(src_values7,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, src_values1);
+            src_values2 = _mm_packus_epi16(src_values2, src_values3);
+            src_values1 = _mm_srli_si128(src_values0, 8);
+            src_values3 = _mm_srli_si128(src_values2, 8);
+            src_values4 = _mm_packus_epi16(src_values4, src_values5);
+            src_values6 = _mm_packus_epi16(src_values6, src_values7);
+            src_values5 = _mm_srli_si128(src_values4, 8);
+            src_values7 = _mm_srli_si128(src_values6, 8);
+
+            /* storing 8 rows of 8 8-bit pixel values */
+            _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0);       /* row=0*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1);   /* row=1*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2);   /* row=2*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3);   /* row=3*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4);   /* row=4*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5);   /* row=5*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6);   /* row=6*/
+            _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7);   /* row=7*/
+        }
+
+    }
+    else /* nt == 4 */
+    {
+
+        __m128i const_temp2_4x32b, const_temp3_4x32b, zero_8x16b;
+        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+
+        __m128i row_4x32b, two_nt_4x32b, src_values12, sign_8x16b;
+
+
+        const_temp2_4x32b = _mm_set1_epi32(31);
+        const_temp3_4x32b = _mm_set1_epi32(32);
+        zero_8x16b = _mm_setzero_si128();
+        two_nt_4x32b = _mm_set1_epi32(two_nt + 1);
+
+
+        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
+        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        {
+            int temp11, temp21, temp31, temp41;
+
+            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+
+            __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b;
+            __m128i src_values0, src_values1, src_values2, src_values3;
+            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+            /* pos = ((row + 1) * intra_pred_ang); */
+            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+            sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+            res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
+
+            /* ref_main_idx = (two_nt + 1) + (pos >> 5); */
+            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
+
+            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
+            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
+            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
+            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
+            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
+            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /*(32 - fract) */
+            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
+            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+            temp3 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
+            temp4 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));  /* row = 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2));  /* row = 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3));  /* row = 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4));  /* row = 3 */
+
+            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
+            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
+            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
+            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
+
+            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+
+            /* ((32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] + 16) */
+            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+            /* ((32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] + 16) >> 5 */
+            src_values0 = _mm_srai_epi16(src_values0,  5);
+            src_values1 = _mm_srai_epi16(src_values1,  5);
+            src_values2 = _mm_srai_epi16(src_values2,  5);
+            src_values3 = _mm_srai_epi16(src_values3,  5);
+
+            /* converting 16 bit to 8 bit */
+            src_values0 = _mm_packus_epi16(src_values0, src_values1);
+            src_values2 = _mm_packus_epi16(src_values2, src_values3);
+            src_values1 = _mm_srli_si128(src_values0, 8);
+            src_values3 = _mm_srli_si128(src_values2, 8);
+
+            temp11 = _mm_cvtsi128_si32(src_values0);
+            temp21 = _mm_cvtsi128_si32(src_values1);
+            temp31 = _mm_cvtsi128_si32(src_values2);
+            temp41 = _mm_cvtsi128_si32(src_values3);
+
+            /* storing 4 pixels (8-bit) per row as 32-bit words */
+            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+        }
+    }
+}
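+
+/* Reference sketch, added for clarity: a scalar form of the angular
+ * interpolation that the SSSE3 paths above vectorize. The helper name is
+ * hypothetical and the function is illustrative only (it is not called);
+ * it implements the commented formula
+ * ((32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] + 16) >> 5
+ */
+static void ihevc_intra_pred_ang_scalar_sketch(UWORD8 *pu1_ref,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 dst_strd,
+                                               WORD32 nt,
+                                               WORD32 two_nt,
+                                               WORD32 intra_pred_ang)
+{
+    WORD32 row, col;
+    for(row = 0; row < nt; row++)
+    {
+        WORD32 pos   = (row + 1) * intra_pred_ang; /* position in 1/32 pel units */
+        WORD32 idx   = pos >> 5;                   /* integer sample offset */
+        WORD32 fract = pos & 31;                   /* fractional weight */
+
+        for(col = 0; col < nt; col++)
+        {
+            WORD32 ref_main_idx = two_nt + 1 + idx + col;
+            pu1_dst[row * dst_strd + col] =
+                (UWORD8)(((32 - fract) * pu1_ref[ref_main_idx]
+                          + fract * pu1_ref[ref_main_idx + 1] + 16) >> 5);
+        }
+    }
+}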
diff --git a/common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c b/common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c
new file mode 100644
index 0000000..63cc1ef
--- /dev/null
+++ b/common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c
@@ -0,0 +1,3340 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_itrans_recon_16x16_ssse3_intr.c
+ *
+ * @brief
+ *  Contains function definitions for the inverse transform and
+ * reconstruction
+ *
+ * @author
+ *  100470
+ *  100592 (edits)
+ *
+ * @par List of Functions:
+ *  - ihevc_itrans_recon_16x16_ssse3()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs the inverse transform and reconstruction
+ * for a 16x16 input block
+ *
+ * @par Description:
+ *  Performs the inverse transform and adds the prediction data,
+ * clipping the output to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 16x16 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 16x16 buffer for storing the inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 16x16 block
+ *
+ * @param[out] pu1_dst
+ *  Output 16x16 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
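+/* A minimal call sketch (hypothetical buffers; pi2_src and pi2_tmp are
+ * assumed to be 16-byte aligned, since the routine uses the aligned
+ * _mm_load_si128/_mm_store_si128 on them; zero_cols = zero_rows = 0 means
+ * no rows or columns are known to be all zero):
+ *
+ *     WORD16 coeffs[16 * 16], tmp[16 * 16];
+ *     UWORD8 pred[16 * 16], dst[16 * 16];
+ *     ihevc_itrans_recon_16x16_ssse3(coeffs, tmp, pred, dst,
+ *                                    16, 16, 16, 0, 0);
+ */
+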
+void ihevc_itrans_recon_16x16_ssse3(WORD16 *pi2_src,
+                                    WORD16 *pi2_tmp,
+                                    UWORD8 *pu1_pred,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 src_strd,
+                                    WORD32 pred_strd,
+                                    WORD32 dst_strd,
+                                    WORD32 zero_cols,
+                                    WORD32 zero_rows)
+{
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+
+    __m128i m_temp_reg_20;
+    __m128i m_temp_reg_21;
+    __m128i m_temp_reg_22;
+    __m128i m_temp_reg_23;
+    __m128i m_temp_reg_24;
+    __m128i m_temp_reg_25;
+    __m128i m_temp_reg_26;
+    __m128i m_temp_reg_27;
+    __m128i m_temp_reg_30;
+    __m128i m_temp_reg_31;
+    __m128i m_temp_reg_32;
+    __m128i m_temp_reg_33;
+    __m128i m_temp_reg_34;
+    __m128i m_temp_reg_35;
+    __m128i m_temp_reg_36;
+    __m128i m_temp_reg_37;
+    __m128i m_temp_reg_40;
+    __m128i m_temp_reg_41;
+    __m128i m_temp_reg_42;
+    __m128i m_temp_reg_43;
+    __m128i m_temp_reg_44;
+    __m128i m_temp_reg_45;
+    __m128i m_temp_reg_46;
+    __m128i m_temp_reg_47;
+
+    __m128i m_temp_reg_70;
+    __m128i m_temp_reg_71;
+    __m128i m_temp_reg_72;
+    __m128i m_temp_reg_73;
+    __m128i m_temp_reg_74;
+    __m128i m_temp_reg_75;
+    __m128i m_temp_reg_76;
+    __m128i m_temp_reg_77;
+    __m128i m_rdng_factor;
+    __m128i m_count;
+    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+    __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
+
+    WORD32 i;
+    WORD32  zero_last8_cols_stg1;
+    WORD32  zero_last8_rows_stg1;
+    WORD32  zero_last12_rows_stg1;
+    WORD32  zero_last12_rows_stg2;
+    WORD32  zero_last8_rows_stg2;
+
+    WORD32  loop = 0;
+
+    WORD32 i4_shift = IT_SHIFT_STAGE_1;
+    WORD32 trans_size = TRANS_SIZE_16;
+
+
+
+
+    /* The last 8 cols of the 16x16 block are skipped based on the flags below */
+
+    zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0;
+    zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0;
+    zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0;
+
+    zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0;
+    zero_last8_rows_stg2 = zero_last8_cols_stg1;
+    if(zero_last8_cols_stg1)
+    {
+        loop = 1;
+    }
+    else
+    {
+        loop = 2;
+    }
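+
+    /* Worked example (assumed convention: bit n of zero_cols set means
+     * column n of pi2_src is entirely zero): zero_cols = 0xFFF0 gives
+     * (0xFFF0 & 0xFF00) == 0xFF00 and (0xFFF0 & 0xFFF0) == 0xFFF0, i.e.
+     * only columns 0-3 carry coefficients, so loop == 1 and the second
+     * half of the stage-1 pass is skipped. */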
+
+    /* i = 0 => lower 8 samples */
+    /* i = 1 => higher 8 samples */
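+    /* Stage-1 reference (a scalar sketch of the standard HEVC 16-point
+     * partial butterfly that the blocks below vectorize; eee/eeo/ee/eo/o
+     * follow the names used in the comments):
+     *
+     *     ee[k]       = eee[k] + eeo[k];   ee[3 - k] = eee[k] - eeo[k];  k = 0,1
+     *     e[k]        = ee[k] + eo[k];     e[7 - k]  = ee[k] - eo[k];    k = 0..3
+     *     out[k]      = (e[k] + o[k] + rnd) >> shift;                    k = 0..7
+     *     out[15 - k] = (e[k] - o[k] + rnd) >> shift;
+     */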
+    for(i = 0; i < loop; i++)
+    {
+        {
+            WORD32 sample_half_index = i << 3;
+            WORD16 *pi2_tmp_src = pi2_src + sample_half_index;
+            WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+
+            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+
+
+
+
+            /* If the last 12 rows are zero */
+            if(zero_last12_rows_stg1)
+            {
+
+                /* eee */
+                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+                {
+                    /* Loading coeff and src for use in next block */
+
+                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign
+                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+                    m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+
+                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+
+                    m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+
+                    m_temp_reg_26 = m_temp_reg_24;
+                    m_temp_reg_27 = m_temp_reg_25;
+                }
+
+                /* eo */
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+
+                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+
+                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+
+                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+
+                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+            }
+            /* If the last 8 rows are zero */
+            else if(zero_last8_rows_stg1)
+            {
+                /* eeo */
+                /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+                /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+                {
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+                }
+
+                /* eee */
+                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+                {
+                    /* Loading coeff and src for use in next block */
+                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get signs
+                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+                    m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+
+                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+
+                    m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+                    m_temp_reg_26 = m_temp_reg_24;
+                    m_temp_reg_27 = m_temp_reg_25;
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+                }
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+            } /* If all rows are non-zero */
+            else
+            {
+                /* eeo */
+                /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+                /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+
+                {
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+                }
+
+                /* eee */
+                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+                {
+                    /* Loading coeff and src for use in next block */
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
+
+                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
+                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
+
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
+
+                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+
+                }
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                    m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+
+                }
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+                }
+
+            }
+        }
+
+        {
+            WORD32 sample_half_index = i << 3;
+            WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd;
+
+            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+        }
+
+        /* o & stage 1 out */
+        {
+            WORD32 j;
+            WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+            WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+            WORD32 out_stride = (trans_size << 1);
+            WORD32 in_stride = (trans_size << 1);
+
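+            /* Each o{k} block below forms one odd-part term,
+             * o[k] = sum_j coeff[k][j] * src_row[2*j + 1], via pairwise
+             * _mm_madd_epi16 against g_ai2_ihevc_trans_16_odd, then folds
+             * it into the stored even part as e[k] +/- o[k] with rounding. */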
+            if(zero_last12_rows_stg1)
+            {
+                for(j = 0; j < 2; j++)
+                {
+                    if(j) //H8B = higher 8 bytes, L8B = lower 8 bytes
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                    }
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
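+
+                    /* The add/shift pair above implements the stage-1
+                     * rounding, out = (in + (1 << (i4_shift - 1))) >> i4_shift;
+                     * i4_shift == IT_SHIFT_STAGE_1 (7 for the first stage per
+                     * the HEVC spec; value assumed from ihevc_defs.h). */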
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+                }
+            }
+            else if(zero_last8_rows_stg1)
+            {
+                for(j = 0; j < 2; j++)
+                {
+                    if(j)
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+                    }
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+
+                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+                }
+
+            }
+            else
+            {
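+                /* General case: all 16 stage-1 input rows may be non-zero, so
+                   each odd output o[k] accumulates madd terms from rows
+                   1/3/5/7 (regs 70-73) and 9/11/13/15 (regs 74-77) */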
+
+                for(j = 0; j < 2; j++)
+                {
+                    if(j) //H8B = higher 8 bytes, L8B = lower 8 bytes
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+                        m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
+                        m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+                        m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
+                        m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
+                    }
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
+
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
+                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
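+                        /* The two unpacks broadcast the rounding constant to all
+                           four 32-bit lanes (same effect as the _mm_shuffle_epi32
+                           with 0x00 used in the earlier branches) */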
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
+                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
+                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
+                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
+
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
+                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
+                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
+                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
+
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+                }
+            }
+        }
+
+        /* Transpose */
+        {
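+            /* Reorder the 8 result vectors written above (in mirrored +/-stride
+               order) back into consecutive rows: a standard 8x8 16-bit transpose
+               built from epi16, epi32 and epi64 unpack stages */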
+            WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+            WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+            WORD32 out_stride = (trans_size << 1);
+            WORD32 in_stride = (trans_size << 1);
+            WORD32 j;
+
+            for(j = 0; j < 2; j++)
+            {
+                m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
+                pi2_src_scratch += in_stride;
+                m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
+                pi2_src_scratch += in_stride;
+                m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
+                pi2_src_scratch += in_stride;
+                m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
+                pi2_src_scratch += 8;
+                m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
+                pi2_src_scratch += 8;
+
+                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
+                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
+
+                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
+                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
+
+                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
+                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
+
+                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
+                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
+
+
+                m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
+                m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
+
+                m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
+                m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
+
+                m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
+                m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
+
+                m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
+                m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
+
+
+                m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
+                m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
+
+                m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
+                m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
+
+                m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
+                m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
+
+                m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
+                m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
+
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+                pi2_dst_scratch += out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44);
+                pi2_dst_scratch += out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41);
+                pi2_dst_scratch += out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45);
+                pi2_dst_scratch += 8;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42);
+                pi2_dst_scratch -= out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46);
+                pi2_dst_scratch -= out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43);
+                pi2_dst_scratch -= out_stride;
+                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47);
+                pi2_dst_scratch += 8;
+            }
+        }
+    }
+
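+    /* Columns 8-15 of the input block were entirely zero, so stage 1 never
+       wrote the second half of the scratch buffer; zero it here so stage 2
+       below can read both halves unconditionally */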
+    if(zero_last8_cols_stg1)
+    {
+        WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size);
+        WORD32 out_stride = (trans_size << 1);
+        WORD32 j;
+
+        m_temp_reg_40 = _mm_setzero_si128();
+        for(j = 0; j < 2; j++)
+        {
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch += out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch += out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch += out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch += 8;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch -= out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch -= out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch -= out_stride;
+            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+            pi2_dst_scratch += 8;
+        }
+    }
+
+    /* Stage 2 */
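+    /* Column transform on the transposed stage-1 output: the 16-point
+       butterfly is decomposed into e/o, then ee/eo, then eee/eeo terms, with
+       IT_SHIFT_STAGE_2 rounding before the final pack */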
+    for(i = 0; i < 2; i++)
+    {
+        WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp);
+        WORD32 stride = (trans_size);
+        MEM_ALIGN16 WORD16 temp_array[256];
+
+        i4_shift = IT_SHIFT_STAGE_2;
+
+        if(zero_last12_rows_stg2)
+        {
+            /* eeo */
+            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+            {
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+
+                if(!i)
+                {
+                    pi2_src_temp += (stride * 6 + 8);
+                }
+                else
+                {
+                    pi2_src_temp += (stride * 2 + 8);
+                }
+
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+                m_temp_reg_20 = _mm_setzero_si128();
+                m_temp_reg_22 = _mm_setzero_si128();
+
+                m_temp_reg_21 = _mm_setzero_si128();
+                m_temp_reg_23 = _mm_setzero_si128();
+            }
+
+            /* eee */
+            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+            {
+                /* Loading coeff and src for use in next block */
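+                /* m_temp_reg_20 is all zeros: cmpgt(0, x) yields 0xFFFF exactly
+                   for negative lanes, i.e. the sign word, so the unpacks below
+                   sign-extend row 0 to 32 bits; the shift left by 6 then applies
+                   the DC coefficient 64 */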
+                m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+                m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+                m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+
+                m_temp_reg_26 = m_temp_reg_24;
+                m_temp_reg_27 = m_temp_reg_25;
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20);
+            }
+
+            /* eo */
+            {
+                WORD16 *pi2_scratch = temp_array;
+                WORD32 out_stride = 8;
+
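+                /* Each pair of stores below writes the butterfly outputs
+                   e[k] = ee[k] + eo[k] and e[7-k] = ee[k] - eo[k] into
+                   consecutive 8-element slots of temp_array */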
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* eeo is zero here, so ee[0] = ee[3] = eee[0] (m_temp_reg_24-25) */
+
+                    /* e[0][0-3] stored in temp_array slot 0 */
+                    /* e[7][0-3] stored in temp_array slot 1 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    /* eeo is zero here, so ee[0] = ee[3] = eee[0] (m_temp_reg_24-25) */
+
+                    /* e[0][4-7] stored in temp_array slot 2 */
+                    /* e[7][4-7] stored in temp_array slot 3 */
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* eeo is zero here, so ee[1] = ee[2] = eee[1] (m_temp_reg_26-27) */
+
+                    /* e[1][0-3] stored in temp_array slot 4 */
+                    /* e[6][0-3] stored in temp_array slot 5 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    /* eeo is zero here, so ee[1] = ee[2] = eee[1] (m_temp_reg_26-27) */
+
+                    /* e[1][4-7] stored in temp_array slot 6 */
+                    /* e[6][4-7] stored in temp_array slot 7 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* e[2][0-3] stored in temp_array slot 8 */
+                    /* e[5][0-3] stored in temp_array slot 9 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    /* e[2][4-7] stored in temp_array slot 10 */
+                    /* e[5][4-7] stored in temp_array slot 11 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* e[3][0-3] stored in temp_array slot 12 */
+                    /* e[4][0-3] stored in temp_array slot 13 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    /* e[3][4-7] stored in temp_array slot 14 */
+                    /* e[4][4-7] stored in temp_array slot 15 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+            }
+        }
+        else if(zero_last8_rows_stg2)
+        {
+            /* eeo */
+            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+            {
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+                pi2_src_temp += (stride);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
+
+                if(!i)
+                {
+                    pi2_src_temp += (stride * 6 + 8);
+                }
+                else
+                {
+                    pi2_src_temp += (stride * 2 + 8);
+                }
+
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
+                pi2_src_temp -= (stride);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+
+                m_temp_reg_76 = _mm_setzero_si128();
+
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+            }
+
+            /* eee */
+            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+            {
+                /* Loading coeff and src for use in next block */
+
+
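+                /* Same sign-extension trick as the earlier branch: cmpgt against
+                   zero produces the sign mask, and slli by 6 multiplies row 0 by
+                   the DC coefficient 64 */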
+                m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+                m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+                m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+
+                m_temp_reg_26 = m_temp_reg_24;
+                m_temp_reg_27 = m_temp_reg_25;
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+            }
+
+            /* eo */
+            {
+                WORD16 *pi2_scratch = temp_array;
+                WORD32 out_stride = 8;
+
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+                    /* e[0][0-3] stored in temp_array slot 0 */
+                    /* e[7][0-3] stored in temp_array slot 1 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+                    /* e[0][4-7] stored in temp_array slot 2 */
+                    /* e[7][4-7] stored in temp_array slot 3 */
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+                    /* e[1][0-3] stored in temp_array slot 4 */
+                    /* e[6][0-3] stored in temp_array slot 5 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+                    /* e[1][4-7] stored in temp_array slot 6 */
+                    /* e[6][4-7] stored in temp_array slot 7 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* e[2][0-3] stored in temp_array slot 8 */
+                    /* e[5][0-3] stored in temp_array slot 9 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    /* e[2][4-7] stored in temp_array slot 10 */
+                    /* e[5][4-7] stored in temp_array slot 11 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* e[3][0-3] stored in temp_array slot 12 */
+                    /* e[4][0-3] stored in temp_array slot 13 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    /* e[3][4-7] stored in temp_array slot 14 */
+                    /* e[4][4-7] stored in temp_array slot 15 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+            }
+        }
+        else
+        {
+            /* eeo */
+            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+            {
+
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+                pi2_src_temp += (stride);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
+                pi2_src_temp += (stride * 7);
+                m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8
+                pi2_src_temp += (stride);
+                m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12
+                if(!i)
+                {
+                    pi2_src_temp += (stride * 6 + 8);
+                }
+                else
+                {
+                    pi2_src_temp += (stride * 2 + 8);
+                }
+                m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14
+                pi2_src_temp -= (stride);
+                m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10
+                pi2_src_temp -= (stride * 7);
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
+                pi2_src_temp -= (stride);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
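+                /* eeo[0] = 83*row4 + 36*row12, eeo[1] = 36*row4 - 83*row12 */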
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+            }
+
+            /* eee */
+            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+            {
+                /* Loading coeff and src for use in next block */
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
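+                /* eee[0] = 64*(row0 + row8), eee[1] = 64*(row0 - row8) */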
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
+
+                m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
+
+                m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+                m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+
+            }
+
+            /* eo */
+            {
+                WORD16 *pi2_scratch = temp_array;
+                WORD32 out_stride = 8;
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                    m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+
+                    /* e[0][0-3] stored in temp_array slot 0 */
+                    /* e[7][0-3] stored in temp_array slot 1 */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+
+                }
+
+                /* eo0[4-7] */
+                {
+
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+                    /* e[0][4-7] stored in temp_array[1][0-7] */
+                    /* e[7][4-7] stored in temp_array[1][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+                }
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+                    /* e[1][0-3] stored in temp_array[2][0-7] */
+                    /* e[6][0-3] stored in temp_array[2][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+                    /* e[1][4-7] stored in temp_array[3][0-7] */
+                    /* e[6][4-7] stored in temp_array[3][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+                    /* e[2][0-3] stored in temp_array[4][0-7] */
+                    /* e[5][0-3] stored in temp_array[4][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+                    /* e[2][4-7] stored in temp_array[5][0-7] */
+                    /* e[5][4-7] stored in temp_array[5][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+                    /* e[3][0-3] stored in temp_array[6][0-7] */
+                    /* e[4][0-3] stored in temp_array[6][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    /* e[3][4-7] stored in temp_array[7][0-7] */
+                    /* e[4][4-7] stored in temp_array[7][8-15] */
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += out_stride;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += out_stride;
+                }
+            }
+        }
+
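+        /*
+         * Odd half of the stage-2 butterfly (HM-style partial butterfly). Each
+         * o[k] accumulates the odd first-stage rows, e.g.
+         *   o[0] = 90*a1 + 87*a3 + 80*a5 + 70*a7 + 57*a9 + 43*a11 + 25*a13 + 9*a15
+         * with the remaining o[k] reusing the same magnitudes under the DCT
+         * odd-basis sign pattern. The three branches below skip rows known to be
+         * zero after stage 1, then produce the pre-transposed outputs
+         * (e[k] + o[k] + rnd) >> shift and (e[k] - o[k] + rnd) >> shift.
+         */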
+        if(zero_last12_rows_stg2)
+        {
+            /* o & stage 2 pre-transposed out */
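+            /* Only first-stage rows 0-3 can be non-zero here, so each o[k] needs
+               a single pmaddwd over the interleaved rows 1 and 3. */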
+            {
+                WORD32 j;
+                WORD16 *pi2_src_scratch = temp_array;
+                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+                WORD32 out_stride = (trans_size);
+                WORD32 in_stride = (8) * 4;
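+
+                /* pi2_dst_scratch walks pi2_tmp in the pre-transposed order that
+                   the Transpose block below reads back; the (i)/(!i) factors in
+                   the pointer updates interleave the two 8-column halves. */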
+
+                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+
+                pi2_src_temp += (stride * 9);
+
+                if(0 == i)
+                {
+                    pi2_src_temp -= (stride * 2 - 8);
+                }
+                else
+                {
+                    pi2_src_temp -= (stride * 6 - 8);
+                }
+                pi2_src_temp -= (stride * 9);
+
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
+                for(j = 0; j < 2; j++)
+                {
+                    if(j)
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                    }
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
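+                        /* Broadcast the rounding constant: each output is
+                           (e +/- o + (1 << (i4_shift - 1))) >> i4_shift, then
+                           packs_epi32 saturates the two 32-bit halves to 16 bit. */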
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+
+
+                }
+            }
+        }
+        else if(zero_last8_rows_stg2)
+        {
+            /* o & stage 2 pre-transposed out */
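+            /* First-stage rows 0-7 may be non-zero: each o[k] takes two pmaddwd
+               terms, over the interleaved row pairs (1,3) and (5,7). */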
+            {
+                WORD32 j;
+                WORD16 *pi2_src_scratch = temp_array;
+                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+                WORD32 out_stride = (trans_size);
+                WORD32 in_stride = (8) * 4;
+
+                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+                pi2_src_temp += (stride);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
+                pi2_src_temp += (stride * 8);
+
+                if(0 == i)
+                {
+                    pi2_src_temp -= (stride * 2 - 8);
+                }
+                else
+                {
+                    pi2_src_temp -= (stride * 6 - 8);
+                }
+
+                pi2_src_temp -= (stride * 8);
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
+                pi2_src_temp -= (stride);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
+                for(j = 0; j < 2; j++)
+                {
+                    if(j)
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+                    }
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+
+                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+                }
+            }
+        }
+        else
+        {
+            /* o & stage 2 pre-transposed out */
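+            /* No zero-row shortcut: each o[k] accumulates four pmaddwd terms over
+               the interleaved row pairs (1,3), (5,7), (9,11) and (13,15). */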
+            {
+                WORD32 j;
+                WORD16 *pi2_src_scratch = temp_array;
+                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+                WORD32 out_stride = (trans_size);
+                WORD32 in_stride = (8) * 4;
+
+                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+                pi2_src_temp += (stride);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
+                pi2_src_temp += (stride * 7);
+                m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9
+                pi2_src_temp += (stride);
+                m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13
+                if(0 == i)
+                {
+                    pi2_src_temp -= (stride * 2 - 8);
+                }
+                else
+                {
+                    pi2_src_temp -= (stride * 6 - 8);
+                }
+                m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15
+                pi2_src_temp -= (stride);
+                m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11
+                pi2_src_temp -= (stride * 7);
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
+                pi2_src_temp -= (stride);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
+                for(j = 0; j < 2; j++)
+                {
+
+                    if(j) //H8B = higher 8 bytes, L8B = lower 8 bytes
+                    {
+                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+                        m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
+                        m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
+                    }
+                    else
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+                        m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
+                        m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
+                    }
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
+
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
+                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
+                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
+                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
+                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
+
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
+                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
+                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += ((!i) * out_stride + 8);
+                    }
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
+                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
+
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+                    }
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += (i * out_stride + 8);
+                    }
+
+                }
+            }
+        }
+    }
+
+    /* Transpose */
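+    /*
+     * Stage-2 results sit in pi2_tmp in a pre-transposed, interleaved layout.
+     * The unpack network below (epi16 -> epi32 -> epi64) reassembles each output
+     * row, adds the prediction bytes (zero-extended against the zero register
+     * m_temp_reg_1) and clips the reconstruction to 8 bit via the unsigned
+     * saturation of _mm_packus_epi16.
+     */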
+    {
+        WORD16 *pi2_src_scratch;
+        UWORD8 *pu1_pred_temp = pu1_pred;
+        WORD32 out_stride = dst_strd;
+        WORD32 in_stride = trans_size;
+        WORD32 j;
+        m_temp_reg_1 = _mm_setzero_si128();
+        for(i = 0; i < 2; i++)
+        {
+            pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp;
+
+            for(j = 0; j < 2; j++)
+            {
+                m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
+                pi2_src_scratch += in_stride;
+                m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
+                pi2_src_scratch += ((!i) * in_stride + 8);
+                m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
+                pi2_src_scratch += (in_stride);
+                m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
+                pi2_src_scratch += (i * in_stride + 8);
+                m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
+                pi2_src_scratch += in_stride;
+                m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
+                pi2_src_scratch += ((!i) * in_stride + 8);
+                m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
+                pi2_src_scratch += in_stride;
+                m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
+                pi2_src_scratch += (i * in_stride + 8);
+
+                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
+                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
+
+                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
+                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
+
+                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
+                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
+
+                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
+                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
+
+
+                m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
+                m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
+
+                m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
+                m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
+
+                m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
+                m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
+
+                m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
+                m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
+
+
+                m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
+                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+                m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
+                m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0);
+                m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12);
+
+                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+                pu1_dst += out_stride;
+                pu1_pred_temp += pred_strd;
+
+                m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
+                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+                m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
+                m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0);
+                m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12);
+
+                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45);
+                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+                pu1_dst += out_stride;
+                pu1_pred_temp += pred_strd;
+
+                m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
+                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+                m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
+                m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0);
+                m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12);
+
+                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46);
+                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+                pu1_dst += out_stride;
+                pu1_pred_temp += pred_strd;
+
+                m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
+                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+                m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
+                m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0);
+                m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12);
+
+                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47);
+                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+                pu1_dst += out_stride;
+                pu1_pred_temp += pred_strd;
+            }
+        }
+    }
+}
diff --git a/common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c b/common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c
new file mode 100644
index 0000000..1883758
--- /dev/null
+++ b/common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c
@@ -0,0 +1,6628 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_itrans_recon_32x32_ssse3_intr.c
+ *
+ * @brief
+ *  Contains function definitions for inverse transform and reconstruction
+ *
+ * @author
+ *  100470
+ *
+ * @par List of Functions:
+ *  - ihevc_itrans_recon_32x32_ssse3()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform and reconstruction for a 32x32
+ * input block
+ *
+ * @par Description:
+ *  Performs inverse transform and adds the prediction data and clips output
+ * to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 32x32 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 32x32 block
+ *
+ * @param[out] pu1_dst
+ *  Output 32x32 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
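+/*
+ * For reference, a scalar sketch of the even/odd butterfly that the SIMD
+ * below vectorises (a sketch only: g_ai2_ihevc_trans_32 is assumed to be
+ * the 32-point transform matrix from ihevc_trans_tables.h; strides omitted):
+ *
+ *   o[k]    = sum_{r=0..15} g_ai2_ihevc_trans_32[2*r+1][k] * src[2*r+1]   k = 0..15
+ *   eo[k]   = sum_{r=0..7}  g_ai2_ihevc_trans_32[4*r+2][k] * src[4*r+2]   k = 0..7
+ *   eeo[k]  = sum_{r=0..3}  g_ai2_ihevc_trans_32[8*r+4][k] * src[8*r+4]   k = 0..3
+ *   eeeo[k] = rows 8, 24;   eeee[k] = rows 0, 16                          k = 0..1
+ *
+ *   eee[0] = eeee[0] + eeeo[0];    eee[3] = eeee[0] - eeeo[0];
+ *   eee[1] = eeee[1] + eeeo[1];    eee[2] = eeee[1] - eeeo[1];
+ *   ee[k]  = eee[k] + eeo[k];      ee[7-k]  = eee[k] - eeo[k];
+ *   e[k]   = ee[k]  + eo[k];       e[15-k]  = ee[k]  - eo[k];
+ *   tmp[k]    = (e[k] + o[k] + rnd) >> shift;
+ *   tmp[31-k] = (e[k] - o[k] + rnd) >> shift;
+ *
+ * The same butterfly is then applied to the transposed tmp, and the result
+ * is added to pu1_pred with unsigned 8-bit saturation.
+ */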
+
+void ihevc_itrans_recon_32x32_ssse3(WORD16 *pi2_src,
+                                    WORD16 *pi2_tmp,
+                                    UWORD8 *pu1_pred,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 src_strd,
+                                    WORD32 pred_strd,
+                                    WORD32 dst_strd,
+                                    WORD32 zero_cols,
+                                    WORD32 zero_rows)
+{
+    /* Inverse Transform */
+
+    WORD32 j;
+
+
+    WORD16 *pi2_tmp_orig;
+
+
+    /*MEM_ALIGN16  WORD32 temp_array[1024];
+    MEM_ALIGN16  WORD16 temp1_array[1024];*/
+    WORD16 *o_temp_ptr;
+    WORD16 *temp_ptr;
+
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_4;
+    __m128i m_temp_reg_5;
+    __m128i m_temp_reg_6;
+    __m128i m_temp_reg_7;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_15;
+    __m128i m_temp_reg_16;
+    __m128i m_temp_reg_17;
+    __m128i m_temp_reg_18;
+    __m128i m_temp_reg_19;
+    __m128i m_temp_reg_20;
+    __m128i m_temp_reg_21;
+    __m128i m_temp_reg_22;
+    __m128i m_temp_reg_23;
+    __m128i m_temp_reg_30;
+    __m128i m_temp_reg_31;
+    __m128i m_temp_reg_32;
+    __m128i m_temp_reg_33;
+    __m128i m_temp_reg_34;
+    __m128i m_temp_reg_35;
+    __m128i m_temp_reg_36;
+    __m128i m_temp_reg_37;
+    __m128i m_temp_reg_40;
+    __m128i m_temp_reg_41;
+    __m128i m_temp_reg_42;
+    __m128i m_temp_reg_43;
+    __m128i m_temp_reg_44;
+    __m128i m_temp_reg_45;
+    __m128i m_temp_reg_46;
+    __m128i m_temp_reg_47;
+
+    __m128i m_temp_reg_70;
+    __m128i m_temp_reg_71;
+    __m128i m_temp_reg_72;
+    __m128i m_temp_reg_73;
+    __m128i m_temp_reg_74;
+    __m128i m_temp_reg_75;
+    __m128i m_temp_reg_76;
+    __m128i m_temp_reg_77;
+
+    __m128i m_temp_reg_80;
+    __m128i m_temp_reg_81;
+    __m128i m_temp_reg_82;
+    __m128i m_temp_reg_83;
+    __m128i m_temp_reg_84;
+    __m128i m_temp_reg_85;
+    __m128i m_temp_reg_86;
+    __m128i m_temp_reg_87;
+
+    __m128i m_temp_reg_90;
+    __m128i m_temp_reg_91;
+    __m128i m_temp_reg_92;
+    __m128i m_temp_reg_93;
+    __m128i m_temp_reg_94;
+    __m128i m_temp_reg_95;
+    __m128i m_temp_reg_96;
+    __m128i m_temp_reg_97;
+
+    __m128i m_rdng_factor;
+    __m128i m_count;
+    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+    __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
+
+    __m128i temp1, temp2, temp3, temp4;
+    __m128i temp5, temp6, temp7, temp8;
+
+    __m128i all_zero_reg;
+    WORD32 i;
+
+    WORD32  zero_last24_cols_stg1;
+    WORD32  zero_last24_rows_stg1;
+    WORD32  zero_last28_rows_stg1;
+
+    WORD32  zero_last28_rows_stg2;
+    WORD32  zero_last24_rows_stg2;
+
+    WORD32  trans_size_stg1;
+
+    WORD32 i4_shift = IT_SHIFT_STAGE_1;
+    WORD32 trans_size = TRANS_SIZE_32;
+
+
+    /* Flags marking inputs whose non-zero coefficients are confined to the */
+    /* first 8 (or 4) rows / columns of the 32x32 block                     */
+    zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
+    zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
+    zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
+
+    zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
+    zero_last24_rows_stg2 = zero_last24_cols_stg1; /* stage-2 rows are stage-1 columns */
+
+    if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
+    {
+        trans_size_stg1 = 8;
+    }
+    else
+    {
+        trans_size_stg1 = 32;
+    }
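+    /*
+     * zero_cols / zero_rows are bitmasks: bit i set means column / row i of
+     * pi2_src is entirely zero. Matching 0xFFFFFF00 (or 0xFFFFFFF0) above
+     * therefore means at most the first 8 (or 4) columns / rows carry
+     * coefficients; with at most 8 live columns, stage 1 needs only a
+     * single 8-column pass.
+     */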
+
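+    /*
+     * Multiply-accumulate idiom used throughout: _mm_unpacklo_epi16(a, b)
+     * interleaves a0 b0 a1 b1 ..., so _mm_madd_epi16 against a packed
+     * coefficient pair {c0, c1, c0, c1, ...} yields c0*a_i + c1*b_i in each
+     * 32-bit lane. Interleaving with all_zero_reg keeps only the first
+     * coefficient of each pair.
+     */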
+    all_zero_reg = _mm_setzero_si128();
+
+    o_temp_ptr  = pi2_tmp;
+    temp_ptr = (pi2_tmp + 1024);
+
+    pi2_tmp += 2048;
+    pi2_tmp_orig = pi2_tmp;
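+    /*
+     * Scratch layout, as inferred from the stores below: the regions at
+     * o_temp_ptr and temp_ptr hold 32-bit partial sums (each 128-bit store
+     * advances the WORD16 pointer by 8 entries), while the stage-1 output
+     * proper is written from pi2_tmp_orig onwards.
+     */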
+
+    for(i = 0; i < trans_size_stg1; i += 8)
+    {
+
+
+        {
+            WORD16 *pi2_tmp_src = pi2_src;
+
+            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+
+            m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src);
+        }
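+        /*
+         * The sixteen loads above fetch the first eight columns of the even
+         * input rows 0, 2, ..., 30; (src_strd << 1) steps the WORD16 pointer
+         * by two rows. m_temp_reg_70..77 hold rows 0..14 and
+         * m_temp_reg_80..87 hold rows 16..30.
+         */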
+
+        if(zero_last28_rows_stg1)
+        {
+            /* With the last 28 rows zero, rows 8, 16 and 24 contribute      */
+            /* nothing: eeeo[] = 0, eeee[0] = eeee[1] = 64*src_row0, and all */
+            /* four eee[] values reduce to the same DC term.                 */
+            {
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* eee[0..3] all equal the 64*src term in m_temp_reg_14 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_40 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_43 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_42 = m_temp_reg_14;
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_41 = m_temp_reg_14;
+
+                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* eee[0..3] for columns 4-7 likewise collapse to m_temp_reg_14 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_44 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_47 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_46 = m_temp_reg_14;
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_45 = m_temp_reg_14;
+
+
+            }
+            /* eo */
+            {
+                WORD16 *pi2_scratch = o_temp_ptr;
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
+
+                //m_temp_reg_10 = _mm_cvtepi16_epi32(m_temp_reg_71);
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
+
+                m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    //m_temp_reg_14 = _mm_cvtepi16_epi32(m_temp_reg_71);
+                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo2[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /**************************************************************************/
+
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo4[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+                /* eo4[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /***********************************************************************/
+
+                /* eo5[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo5[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo6[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo6[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo7[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo7[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+            }
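+            /*
+             * o_temp_ptr now holds, for the eight columns of this pass, the
+             * pairs (e[k], e[15-k]) for k = 0..7 as 32-bit values, columns
+             * 0-3 followed by columns 4-7 for each k.
+             */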
+        }
+        else if(zero_last24_rows_stg1)
+        {
+            {
+                /* With the last 24 rows zero, rows 8, 16 and 24 are absent: */
+                /* eeeo[] = 0 and eeee[0] = eeee[1] = 64*src_row0, so eee[]  */
+                /* again collapses to the DC term; eeo[] uses row 4 only.    */
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* eee[0..3] all equal the 64*src term in m_temp_reg_14 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_40 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_43 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_42 = m_temp_reg_14;
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_41 = m_temp_reg_14;
+
+                /* columns 4 to 7 */
+
+                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* eee[0..3] for columns 4-7 likewise collapse to m_temp_reg_14 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_44 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_47 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_46 = m_temp_reg_14;
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_45 = m_temp_reg_14;
+
+
+                /* eeo[] */
+                /* for(k = 0; k < 4; k++) */
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
+
+                m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+
+                m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
+
+                m_temp_reg_33 = _mm_setzero_si128();
+
+                /* eeo */
+                {
+                    /* eeo0[0-3] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                        m_temp_reg_90 = m_temp_reg_34;
+                        m_temp_reg_97 = m_temp_reg_35;
+                    }
+                    /* eeo0[4-7] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                        m_temp_reg_91 = m_temp_reg_34;
+                        m_temp_reg_96 = m_temp_reg_35;
+
+                    }
+
+                    /* eeo1[0-3] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+                        /* ee[1][0-3] -> m_temp_reg_92, ee[6][0-3] -> m_temp_reg_95 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+                        m_temp_reg_92 = m_temp_reg_34;
+                        m_temp_reg_95 = m_temp_reg_35;
+
+                    }
+
+                    /* eeo1[4-7] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
+
+                        /* ee[1][4-7] -> m_temp_reg_93, ee[6][4-7] -> m_temp_reg_94 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+                        m_temp_reg_93 = m_temp_reg_34;
+                        m_temp_reg_94 = m_temp_reg_35;
+
+
+                    }
+
+                    /* eeo2[0-3] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+                        /* ee[2][0-3] -> temp1, ee[5][0-3] -> temp7 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                        temp1 = m_temp_reg_34;
+                        temp7 = m_temp_reg_35;
+
+                    }
+
+                    /* eeo2[4-7] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
+
+                        /* ee[2][4-7] -> temp2, ee[5][4-7] -> temp6 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                        temp2 = m_temp_reg_34;
+                        temp6 = m_temp_reg_35;
+
+                    }
+
+                    /* eeo3[0-3] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                        /* ee[3][0-3] -> temp3, ee[4][0-3] -> temp5 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+                        temp3 = m_temp_reg_34;
+                        temp5 = m_temp_reg_35;
+
+                    }
+
+
+                    /* eeo3[4-7] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                        /* ee[3][4-7] -> temp4, ee[4][4-7] -> temp8 */
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+                        temp4 = m_temp_reg_34;
+                        temp8 = m_temp_reg_35;
+
+
+                    }
+                    /* ee[] values for this pass now held in m_temp_reg_90..97 and temp1..temp8 */
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+
+                    m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+                    m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+
+                }
+            }
+            /* eo */
+            {
+                WORD16 *pi2_scratch = o_temp_ptr;
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
+
+                /* eo2[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo2[4-7] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /**************************************************************************/
+
+
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
+
+                /* eo3[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo3[4-7] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
+
+                /* eo4[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+                /* eo4[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /***********************************************************************/
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
+
+                /* eo5[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo5[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
+
+                /* eo6[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo6[4-7] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
+
+                /* eo7[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo7[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+            }
+
+        }
+        else
+        {
+
+            {
+                /* eeo */
+                /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+                /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
+                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
+                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
+
+                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
+                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
+
+
+                /* eeeo[0]= m_temp_reg_20  */
+                /* eeeo[1]= m_temp_reg_22  */
+                /* eeee[0]= m_temp_reg_21  */
+                /* eeee[1]= m_temp_reg_23  */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);
+
+                /* columns 4 to 7 */
+
+                m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
+                m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
+
+                /* Interleaving row 8 and row 24*/
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+                m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
+
+                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
+                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
+                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
+
+                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
+                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
+
+
+                /* eeeo[0]= m_temp_reg_20  */
+                /* eeeo[1]= m_temp_reg_22  */
+                /* eeee[0]= m_temp_reg_21  */
+                /* eeee[1]= m_temp_reg_23  */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);
+
+
+                /* eeo[] */
+                /* for(k = 0; k < 4; k++) */
+
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
+
+                /* eeo */
+                {
+                    /* eeo0[0-3] */
+                    {
+                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                        m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    }
+
+                    m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+                    m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
+                    m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
+                    m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
+
+                    /* eeo0[4-7] */
+                    {
+                        m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+                        m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+                        m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+                    }
+
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
+
+                    /* eeo1[0-3] */
+                    {
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+                        m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
+                        m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
+
+                    }
+
+                    /* eeo1[4-7] */
+                    {
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+                        m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
+                        m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
+
+
+                    }
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
+
+                    /* eeo2[0-3] */
+                    {
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                        /* ee[2][0-3] -> temp1, ee[5][0-3] -> temp7 */
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+                        temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+                        temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+                    }
+
+                    /* eeo2[4-7] */
+                    {
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+                        /* ee[2][4-7] -> temp2, ee[5][4-7] -> temp6 */
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+                        temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+                        temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+                    }
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
+
+                    /* eeo3[0-3] */
+                    {
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                        /* ee[3][0-3] -> temp3, ee[4][0-3] -> temp5 */
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+                        temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+                        temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+
+                    }
+
+                    /* eeo3[4-7] */
+                    {
+
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+                        /* ee[3][4-7] -> temp4, ee[4][4-7] -> temp8 */
+
+                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+                        temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+                        temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+                    }
+
+
+                    /* ee[] values for this pass now held in m_temp_reg_90..97 and temp1..temp8 */
+
+                    /* for(k = 0; k < 8; k++) */
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+                }
+            }
+            /* eo */
+            {
+                WORD16 *pi2_scratch = o_temp_ptr;
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
+
+                m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+                m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+                m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
+                m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
+
+                m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
+                m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
+                m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
+                m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+                /* eo0[4-7] */
+                {
+                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                    m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
+                    m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
+
+                /* eo1[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /* eo1[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo2[4-7] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+                /**************************************************************************/
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo3[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
+
+                /* eo4[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo4[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                /***********************************************************************/
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
+
+                /* eo5[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo5[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
+
+                /* eo6[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo6[4-7] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
+
+                /* eo7[0-3] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+
+                /* eo7[4-7] */
+                {
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
+                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
+
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+                    pi2_scratch += 8;
+                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+                    pi2_scratch += 8;
+
+                }
+
+            }
+
+        }
+        /* All e[] terms are done */
+        /****************************/
+
+
+        {
+
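+            /* gather the odd input rows (pi2_src + src_strd, then every second row) that feed the o[] terms */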
+            WORD16 *pi2_tmp_src = pi2_src + src_strd;
+
+            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+
+            m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src);
+            pi2_tmp_src += (src_strd << 1);
+            m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src);
+        }
+
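+        /* stage 1 odd processing: the zero_last*_rows_stg1 flags pick a reduced kernel that multiplies only the odd rows that can be non-zero */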
+        if(zero_last28_rows_stg1)
+        {
+            /* o[] and stage 1 output: with the last 28 rows zero, only odd rows 1 and 3 contribute, so one madd per output */
+            {
+                WORD32 j;
+                WORD16 *pi2_src_scratch = o_temp_ptr;
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = (trans_size << 1);
+                WORD32 in_stride = trans_size;
+
+                for(j = 0; j < 2; j++)
+                {
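+                    /* the second pass shifts the upper four columns of each row down into the low half */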
+                    if(j)
+                    {
+                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+                    }
+
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
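+                        /* broadcast the rounding constant (1 << (i4_shift - 1)) to all four 32-bit lanes */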
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+
+                    /* o1[0-3] */
+                    {
+
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+
+                    /* o2[0-3] */
+                    {
+
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+
+                    /* o5[0-3] */
+                    {
+
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+
+                    /* o7[0-3] */
+                    {
+
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+
+                    }
+
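+                    /* from o8[0-3] onwards the e[] scratch is re-read in reverse: both scratch pointers now step backwards */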
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+
+                    /* o8[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+
+                    /* o9[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+
+                    /* o10[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+
+                    /* o11[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+
+                    /* o12[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+
+                    /* o13[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+
+                    /* o14[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+
+                    /* o15[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                }
+            }
+        }
+        else if(zero_last24_rows_stg1)
+        {
+            /* o[] and stage 1 output: with the last 24 rows zero, odd rows 1, 3, 5 and 7 contribute, so two madds per output */
+            {
+                WORD32 j;
+                WORD16 *pi2_src_scratch = o_temp_ptr;
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = (trans_size << 1);
+                WORD32 in_stride = trans_size;
+
+                for(j = 0; j < 2; j++)
+                {
+                    if(j)
+                    {
+                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+                        m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+                        m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+                    }
+
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+
+                    /* o0[0-3] */
+                    {
+
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+
+                    }
+
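+                    /* as in the branch above, the remaining outputs walk the e[] scratch backwards */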
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+
+                    /* o8[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+
+                    /* o9[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+
+                    /* o10[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+
+                    /* o11[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+
+                    /* o12[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+
+                    /* o13[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+
+                    /* o14[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+
+                    /* o15[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                }
+            }
+        }
+        else
+        {
+            /* odd-part (o) terms combined with the stage-1 even-part outputs;
+               this path uses all eight packed coefficient vectors per output,
+               unlike the two-vector path above */
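+            /* For reference, a scalar sketch of what each oN block below
+               computes; an illustrative reading of the intrinsics, not a
+               drop-in replacement (clip16 = saturate to 16 bits):
+
+                   o = sum over the odd input rows r of coeff[N][r] * src[r];
+                   e = stage-1 even part loaded from pi2_src_scratch;
+                   e + o and e - o are rounded with 1 << (i4_shift - 1),
+                   shifted right by i4_shift, clip16-packed and stored as one
+                   128-bit block to pi2_dst_scratch (add/sub lane order varies
+                   per block). */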
+            {
+                WORD32 j;
+                WORD16 *pi2_src_scratch = o_temp_ptr;
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = (trans_size << 1);
+                WORD32 in_stride = trans_size;
+
+                for(j = 0; j < 2; j++)
+                {
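+                    /* j == 0 consumes the low four 16-bit lanes of each packed
+                       row register; j == 1 first shifts each register right by
+                       8 bytes so the high four lanes move into the low half */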
+                    if(j)
+                    {
+                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+                        m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+                        m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+                        m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
+                        m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
+                        m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
+                        m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
+
+                        m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
+                        m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
+                        m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
+                        m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
+                        m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
+                        m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
+                        m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
+                        m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
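+                    /* eight coefficient vectors per output, one per interleaved
+                       pair of odd rows (rows 1..31) */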
+
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
+                    m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
+                    temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
+                    temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
+                    temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
+                    temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
+
+                    /* o0[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
+
+                    /* o1[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
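+                        /* the add/sub mix differs between output blocks here,
+                           presumably matching signs folded into the packed
+                           coefficient table */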
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
+
+                    /* o2[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
+
+                    /* o3[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
+
+                    /* o4[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
+
+                    /* o5[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
+
+                    /* o6[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
+
+                    /* o7[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
+
+                    /* o8[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
+
+                    /* o9[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
+
+                    /* o10[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
+
+                    /* o11[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
+
+
+                    /* o12[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
+
+
+                    /* o13[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
+
+
+                    /* o14[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch -= in_stride;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch -= out_stride;
+
+                    }
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
+                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
+                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
+                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
+                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
+
+                    /* o15[0-3] */
+                    {
+                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                        pi2_src_scratch += 8;
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                        m_count = _mm_cvtsi32_si128(i4_shift);
+                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                        pi2_dst_scratch += 8;
+                    }
+
+                }
+            }
+        }
+        /* Transpose */
+        {
+            WORD16 *pi2_src_scratch = temp_ptr;
+            WORD16 *pi2_dst_scratch = pi2_tmp;
+            WORD32 in_stride = (trans_size << 1);
+
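+            /* Transpose the stage-1 butterfly outputs back to row-major     */
+            /* order using three unpack stages (16-, 32- and 64-bit). The    */
+            /* second set of eight rows is read back to front, and several   */
+            /* unpack operands are swapped, to undo the mirrored order in    */
+            /* which the butterfly differences (e[k] - o[k]) were stored.    */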
+            for(j = 0; j < 2; j++)
+            {
+                m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += in_stride;
+                m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += 8;
+
+                m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch -= in_stride;
+                m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
+                pi2_src_scratch += 8;
+
+
+                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
+                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
+
+                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
+                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
+
+                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
+                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
+
+                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
+                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
+
+                m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+                m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
+
+                m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+                m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
+
+                m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
+                m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
+
+                m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
+                m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
+
+                /* 32-bit unpack stage */
+
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
+                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
+
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
+
+                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
+                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
+
+                m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
+                m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
+
+                m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
+                m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
+
+                m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
+                m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
+
+                m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
+                m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
+
+                /* 64-bit unpack stage */
+
+                m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
+                m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
+
+                m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
+                m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
+
+                m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
+                m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
+
+                m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
+                m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
+
+                m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
+                m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
+
+                m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
+                m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
+
+                m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
+                m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
+
+                m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
+                m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
+
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
+
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
+
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
+
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
+                _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
+
+                pi2_dst_scratch += 4 * trans_size;
+            }
+        }
+        pi2_src += 8;
+//      pi2_dequant_coeff +=8;
+        pi2_tmp += 8 * trans_size;
+        zero_cols = zero_cols >> 1;
+    }
+
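+    /* Stage-1 outputs exist only for the first trans_size_stg1 rows; the  */
+    /* remaining rows of the 32x32 intermediate buffer are cleared so that */
+    /* stage 2 can read all 32 rows unconditionally.                       */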
+    if(trans_size_stg1 != TRANS_SIZE_32)
+    {
+        m_temp_reg_10 = _mm_setzero_si128();
+
+        for(i = trans_size_stg1; i < 32; i += 8)
+        {
+            WORD16 *pi2_dst_scratch = pi2_tmp;
+
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
+
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
+
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
+
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
+
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
+
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
+
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
+
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
+            _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
+
+            pi2_tmp += 8 * trans_size;
+        }
+    }
+
+    pi2_tmp = pi2_tmp_orig;
+
+    /* Inverse Transform 2nd stage */
+
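+    /* Stage 2 works on the transposed intermediate block, producing four */
+    /* columns of all 32 output rows per iteration of this loop, with the */
+    /* stage-2 rounding shift IT_SHIFT_STAGE_2.                           */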
+    for(j = 0; j < trans_size; j += 4)
+    {
+        i4_shift = IT_SHIFT_STAGE_2;
+
+        /* Utilize the symmetry of the transform matrix to minimize the number of multiplications */
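+        /* The 32-point IDCT is decomposed recursively: e[]/o[] from even and */
+        /* odd input rows, ee[]/eo[] from the even half, and so on. When most */
+        /* rows are zero, whole sub-transforms drop out and each remaining    */
+        /* term needs only a single packed multiply-accumulate.               */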
+        if(zero_last28_rows_stg2)
+        {
+            {
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
+
+                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+                }
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                }
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+                }
+                /* eo4[0-3] */
+                {
+                    m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+                }
+
+                /* eo5[0-3] */
+                {
+                    m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
+                }
+
+                /* eo6[0-3] */
+                {
+                    m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
+                }
+                /* eo7[0-3] */
+                {
+                    m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
+                }
+            }
+
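+            /* With every even input row beyond row 0 zero, ee[k] collapses */
+            /* to 64 * src[0] for all k, so the two madds below compute the */
+            /* same value.                                                  */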
+            m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+
+            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+            m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+            m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+            /* e[] */
+
+            temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90);  /* e[0] */
+            temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90);  /* e[15] */
+
+            temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91);  /* e[1] */
+            temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91);  /* e[14] */
+
+            temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92);  /* e[2] */
+            temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92);  /* e[13] */
+
+            temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93);  /* e[3] */
+            temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93);  /* e[12] */
+
+            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94);  /* e[4] */
+            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94);  /* e[11] */
+
+            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95);  /* e[5] */
+            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95);  /* e[10] */
+
+            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96);  /* e[6] */
+            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96);  /* e[9] */
+
+            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97);  /* e[7] */
+            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97);  /* e[8] */
+
+            /* o[k] */
+            {
+
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = 8;
+
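+                /* Only odd rows 1 and 3 are nonzero here, so each o[k] is a   */
+                /* single madd of the interleaved row pair against one packed  */
+                /* coefficient vector before the usual e[k] +/- o[k] butterfly. */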
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+
+
+                /* o0[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+
+                /* o1[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+
+                /* o2[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+
+                /* o3[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+
+                /* o4[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+
+                /* o5[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+
+                /* o6[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+
+                /* o7[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+
+                /* o8[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+
+                /* o9[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+
+                /* o10[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+
+                /* o11[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+
+                /* o12[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+
+                /* o13[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+
+                /* o14[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+
+                /* o15[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += 8;
+                }
+
+            }
+
+        }
+        else if(zero_last24_rows_stg2)
+        {
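+            /* Only the first eight rows are nonzero: eo[] needs rows 2 and 6, */
+            /* eeo[] needs row 4, the eee terms reduce to row 0, and o[] uses  */
+            /* rows 1, 3, 5 and 7.                                             */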
+            /* eo */
+            {
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+
+                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+                m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
+
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
+
+                /* eo3[0-3] */
+                {
+
+                    m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
+
+                /* eo4[0-3] */
+                {
+                    m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
+
+                /* eo5[0-3] */
+                {
+                    m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
+                /* eo6[0-3] */
+                {
+                    m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
+                /* eo7[0-3] */
+                {
+                    m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+            }
+
+            /* eeo */
+            {
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
+
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
+
+                /* eeo0[0-3] */
+                {
+                    temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                }
+
+                /* eeo1[0-3] */
+                {
+                    temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+                }
+
+                /* eeo2[0-3] */
+                {
+                    temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+                }
+
+
+                /* eeo3[0-3] */
+                {
+                    temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                }
+
+            }
+
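+            /* Row 16 is zero, so eee[0] = 64*(src[0] + src[16]) and          */
+            /* eee[1] = 64*(src[0] - src[16]) both reduce to 64 * src[0];     */
+            /* hence the two identical madds below.                           */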
+            m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
+            m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36
+            m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+
+            //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70);
+            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+            m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+            m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+            m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1);  /* ee[0] */
+            m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1);  /* ee[7] */
+
+            m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2);  /* ee[1] */
+            m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2);  /* ee[6] */
+
+            m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3);  /* ee[2] */
+            m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3);  /* ee[5] */
+
+            m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4);  /* ee[3] */
+            m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4);  /* ee[4] */
+
+            /* e[] */
+
+            temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[0] */
+            temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[15] */
+
+            temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[1] */
+            temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[14] */
+
+            temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[2] */
+            temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[13] */
+
+            temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[3] */
+            temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[12] */
+
+            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[4] */
+            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[11] */
+
+            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[5] */
+            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[10] */
+
+            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[6] */
+            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[9] */
+
+            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[7] */
+            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[8] */
+
+            /* o[k] */
+            {
+
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = 8;
+
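+                /* Four nonzero odd rows (1, 3, 5, 7) form two interleaved    */
+                /* pairs, so each o[k] takes two madds and one add before the */
+                /* butterfly with e[k].                                       */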
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+
+                /* o0[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
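+                    /* splat the 32-bit rounding constant (1 << (i4_shift - 1)) to all four lanes */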
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+
+                /* o1[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+
+                /* o2[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+
+                /* o3[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+
+                /* o4[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+
+                /* o5[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+
+                /* o6[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+
+                /* o7[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+
+                /* o8[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+
+                /* o9[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+
+                /* o10[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+
+                /* o11[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+
+                /* o12[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+
+                /* o13[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+
+                /* o14[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+
+                /* o15[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+            }
+        }
+        else
+        {
+            /* eo */
+            {
+
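+                /* eo[j] (j = 0..7): odd half of the 16-point stage applied to rows
+                   2,6,...,30; rows are interleaved pairwise so each _mm_madd_epi16
+                   folds two rows against a packed coefficient pair */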
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+
+
+                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+                m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
+                m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
+                m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
+                m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
+                m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
+                m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
+                m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
+
+                /* eo0[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
+
+                /* eo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
+
+                /* eo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
+
+                /* eo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
+
+
+                /* eo4[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+                    m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
+
+                /* eo5[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
+
+                /* eo6[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
+
+                /* eo7[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+                    m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+
+                }
+
+            }
+
+            /* eeo */
+            {
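+                /* eeo[j] (j = 0..3): odd half of the 8-point stage applied to rows 4,12,20,28 */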
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
+
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
+                m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
+                m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
+                m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
+
+                /* eeo0[0-3] */
+                {
+
+                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+                    temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                }
+
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
+
+                /* eeo1[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                    temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                }
+
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
+
+                /* eeo2[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                    temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                }
+
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
+
+                /* eeo3[0-3] */
+                {
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                    temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                }
+
+
+            }
+
+            m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+            m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+            m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+            m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
+
+            m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
+            m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
+
+            m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+            m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
+
+            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
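+            /* rows 0,16 give eeee[0..1]; rows 8,24 give eeeo[0..1] */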
+            m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
+            m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
+
+            m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
+            m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
+
+/* eeeo[0] = m_temp_reg_20 */
+/* eeee[0] = m_temp_reg_21 */
+/* eeeo[1] = m_temp_reg_22 */
+/* eeee[1] = m_temp_reg_23 */
+
+            /* eee[0] = eeee[0] + eeeo[0]; */
+            m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eee[0] */
+
+            /* eee[3] = eeee[0] - eeeo[0]; */
+            m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eee[3] */
+
+            /* eee[2] = eeee[1] - eeeo[1]; */
+            m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eee[2] */
+
+            /* eee[1] = eeee[1] + eeeo[1]; */
+            m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eee[1] */
+
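+            /* ee[k] = eee[k] + eeo[k], ee[7-k] = eee[k] - eeo[k] (k = 0..3) */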
+            m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1);  /* ee[0] */
+            m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1);  /* ee[7] */
+
+            m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2);  /* ee[1] */
+            m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2);  /* ee[6] */
+
+            m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3);  /* ee[2] */
+            m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3);  /* ee[5] */
+
+            m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4);  /* ee[3] */
+            m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4);  /* ee[4] */
+
+/* e[k] = ee[k] + eo[k], e[15-k] = ee[k] - eo[k] (k = 0..7) */
+
+            temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[0] */
+            temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[15] */
+
+            temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[1] */
+            temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[14] */
+
+            temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[2] */
+            temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[13] */
+
+            temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[3] */
+            temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[12] */
+
+            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[4] */
+            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[11] */
+
+            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[5] */
+            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[10] */
+
+            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[6] */
+            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[9] */
+
+            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[7] */
+            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[8] */
+
+/*o[k] */
+            {
+
+                WORD16 *pi2_dst_scratch = temp_ptr;
+                WORD32 out_stride = 8;
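+
+                /* o[k] here draws on all sixteen odd rows (1,3,...,31); as a scalar
+                 * sketch (a reading aid, not the literal source layout):
+                 *   o[k]      = sum over j = 0..15 of coeff[j][k] * row(2*j + 1)
+                 *   dst[k]    = sat16((e[k] + o[k] + rnd) >> i4_shift)
+                 *   dst[31-k] = sat16((e[k] - o[k] + rnd) >> i4_shift)
+                 */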
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
+
+
+                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
+                m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
+                m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
+                m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
+                m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
+                m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);
+
+                m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
+                m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
+                m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
+                m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
+                m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
+                m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
+                m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
+                m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
+                m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
+                m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
+                m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
+                m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
+
+                /* o0[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
+
+                /* o1[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
+
+                /* o2[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
+
+                /* o3[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
+
+                /* o4[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
+
+                /* o5[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
+
+                /* o6[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
+
+                /* o7[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += 8;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
+
+                /* o8[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
+
+                /* o9[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
+
+                /* o10[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
+
+                /* o11[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
+
+                /* o12[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
+
+                /* o13[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
+
+                /* o14[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += out_stride;
+
+                }
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
+                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
+                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
+                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
+                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
+
+                /* o15[0-3] */
+                {
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_count = _mm_cvtsi32_si128(i4_shift);
+                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                    pi2_dst_scratch += 8;
+                }
+
+            }
+
+        }
+
+        /* Transpose */
+        {
+
+            WORD16 *pi2_src_scratch = temp_ptr;
+            WORD32 out_stride = dst_strd;
+            WORD32 in_stride = 8;
+
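+            /* Transpose the four 32-sample rows accumulated in the scratch
+               buffer by interleaving at 16-bit, then 32-bit, then 64-bit
+               granularity; the reconstructed rows are then formed by adding
+               the prediction and stored below */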
+            m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += 8;
+
+            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += in_stride;
+            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
+            pi2_src_scratch += 8;
+
+
+            m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
+            m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
+
+            m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
+            m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
+
+            m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
+            m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
+
+            m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
+            m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
+
+            m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+            m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
+
+            m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+            m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
+
+            m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
+            m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
+
+            m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
+            m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
+
+
+            m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
+            m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
+
+            m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
+            m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
+
+            m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
+            m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
+
+            m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
+            m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
+
+            m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
+            m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
+
+            m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
+            m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
+
+            m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
+            m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
+
+            m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
+            m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
+
+
+            m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);       // row0 = 0-7
+            m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);       // row1 = 0-7
+
+            m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);     // row0=24-31
+            m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);     // row1=24-31
+
+            m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);       // row0=8-15
+            m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);       // row1=8-15
+
+            m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);     // row0=16-23
+            m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);     // row1=16-23
+
+            m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);      // row2 =0-7
+            m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);      // row3 =0-7
+
+            m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);    // row2=24-31
+            m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);    // row3=24-31
+
+            m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);      // row2=8-15
+            m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);      // row3=8-15
+
+            m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);    // row2=16-23
+            m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);    // row3=16-23
+
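+            /* Reconstruction: zero-extend the 8-bit prediction to 16 bit,
+               add the residue rows, and pack with unsigned saturation so
+               the output is clipped to 8 bit */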
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+            pu1_dst += out_stride;
+            pu1_pred += pred_strd;
+
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+            pu1_dst += out_stride;
+            pu1_pred += pred_strd;
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+            pu1_dst += out_stride;
+            pu1_pred += pred_strd;
+
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
+            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
+            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+            pu1_dst += out_stride;
+            pu1_pred += pred_strd;
+
+        }
+        pi2_tmp += 4;
+    }
+}
+
+
diff --git a/common/x86/ihevc_itrans_recon_sse42_intr.c b/common/x86/ihevc_itrans_recon_sse42_intr.c
new file mode 100644
index 0000000..b472486
--- /dev/null
+++ b/common/x86/ihevc_itrans_recon_sse42_intr.c
@@ -0,0 +1,2503 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_itrans_recon_sse42_intr.c
+ *
+ * @brief
+ *  Contains function definitions for inverse quantization, inverse
+ * transform and reconstruction
+ *
+ * @author
+ *  100470
+ *  100592 (editor)
+ *
+ * @par List of Functions:
+ *  - ihevc_itrans_recon_4x4_ttype1_sse42()
+ *  - ihevc_itrans_recon_4x4_sse42()
+ *  - ihevc_itrans_recon_8x8_sse42()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
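+
+/* Illustration only (kept under #if 0): a minimal scalar sketch of the 4x4
+ * inverse DST that ihevc_itrans_recon_4x4_ttype1_sse42() below vectorises.
+ * It assumes the 29/55/74 coefficient set of the type-1 transform and the
+ * CLIP_S16 macro from ihevc_macros.h; the intermediates c0..c4 mirror the
+ * c[0]..c[4] comments in the intrinsics code. The helper name is ours and
+ * is not part of the decoder API.
+ */
+#if 0
+static void itrans_dst4_stage_sketch(const WORD16 *pi2_src,
+                                     WORD16 *pi2_dst,
+                                     WORD32 src_strd,
+                                     WORD32 i4_shift)
+{
+    WORD32 i4_add = 1 << (i4_shift - 1);
+    WORD32 i;
+
+    for(i = 0; i < 4; i++)
+    {
+        /* Shared butterfly intermediates */
+        WORD32 c0 = pi2_src[0 * src_strd] + pi2_src[2 * src_strd];
+        WORD32 c1 = pi2_src[2 * src_strd] + pi2_src[3 * src_strd];
+        WORD32 c2 = pi2_src[0 * src_strd] - pi2_src[3 * src_strd];
+        WORD32 c3 = 74 * pi2_src[1 * src_strd];
+        WORD32 c4 = pi2_src[0 * src_strd] - pi2_src[2 * src_strd]
+                        + pi2_src[3 * src_strd];
+
+        /* Round, shift and saturate each output of the stage */
+        pi2_dst[0] = CLIP_S16((29 * c0 + 55 * c1 + c3 + i4_add) >> i4_shift);
+        pi2_dst[1] = CLIP_S16((55 * c2 - 29 * c1 + c3 + i4_add) >> i4_shift);
+        pi2_dst[2] = CLIP_S16((74 * c4 + i4_add) >> i4_shift);
+        pi2_dst[3] = CLIP_S16((55 * c0 + 29 * c2 - c3 + i4_add) >> i4_shift);
+
+        pi2_src++;
+        pi2_dst += 4;
+    }
+}
+#endif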
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs an inverse transform of type 1 (DST) and
+ * reconstruction for a 4x4 input block
+ *
+ * @par Description:
+ *  Performs inverse transform type 1 on the 4x4 input coefficients, adds
+ * the prediction data and clips the output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_itrans_recon_4x4_ttype1_sse42(WORD16 *pi2_src,
+                                         WORD16 *pi2_tmp,
+                                         UWORD8 *pu1_pred,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 pred_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 zero_cols,
+                                         WORD32 zero_rows)
+{
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_4;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_20;
+    __m128i m_temp_reg_21;
+    __m128i m_temp_reg_22;
+    __m128i m_temp_reg_23;
+    __m128i m_temp_reg_24;
+    __m128i m_temp_reg_25;
+    __m128i m_temp_reg_30;
+    __m128i m_temp_reg_31;
+    __m128i m_temp_reg_32;
+    __m128i m_temp_reg_33;
+    __m128i m_temp_reg_34;
+    __m128i m_temp_reg_35;
+    __m128i m_temp_reg_36;
+    __m128i m_coeff1, m_coeff2, m_coeff3;
+    __m128i m_rdng_factor;
+    __m128i m_count;
+
+    WORD32 i4_shift = IT_SHIFT_STAGE_1;
+    UNUSED(zero_rows);
+    UNUSED(zero_cols);
+    UNUSED(pi2_tmp);
+
+    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[2][0]); //74
+
+    m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
+
+    m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
+    m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
+
+    m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
+    m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
+
+    /* c[4] in m_temp_reg_14 */
+    /* c[4] = src[0] - src[2] + src[3]; the partial sum src[0] - src[2] is
+       computed here and src[3] is added below */
+    {
+        m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2);
+    }
+
+    /* c[3] in m_temp_reg_13 */
+    {
+        m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);
+    }
+
+    /* c[0] in m_temp_reg_10 */
+    {
+        m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2);
+    }
+
+    /* c[1] in m_temp_reg_11 */
+    {
+        m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3);
+    }
+
+    /* c[2] in m_temp_reg_12 */
+    {
+        m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3);
+    }
+
+    /* c[4] in m_temp_reg_14 */
+    /* c[4] completed by adding src[3] */
+    {
+        m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3);
+    }
+
+    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[1][0]); //29
+    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[0][0]); //55
+
+    /* Stage 1 outputs stored in m_temp_reg_20-23 */
+    {
+        m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1); //29*c0
+        m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2); //55*c1
+
+        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+        m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1); //29*c1
+        m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2); //55*c2
+
+        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+        m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2); //55*c0
+        m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1); //29*c2
+        m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3); //74*c4
+
+        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+        m_count = _mm_cvtsi32_si128(i4_shift);
+
+        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+        m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13);
+        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
+
+        m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+        m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
+
+        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
+        m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13);
+        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
+
+        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
+
+        m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+        m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+        m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+        m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+
+        m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+        m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+        m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+        m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+        m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+        m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+
+    }
+
+    /* Stage 2 */
+    {
+        i4_shift = IT_SHIFT_STAGE_2;
+
+        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+        m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
+        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+        m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
+        m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
+        m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);
+
+        /* c[4] = src[0] - src[2] + src[3]; the partial sum src[0] - src[2]
+           is stored in m_temp_reg_4 */
+        {
+            m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+        }
+
+        /* c[3] stored in m_temp_reg_3 */
+        {
+            m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);
+        }
+
+        /* c[0] stored in m_temp_reg_0 */
+        {
+            m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+        }
+
+        /* c[1] stored in m_temp_reg_1 */
+        {
+            m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21);
+        }
+
+        /* c[2] stored in m_temp_reg_2 */
+        {
+            m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23);
+        }
+
+        /* c[4] completed by adding src[3] */
+        {
+            m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23);
+        }
+
+        /* Stage 2 output generation */
+        {
+            m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1); //29*c0
+            m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2); //55*c1
+
+            m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+            m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //29*c1
+            m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2); //55*c2
+
+            m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+            m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2); //55*c0
+            m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1); //29*c2
+            m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3); //74*c4
+
+            m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+            m_count = _mm_cvtsi32_si128(i4_shift);
+
+            m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3);
+            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
+
+            m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+            m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
+
+            m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3);
+            m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
+            m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
+
+            m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
+
+            m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+            m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+            m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+            m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+
+            m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+            m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+            m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+            m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+            m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+            m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+            m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+            m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+        }
+
+        /* Recon and store */
+        {
+            WORD32 *pi4_dst = (WORD32 *)pu1_dst;
+
+            m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+            m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
+            m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
+            m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
+            m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
+            m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
+
+            m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
+            m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
+
+            m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
+
+            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
+            m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
+            m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
+            m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
+            pu1_dst += dst_strd;
+            pi4_dst = (WORD32 *)(pu1_dst);
+
+            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+            pu1_dst += dst_strd;
+            pi4_dst = (WORD32 *)(pu1_dst);
+
+            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+            pu1_dst += dst_strd;
+            pi4_dst = (WORD32 *)(pu1_dst);
+
+            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+        }
+    }
+}
+
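+/* For reference, a minimal scalar sketch of the 4-point inverse DCT butterfly
+ * that the SSE4.2 routine below vectorises; illustrative only (not decoder
+ * code), with 'rnd' and 'shift' standing for the stage rounding constant and
+ * IT_SHIFT_STAGE_1/IT_SHIFT_STAGE_2:
+ *
+ *     e0 = 64 * src[0] + 64 * src[2];     o0 = 83 * src[1] + 36 * src[3];
+ *     e1 = 64 * src[0] - 64 * src[2];     o1 = 36 * src[1] - 83 * src[3];
+ *     dst[0] = (e0 + o0 + rnd) >> shift;  dst[3] = (e0 - o0 + rnd) >> shift;
+ *     dst[1] = (e1 + o1 + rnd) >> shift;  dst[2] = (e1 - o1 + rnd) >> shift;
+ */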
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform (DCT) and reconstruction
+ * for a 4x4 input block
+ *
+ * @par Description:
+ *  Performs inverse transform, adds the prediction data and clips the
+ * output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_4x4_sse42(WORD16 *pi2_src,
+                                  WORD16 *pi2_tmp,
+                                  UWORD8 *pu1_pred,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 src_strd,
+                                  WORD32 pred_strd,
+                                  WORD32 dst_strd,
+                                  WORD32 zero_cols,
+                                  WORD32 zero_rows)
+{
+
+
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_15;
+    __m128i m_temp_reg_20;
+    __m128i m_temp_reg_21;
+    __m128i m_temp_reg_22;
+    __m128i m_temp_reg_23;
+    __m128i m_temp_reg_24;
+    __m128i m_temp_reg_25;
+    __m128i m_temp_reg_30;
+    __m128i m_temp_reg_31;
+    __m128i m_temp_reg_33;
+    __m128i m_temp_reg_34;
+    __m128i m_coeff1, m_coeff3;
+    __m128i m_rdng_factor;
+    __m128i m_count;
+
+
+    WORD32 i4_shift = IT_SHIFT_STAGE_1;
+    UNUSED(zero_rows);
+    UNUSED(zero_cols);
+    UNUSED(pi2_tmp);
+
+
+    m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
+
+    m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
+    m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
+
+    m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
+    m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
+
+
+    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[0][0]); //36
+    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[2][0]); //83
+
+    /* e */
+    {
+        m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
+        m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6);
+    }
+
+    /* o */
+    {
+        m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //src[1]*36
+        m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3); //src[3]*83
+        m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3); //src[1]*83
+        m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1); //src[3]*36
+    }
+
+    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+    /* e1 stored in m_temp_reg_31 */
+    {
+        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
+    }
+
+    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+    /* e0 stored in m_temp_reg_30 */
+    {
+        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+    }
+
+    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+    m_count = _mm_cvtsi32_si128(i4_shift);
+
+    /* o1 stored in m_temp_reg_33 */
+    {
+        m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
+    }
+
+    /* e1 + add */
+    {
+        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+    }
+
+    /* e0 + add */
+    {
+        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+    }
+
+    /* o0 stored in m_temp_reg_34 */
+    {
+        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
+    }
+
+    /* Stage 1 outputs */
+    {
+        m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
+        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
+
+        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
+        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
+
+
+        m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+        m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+        m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+        m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+
+        m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+        m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+        m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+        m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+        m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+        m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+    }
+
+    /* Stage 2 */
+    {
+        i4_shift = IT_SHIFT_STAGE_2;
+
+        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+        m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
+        m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
+
+        m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
+        m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);
+
+        /* e */
+        {
+            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6);
+        }
+
+        /* o */
+        {
+            m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1); //src[1]*36
+            m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3); //src[1]*83
+            m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3); //src[3]*83
+            m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1); //src[3]*36
+        }
+
+        /* e */
+        {
+            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6);
+        }
+
+        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+        /* e1 stored in m_temp_reg_31 */
+        {
+            m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
+        }
+
+        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+        /* e0 stored in m_temp_reg_30 */
+        {
+            m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+        }
+
+        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+        m_count = _mm_cvtsi32_si128(i4_shift);
+
+        /* o1 stored in m_temp_reg_33 */
+        {
+            m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
+        }
+
+        /* e1 + add */
+        {
+            m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+        }
+
+        /* e0 + add */
+        {
+            m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+        }
+
+        /* o0 stored in m_temp_reg_34 */
+        {
+            m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
+        }
+
+        /* Stage 2 outputs */
+        {
+            m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
+            m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
+            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
+            m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
+
+            m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+            m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+            m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+            m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+
+            m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+            m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+            m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+            m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+            m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+            m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+            m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+            m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+        }
+
+        /* Recon and store */
+        {
+            UWORD32 *pu4_dst = (UWORD32 *)pu1_dst;
+
+            m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+            m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
+            m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
+            m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
+            m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
+            m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
+
+            m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
+            m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
+
+            m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
+
+            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
+            m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
+            m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
+            m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
+            pu1_dst += dst_strd;
+            pu4_dst = (UWORD32 *)(pu1_dst);
+
+            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+            pu1_dst += dst_strd;
+            pu4_dst = (UWORD32 *)(pu1_dst);
+
+            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+            pu1_dst += dst_strd;
+            pu4_dst = (UWORD32 *)(pu1_dst);
+
+            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+        }
+    }
+}
+
+
+
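+/* For reference, the 8-point inverse DCT below uses the usual even/odd
+ * (partial butterfly) decomposition; a scalar sketch of one column,
+ * illustrative only (not decoder code):
+ *
+ *     ee0 = 64*src[0] + 64*src[4];      ee1 = 64*src[0] - 64*src[4];
+ *     eo0 = 83*src[2] + 36*src[6];      eo1 = 36*src[2] - 83*src[6];
+ *     e0 = ee0 + eo0;   e3 = ee0 - eo0;
+ *     e1 = ee1 + eo1;   e2 = ee1 - eo1;
+ *     o0 = 89*src[1] + 75*src[3] + 50*src[5] + 18*src[7];   (o1..o3 likewise)
+ *     dst[k]     = (e_k + o_k + rnd) >> shift;               (k = 0..3)
+ *     dst[7 - k] = (e_k - o_k + rnd) >> shift;
+ */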
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform and reconstruction for an
+ * 8x8 input block
+ *
+ * @par Description:
+ *  Performs inverse transform, adds the prediction data and clips the
+ * output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 8x8 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 8x8 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ *  Output 8x8 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_itrans_recon_8x8_sse42(WORD16 *pi2_src,
+                                  WORD16 *pi2_tmp,
+                                  UWORD8 *pu1_pred,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 src_strd,
+                                  WORD32 pred_strd,
+                                  WORD32 dst_strd,
+                                  WORD32 zero_cols,
+                                  WORD32 zero_rows)
+{
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_5;
+    __m128i m_temp_reg_6;
+    __m128i m_temp_reg_7;
+    __m128i m_temp_reg_4;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_15;
+    __m128i m_temp_reg_16;
+    __m128i m_temp_reg_17;
+    __m128i m_temp_reg_20;
+    __m128i m_temp_reg_21;
+    __m128i m_temp_reg_22;
+    __m128i m_temp_reg_23;
+    __m128i m_temp_reg_24;
+    __m128i m_temp_reg_25;
+    __m128i m_temp_reg_26;
+    __m128i m_temp_reg_27;
+    __m128i m_temp_reg_30;
+    __m128i m_temp_reg_31;
+    __m128i m_temp_reg_32;
+    __m128i m_temp_reg_33;
+    __m128i m_temp_reg_34;
+    __m128i m_temp_reg_35;
+    __m128i m_temp_reg_36;
+    __m128i m_temp_reg_37;
+    __m128i m_temp_reg_40;
+    __m128i m_temp_reg_41;
+    __m128i m_temp_reg_42;
+    __m128i m_temp_reg_43;
+    __m128i m_temp_reg_44;
+    __m128i m_temp_reg_45;
+    __m128i m_temp_reg_46;
+    __m128i m_temp_reg_47;
+    __m128i m_temp_reg_50;
+    __m128i m_temp_reg_51;
+    __m128i m_temp_reg_52;
+    __m128i m_temp_reg_53;
+    __m128i m_temp_reg_54;
+    __m128i m_temp_reg_55;
+    __m128i m_temp_reg_56;
+    __m128i m_temp_reg_57;
+    __m128i m_temp_reg_60;
+    __m128i m_temp_reg_61;
+    __m128i m_temp_reg_62;
+    __m128i m_temp_reg_63;
+    __m128i m_temp_reg_64;
+    __m128i m_temp_reg_65;
+    __m128i m_temp_reg_66;
+    __m128i m_temp_reg_67;
+    __m128i m_temp_reg_70;
+    __m128i m_temp_reg_71;
+    __m128i m_temp_reg_72;
+    __m128i m_temp_reg_73;
+    __m128i m_temp_reg_74;
+    __m128i m_temp_reg_75;
+    __m128i m_temp_reg_76;
+    __m128i m_temp_reg_77;
+    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+
+    WORD32 check_row_stage_1;   /* Set if any of rows 4-7 of the input is non-zero */
+    WORD32 check_row_stage_2;   /* Set if any of columns 4-7 of the input is non-zero */
+
+    __m128i m_rdng_factor;
+    WORD32 i4_shift = IT_SHIFT_STAGE_1;
+    UNUSED(pi2_tmp);
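+    /* zero_rows / zero_cols are bitmasks, a bit set meaning the corresponding
+     * row / column of pi2_src is entirely zero. When bits 4-7 are all set,
+     * the second half of the corresponding stage contributes nothing and can
+     * be skipped; e.g. if only the top-left 4x4 coefficients are non-zero,
+     * both checks below evaluate to 0 and the fastest path is taken. */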
+    check_row_stage_1   = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
+    check_row_stage_2   = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
+
+    m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+
+    m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src);
+
+    if(!check_row_stage_2)
+    {
+        if(!check_row_stage_1)
+        {
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                /* Interleave rows 0 and 4 */
+                /* coef2 feeds m_temp_reg_12 and m_temp_reg_13, coef1 feeds m_temp_reg_10 and m_temp_reg_11 */
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+            }
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            /* As the upper 8 bytes are zero, m_temp_reg_15 and m_temp_reg_17 are not used */
+            {
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+                /* Instructions combined so that they can be elided based on zero_rows */
+                /* Interleave rows 2 and 6 */
+                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+
+                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+
+                /* e */
+
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+            }
+
+            /* o */
+            {
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+
+                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    //o0:1B*89+3B*75,5B*50+7B*18
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+
+
+                    /* Column 0 of destination computed here */
+                    /* It is stored in m_temp_reg_50 */
+                    /* Column 7 of destination computed here */
+                    /* It is stored in m_temp_reg_57 */
+                    /* Upper 8 bytes of both registers are zero due to zero_cols */
+
+
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_setzero_si128();
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o1:1B*75-3B*18,5B*89+7B*50
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+                    /* Loading coeff for computing o2  in the next block */
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+
+                    /* Column 1 of destination computed here */
+                    /* It is stored in m_temp_reg_51 */
+                    /* Column 6 of destination computed here */
+                    /* It is stored in m_temp_reg_56 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o2:1B*50-3B*89,5B*18+7B*75
+                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+
+
+                    /* Column 2 of destination computed here */
+                    /* It is stored in m_temp_reg_52 */
+                    /* Column 5 of destination computed here */
+                    /* It is stored in m_temp_reg_55 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o3:1B*18-3B*50,5B*75-7B*89
+                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+
+                    /* Column 3 of destination computed here */
+                    /* It is stored in m_temp_reg_53 */
+                    /* Column 4 of destination computed here */
+                    /* It is stored in m_temp_reg_54 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+                }
+            }
+
+            /* Transpose of the destination 8x8 matrix done here */
+            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+            /* respectively */
+            {
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+
+                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                m_temp_reg_54 = _mm_setzero_si128();
+                m_temp_reg_55 = _mm_setzero_si128();
+                m_temp_reg_56 = _mm_setzero_si128();
+                m_temp_reg_57 = _mm_setzero_si128();
+            }
+        }
+        else
+        {
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                /* Interleave rows 0 and 4 */
+                /* coef2 feeds m_temp_reg_12 and m_temp_reg_13, coef1 feeds m_temp_reg_10 and m_temp_reg_11 */
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+            }
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            /* As the upper 8 bytes are zero, m_temp_reg_15 and m_temp_reg_17 are not used */
+            {
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+                /* Instructions combined so that they can be elided based on zero_rows */
+                /* Interleave rows 2 and 6 */
+                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+
+                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+
+                /* e */
+
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+            }
+
+            /* o */
+            {
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+
+                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                    //o0:1B*89+3B*75,5B*50+7B*18
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+
+
+
+                    /* Column 0 of destination computed here */
+                    /* It is stored in m_temp_reg_50 */
+                    /* Column 7 of destination computed here */
+                    /* It is stored in m_temp_reg_57 */
+                    /* Upper 8 bytes of both registers are zero due to zero_cols */
+
+
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_setzero_si128();
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o1:1B*75-3B*18,5B*89+7B*50
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+                    /* Loading coeff for computing o2  in the next block */
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+
+
+
+                    /* Column 1 of destination computed here */
+                    /* It is stored in m_temp_reg_51 */
+                    /* Column 6 of destination computed here */
+                    /* It is stored in m_temp_reg_56 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o2:1B*50-3B*89,5B*18+7B*75
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+
+
+                    /* Column 2 of destination computed here */
+                    /* It is stored in m_temp_reg_52 */
+                    /* Column 5 of destination computed here */
+                    /* It is stored in m_temp_reg_55 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o3:1B*18-3B*50,5B*75-7B*89
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+
+
+                    /* Column 3 of destination computed here */
+                    /* It is stored in m_temp_reg_53 */
+                    /* Column 4 of destination computed here */
+                    /* It is stored in m_temp_reg_54 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+                }
+            }
+
+            /* Transpose of the destination 8x8 matrix done here */
+            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+            /* respectively */
+            {
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+
+                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                m_temp_reg_54 = _mm_setzero_si128();
+                m_temp_reg_55 = _mm_setzero_si128();
+                m_temp_reg_56 = _mm_setzero_si128();
+                m_temp_reg_57 = _mm_setzero_si128();
+            }
+        }
+
+        /* Stage 2 */
+        i4_shift = IT_SHIFT_STAGE_2;
+        {
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
+            }
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            {
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
+
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+                /* Loading coeff for computing o0 in the next block */
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
+
+
+
+                /* e */
+
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+            }
+
+            /* o */
+            {
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+                    //o0:1B*89+3B*75,1T*89+3T*75
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+                    /* Loading coeff for computing o1 in the next block */
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+
+
+
+                    /* Column 0 of destination computed here */
+                    /* It is stored in m_temp_reg_50 */
+                    /* Column 7 of destination computed here */
+                    /* It is stored in m_temp_reg_57 */
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    //o1:1B*75-3B*18,1T*75-3T*18
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+                    /* Loading coeff for computing o2  in the next block */
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+
+
+
+                    /* Column 1 of destination computed here */
+                    /* It is stored in m_temp_reg_51 */
+                    /* Column 6 of destination computed here */
+                    /* It is stored in m_temp_reg_56 */
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    //o2:1B*50-3B*89,1T*50-3T*89
+                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+
+
+                    /* Column 2 of destination computed here */
+                    /* It is stored in m_temp_reg_52 */
+                    /* Column 5 of destination computed here */
+                    /* It is stored in m_temp_reg_55 */
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    //o3:1B*18-3B*50,1T*18-3T*50
+                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+                    /* Column 3 of destination computed here */
+                    /* It is stored in m_temp_reg_53 */
+                    /* Column 4 of destination computed here */
+                    /* It is stored in m_temp_reg_54 */
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
+                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
+                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
+
+                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
+                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
+                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
+                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+                }
+            }
+
+            /* Transpose of the destination 8x8 matrix done here */
+            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+            /* respectively */
+            {
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+            }
+
+            /* Recon and store */
+            {
+                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+                m_temp_reg_50 = _mm_setzero_si128();
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
+                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
+                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
+                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
+                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
+                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
+                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
+                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
+
+                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
+                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
+                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
+                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
+                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
+                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
+                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
+                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
+
+                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
+                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
+                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
+                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
+                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
+                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
+                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
+                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
+
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
+                pu1_dst += dst_strd;
+            }
+        }
+    }
+    else
+    {
+
+        /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+        /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+#if 1
+        if(!check_row_stage_1)
+        {
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                /* Interleave rows 0 and 4 */
+                /* coef2 feeds m_temp_reg_12 and m_temp_reg_13, coef1 feeds m_temp_reg_10 and m_temp_reg_11 */
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+
+                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+            }
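+            /* _mm_madd_epi16 multiplies eight 16-bit pairs and sums adjacent
+             * products into four 32-bit lanes; with rows 0 and 4 interleaved
+             * above, a single madd against the even-part coefficient pairs
+             * (the usual HEVC {64, 64} and {64, -64}) produces ee0/ee1 for
+             * four columns at once. */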
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            {
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+                /* Instructions are grouped here so the zero_rows cases can
+                 * eliminate them wholesale */
+                /* Interleave rows 2 and 6 into m_temp_reg_4 and m_temp_reg_5 */
+                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
+                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+
+
+                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+                //m_coeff4 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+                //m_coeff2 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+            }
+
+            /* e */
+            {
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+            }
+
+            /* o */
+            {
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+
+                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+                    //o0:1B*89+3B*75,1T*89+3T*75
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+                }
+
+                /* Column 0 of destination computed here */
+                /* It is stored in m_temp_reg_50 */
+                /* Column 7 of destination computed here */
+                /* It is stored in m_temp_reg_57 */
+                {
+
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+                    /* Loading coeff for computing o2  in the next block */
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+
+                }
+
+                /* Column 1 of destination computed here */
+                /* It is stored in m_temp_reg_51 */
+                /* Column 6 of destination computed here */
+                /* It is stored in m_temp_reg_56 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o2:1B*50-3B*89,1T*50-3T*89
+                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+
+                }
+
+                /* Column 2 of destination computed here */
+                /* It is stored in m_temp_reg_52 */
+                /* Column 5 of destination computed here */
+                /* It is stored in m_temp_reg_55 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o3:1B*18-3B*50,1T*18-3T*50
+                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+                }
+
+                /* Column 3 of destination computed here */
+                /* It is stored in m_temp_reg_53 */
+                /* Column 4 of destination computed here */
+                /* It is stored in m_temp_reg_54 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+                }
+            }
+
+            /* Transpose of the destination 8x8 matrix done here */
+            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+            /* respectively */
+            {
+
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+            }
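+            /* The transpose above is the standard three-level unpack ladder:
+             * unpacklo/hi_epi16 interleaves row pairs, unpacklo/hi_epi32 then
+             * interleaves 2x2 blocks, and unpacklo/hi_epi64 finally swaps the
+             * 4x4 quadrants, turning eight row registers into eight column
+             * registers in 24 shuffles. */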
+        }
+        else
+        {
+
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                /* Interleave rows 0 and 4 into m_temp_reg_0 and m_temp_reg_1 */
+                /* coef2 feeds m_temp_reg_12/m_temp_reg_13, coef1 feeds m_temp_reg_10/m_temp_reg_11 */
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+
+                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+            }
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            {
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+                /* Instructions are grouped here so the zero_rows cases can
+                 * eliminate them wholesale */
+                /* Interleave rows 2 and 6 into m_temp_reg_4 and m_temp_reg_5 */
+                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
+                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+
+
+                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+            }
+
+            /* e */
+            {
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+            }
+
+            /* o */
+            {
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+
+                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                    m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
+
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+                }
+
+                /* Column 0 of destination computed here */
+                /* It is stored in m_temp_reg_50 */
+                /* Column 7 of destination computed here */
+                /* It is stored in m_temp_reg_57 */
+                {
+
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+                    /* Loading coeff for computing o2  in the next block */
+
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
+                }
+
+                /* Column 1 of destination computed here */
+                /* It is stored in m_temp_reg_51 */
+                /* Column 6 of destination computed here */
+                /* It is stored in m_temp_reg_56 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+                }
+
+                /* Column 2 of destination computed here */
+                /* It is stored in m_temp_reg_52 */
+                /* Column 5 of destination computed here */
+                /* It is stored in m_temp_reg_55 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
+                }
+
+                /* Column 3 of destination computed here */
+                /* It is stored in m_temp_reg_53 */
+                /* Column 4 of destination computed here */
+                /* It is stored in m_temp_reg_54 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+                }
+            }
+
+            /* Transpose of the destination 8x8 matrix done here */
+            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+            /* respectively */
+            {
+
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+            }
+        }
+        /* Stage 2 */
+
+        i4_shift = IT_SHIFT_STAGE_2;
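+        /* In HEVC the first inverse-transform stage shifts by 7 and the
+         * second by 12 for 8-bit content (20 - bit depth); IT_SHIFT_STAGE_1/2
+         * presumably carry those values. */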
+
+        {
+
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
+            }
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            {
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
+
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+                /* Loading coeff for computing o0 in the next block */
+                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
+            }
+
+            /* e */
+            {
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+            }
+
+            /* o */
+            {
+                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
+                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+                    /* Loading coeff for computing o1 in the next block */
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+                }
+
+                /* Column 0 of destination computed here */
+                /* It is stored in m_temp_reg_50 */
+                /* Column 7 of destination computed here */
+                /* It is stored in m_temp_reg_57 */
+                {
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+                    /* Loading coeff for computing o2  in the next block */
+                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
+                }
+
+                /* Column 1 of destination computed here */
+                /* It is stored in m_temp_reg_51 */
+                /* Column 6 of destination computed here */
+                /* It is stored in m_temp_reg_56 */
+                {
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+                }
+
+                /* Column 2 of destination computed here */
+                /* It is stored in m_temp_reg_52 */
+                /* Column 5 of destination computed here */
+                /* It is stored in m_temp_reg_55 */
+                {
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
+                }
+
+                /* Column 3 of destination computed here */
+                /* It is stored in m_temp_reg_53 */
+                /* Column 4 of destination computed here */
+                /* It is stored in m_temp_reg_54 */
+                {
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
+                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
+                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
+
+                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
+                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
+                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
+                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+                }
+            }
+
+            /* Transpose of the destination 8x8 matrix done here */
+            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+            /* respectively */
+            {
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+            }
+
+            /* Recon and store */
+            {
+                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+
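+                /* Widen the 8-bit prediction to 16 bits by unpacking against
+                 * zero, add the 16-bit residual rows, then let
+                 * _mm_packus_epi16 saturate the sums back to [0, 255] -- the
+                 * pack doubles as the recon clip. */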
+                m_temp_reg_50 = _mm_setzero_si128();
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
+                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
+                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
+                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
+                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
+                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
+                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
+                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
+
+                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
+                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
+                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
+                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
+                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
+                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
+                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
+                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
+
+                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
+                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
+                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
+                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
+                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
+                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
+                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
+                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
+
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
+                pu1_dst += dst_strd;
+
+            }
+
+
+        }
+
+
+    }
+}
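+
+/* A minimal scalar sketch (disabled) of the per-element butterfly the SIMD
+ * paths above implement: dst = sat16(((e +/- o) + (1 << (shift - 1))) >> shift).
+ * itrans_butterfly_ref is an illustrative name only, not part of the decoder. */
+#if 0
+static WORD16 itrans_butterfly_ref(WORD32 e, WORD32 o, WORD32 is_sum, WORD32 shift)
+{
+    WORD32 val = is_sum ? (e + o) : (e - o);   /* e + o feeds columns 0..3, e - o columns 7..4 */
+    val = (val + (1 << (shift - 1))) >> shift; /* the m_rdng_factor add and arithmetic shift  */
+    if(val >  32767) val =  32767;             /* what _mm_packs_epi32 does per 32-bit lane   */
+    if(val < -32768) val = -32768;
+    return (WORD16)val;
+}
+#endif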
diff --git a/common/x86/ihevc_itrans_recon_ssse3_intr.c b/common/x86/ihevc_itrans_recon_ssse3_intr.c
new file mode 100644
index 0000000..960ecdf
--- /dev/null
+++ b/common/x86/ihevc_itrans_recon_ssse3_intr.c
@@ -0,0 +1,2744 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_itrans_recon_ssse3_intr.c
+ *
+ * @brief
+ *  Contains function definitions for inverse quantization, inverse
+ * transform and reconstruction
+ *
+ * @author
+ *  100470
+ *  100592 (editor)
+ *
+ * @par List of Functions:
+ *  - ihevc_itrans_recon_4x4_ttype1_ssse3()
+ *  - ihevc_itrans_recon_4x4_ssse3()
+ *  - ihevc_itrans_recon_8x8_ssse3()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_trans_macros.h"
+
+
+#include <emmintrin.h> /* SSE2 */
+#include <tmmintrin.h> /* SSSE3 */
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs the inverse transform type 1 (DST) and
+ *  reconstruction for a 4x4 input block
+ *
+ * @par Description:
+ *  Performs the inverse transform type 1, adds the prediction data and
+ *  clips the output to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
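+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *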
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_4x4_ttype1_ssse3(WORD16 *pi2_src,
+                                         WORD16 *pi2_tmp,
+                                         UWORD8 *pu1_pred,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 pred_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 zero_cols,
+                                         WORD32 zero_rows)
+{
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_4;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_20;
+    __m128i m_temp_reg_21;
+    __m128i m_temp_reg_22;
+    __m128i m_temp_reg_23;
+    __m128i m_temp_reg_24;
+    __m128i m_temp_reg_25;
+    __m128i m_temp_reg_30;
+    __m128i m_temp_reg_31;
+    __m128i m_temp_reg_32;
+    __m128i m_temp_reg_33;
+    __m128i m_temp_reg_34;
+    __m128i m_temp_reg_35;
+    __m128i m_temp_reg_36;
+    __m128i m_rdng_factor;
+    __m128i m_count;
+
+    __m128i m_ge_zero16b_flag_row0;
+    __m128i m_ge_zero16b_flag_row1;
+    __m128i m_ge_zero16b_flag_row2;
+    __m128i m_ge_zero16b_flag_row3;
+
+    __m128i m_zero = _mm_setzero_si128();
+
+    WORD32 i4_shift = IT_SHIFT_STAGE_1;
+    UNUSED(zero_cols);
+    UNUSED(zero_rows);
+    UNUSED(pi2_tmp);
+    m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
+
+    m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0);
+    m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1);
+    m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2);
+    m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3);
+
+    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0);
+    m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1);
+    m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2);
+    m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3);
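+    /* The cmpgt/unpacklo pairs above sign-extend the 16-bit coefficients to
+     * 32 bits: cmpgt(0, x) yields 0xFFFF exactly where x is negative, which
+     * the unpack then places in the high half of each 32-bit lane. SSSE3 has
+     * no _mm_cvtepi16_epi32 (that is SSE4.1), hence the emulation; the
+     * commented-out conversions below show the SSE4.1 form. */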
+
+    /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
+    m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
+
+    m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
+    m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/
+
+    /* c[4] in m_temp_reg_14 */
+    /* c[4] = src[0] - src[2] + src[3]; the src[3] term is added later */
+    {
+        m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2);
+    }
+
+    /* c[3] in m_temp_reg_13 */
+    {
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 3);
+        m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1);
+        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+        m_temp_reg_13 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+        //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);
+    }
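+    /* 74*x is synthesized as (x<<6) + (x<<3) + (x<<1) = (64 + 8 + 2)*x;
+     * _mm_mullo_epi32 (see the commented-out line) is SSE4.1 and therefore
+     * unavailable here. */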
+
+    /* c[0] in m_temp_reg_10 */
+    {
+        m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2);
+    }
+
+    /* c[1] in m_temp_reg_11 */
+    {
+        m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3);
+    }
+
+    /* c[2] in m_temp_reg_12 */
+    {
+        m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3);
+    }
+
+    /* c[4] completed by adding src[3] (started above as src[0] - src[2]) */
+    {
+        m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3);
+    }
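+    /* This is the standard HEVC inverse-DST factorization:
+     *   c0 = s0 + s2, c1 = s2 + s3, c2 = s0 - s3, c3 = 74*s1,
+     *   c4 = s0 - s2 + s3,
+     * from which the outputs below are d0 = 29*c0 + 55*c1 + c3,
+     * d1 = 55*c2 - 29*c1 + c3, d2 = 74*c4 and d3 = 55*c0 + 29*c2 - c3. */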
+
+    /* Stage 1 outputs stored in m_temp_reg_20-23 */
+    {
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 5);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 1);
+        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10);
+        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+        //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1);//29*c0
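+        /* Constant multiplies are again shift/add decompositions:
+         * 29*x = (x<<5) - x - (x<<1) = (32 - 1 - 2)*x and
+         * 55*x = (x<<6) - x - (x<<3) = (64 - 1 - 8)*x. */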
+
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 6);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 3);
+        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11);
+        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+        //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2);//55*c1
+
+        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 5);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 1);
+        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11);
+        m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+        //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1);//29*c1
+
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 6);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 3);
+        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12);
+        m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+        //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2);//55*c2
+
+        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 6);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 3);
+        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10);
+        m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+        //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2);//55*c0
+
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 5);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 1);
+        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12);
+        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+        //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1);//29*c2
+
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_14, 6);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_14, 3);
+        m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_14, 1);
+        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+        m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+        //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3);//74*c4
+
+        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+        m_count = _mm_cvtsi32_si128(i4_shift);
+
+        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+        m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13);
+        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
+
+        m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+        m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
+
+        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
+        m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13);
+        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
+
+        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
+
+        m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+        m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+        m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+        m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+
+        m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+        m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+        m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+        m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+        m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+        m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+
+    }
+
+    /* Stage 2 */
+    {
+        i4_shift = IT_SHIFT_STAGE_2;
+
+        /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+        m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
+        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+        m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
+        m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
+        m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/
+
+        m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30);
+        m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31);
+
+        m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
+        m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
+        m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
+        m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
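+
+        /* NOTE (illustrative): the cmpgt/unpack pair above is an SSSE3-safe
+         * sign extension.  _mm_cvtepi16_epi32 (see the commented block
+         * above) is an SSE4.1 instruction, so each 16-bit lane is widened
+         * to 32 bits by interleaving it with its own sign mask:
+         *
+         *     __m128i sign = _mm_cmpgt_epi16(_mm_setzero_si128(), v); // 0xFFFF where v < 0
+         *     __m128i lo32 = _mm_unpacklo_epi16(v, sign); // lanes 0-3 sign extended
+         *     __m128i hi32 = _mm_unpackhi_epi16(v, sign); // lanes 4-7 sign extended
+         */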
+
+
+        /* c[4] stored in m_temp_reg_4 */
+        {
+            m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+        }
+
+        /* c[3] stored in m_temp_reg_3 */
+        {
+            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_22, 6);
+            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_22, 3);
+            m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_22, 1);
+            m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+            m_temp_reg_3 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13);
+            //m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);
+        }
+
+        /* c[0] stored in m_temp_reg_0 */
+        {
+            m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+        }
+
+        /* c[1] stored in m_temp_reg_1 */
+        {
+            m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21);
+        }
+
+        /* c[2] stored in m_temp_reg_2 */
+        {
+            m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23);
+        }
+
+        /* c[4] completed in m_temp_reg_4 */
+        {
+            m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23);
+        }
+
+        /* Stage 2 output generation */
+        {
+            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 5);
+            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 1);
+            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0);
+            m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+            //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1);//29*c0
+
+            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 6);
+            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 3);
+            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1);
+            m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+            //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2);//55*c1
+
+            m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 5);
+            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 1);
+            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1);
+            m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+            //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//29*c1
+
+            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 6);
+            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 3);
+            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2);
+            m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+            //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2);//55*c2
+
+            m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
+            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 3);
+            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0);
+            m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+            //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2);//55*c0
+
+            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 5);
+            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 1);
+            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2);
+            m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+            //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1);//29*c2
+
+            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_4, 6);
+            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_4, 3);
+            m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_4, 1);
+            m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+            m_temp_reg_36 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13);
+            //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3);//74*c4
+
+            m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+            m_count = _mm_cvtsi32_si128(i4_shift);
+
+            m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3);
+            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
+
+            m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+            m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
+
+            m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3);
+            m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
+            m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
+
+            m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
+
+            m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+            m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+            m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+            m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+
+            m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+            m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+            m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+            m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+            m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+            m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+            m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+            m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+        }
+
+        /* Recon and store */
+        {
+            WORD32 *pi4_dst = (WORD32 *)pu1_dst;
+
+            m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero);
+            m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero);
+            m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero);
+            m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero);
+
+            /*m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
+            m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
+            m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);*/
+
+            m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
+            m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
+
+            m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
+            m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
+
+            m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
+
+            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
+            m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
+            m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
+            m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
+            pu1_dst += dst_strd;
+            pi4_dst = (WORD32 *)(pu1_dst);
+
+            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+            pu1_dst += dst_strd;
+            pi4_dst = (WORD32 *)(pu1_dst);
+
+            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+            pu1_dst += dst_strd;
+            pi4_dst = (WORD32 *)(pu1_dst);
+
+            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+        }
+    }
+}
+
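+/* Illustrative sketch (hypothetical helpers, not referenced by the decoder):
+ * _mm_mullo_epi32 is an SSE4.1 instruction, which is why the mullo calls in
+ * these functions are left commented out.  On SSSE3 the constant
+ * multiplications of the inverse transforms (by 29, 55, 74 for the 4x4 DST
+ * and 36, 83 for the 4x4 DCT) are instead decomposed into shifts and adds.
+ * Scalar equivalents of the decompositions used above, assuming no
+ * intermediate overflow:
+ */
+static inline WORD32 isketch_mul29(WORD32 x)
+{
+    return ((x << 5) - x) - (x << 1);        /* 32x - x - 2x  = 29x */
+}
+static inline WORD32 isketch_mul36(WORD32 x)
+{
+    return (x << 5) + (x << 2);              /* 32x + 4x      = 36x */
+}
+static inline WORD32 isketch_mul55(WORD32 x)
+{
+    return ((x << 6) - x) - (x << 3);        /* 64x - x - 8x  = 55x */
+}
+static inline WORD32 isketch_mul74(WORD32 x)
+{
+    return ((x << 6) + (x << 3)) + (x << 1); /* 64x + 8x + 2x = 74x */
+}
+static inline WORD32 isketch_mul83(WORD32 x)
+{
+    return ((x << 6) + x) + ((x << 4) + (x << 1)); /* 64x + x + 16x + 2x = 83x */
+}
+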
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform (DCT) and reconstruction
+ * for a 4x4 input block
+ *
+ * @par Description:
+ *  Performs inverse transform, adds the prediction data and
+ * clips the output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_4x4_ssse3(WORD16 *pi2_src,
+                                  WORD16 *pi2_tmp,
+                                  UWORD8 *pu1_pred,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 src_strd,
+                                  WORD32 pred_strd,
+                                  WORD32 dst_strd,
+                                  WORD32 zero_cols,
+                                  WORD32 zero_rows)
+{
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_4;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_15;
+    __m128i m_temp_reg_20;
+    __m128i m_temp_reg_21;
+    __m128i m_temp_reg_22;
+    __m128i m_temp_reg_23;
+    __m128i m_temp_reg_24;
+    __m128i m_temp_reg_25;
+    __m128i m_temp_reg_30;
+    __m128i m_temp_reg_31;
+    __m128i m_temp_reg_33;
+    __m128i m_temp_reg_34;
+    __m128i m_rdng_factor;
+    __m128i m_count;
+
+    __m128i m_ge_zero16b_flag_row0;
+    __m128i m_ge_zero16b_flag_row1;
+    __m128i m_ge_zero16b_flag_row2;
+    __m128i m_ge_zero16b_flag_row3;
+
+    __m128i m_zero = _mm_setzero_si128();
+
+    WORD32 i4_shift = IT_SHIFT_STAGE_1;
+    UNUSED(zero_rows);
+    UNUSED(zero_cols);
+    UNUSED(pi2_tmp);
+
+    m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
+
+    m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0);
+    m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1);
+    m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2);
+    m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3);
+
+    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0);
+    m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1);
+    m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2);
+    m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3);
+
+    /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
+    m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
+
+    m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
+    m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/
+
+    /* e */
+    {
+        m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
+        m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6);
+    }
+
+    /* o */
+    {
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 5);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 2);
+        m_temp_reg_12 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+        //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36
+
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 6);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 4);
+        m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_3, 1);
+        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_3);
+        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22);
+        m_temp_reg_13 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24);
+        //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83
+
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 4);
+        m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1);
+        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_1);
+        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22);
+        m_temp_reg_14 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24);
+        //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83
+
+        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 5);
+        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 2);
+        m_temp_reg_15 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+        //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36
+    }
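+
+    /* NOTE (scalar sketch of the 4-point inverse DCT butterfly implemented
+     * above; names are illustrative):
+     *
+     *     e0 = 64*s0 + 64*s2;              o0 = 83*s1 + 36*s3;
+     *     e1 = 64*s0 - 64*s2;              o1 = 36*s1 - 83*s3;
+     *     d0 = (e0 + o0 + rnd) >> shift;   d3 = (e0 - o0 + rnd) >> shift;
+     *     d1 = (e1 + o1 + rnd) >> shift;   d2 = (e1 - o1 + rnd) >> shift;
+     */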
+
+    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+    /* e1 stored in m_temp_reg_31 */
+    {
+        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
+    }
+
+    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+    /* e0 stored in m_temp_reg_30 */
+    {
+        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+    }
+
+    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+    m_count = _mm_cvtsi32_si128(i4_shift);
+
+    /* o1 stored in m_temp_reg_33 */
+    {
+        m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
+    }
+
+    /* e1 + add */
+    {
+        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+    }
+
+    /* e0 + add */
+    {
+        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+    }
+
+    /* o0 stored in m_temp_reg_34 */
+    {
+        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
+    }
+
+    /* Stage 1 outputs */
+    {
+        m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
+        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
+
+        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
+        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
+
+
+        m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+        m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+        m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+        m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+
+        m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+        m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+        m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+        m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+        m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+        m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+    }
+
+    /* Stage 2 */
+    {
+        i4_shift = IT_SHIFT_STAGE_2;
+
+        /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);*/
+
+        m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30);
+        m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31);
+
+        m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
+        m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
+        m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
+        m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
+
+        /*m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
+        m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
+
+        m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
+        m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/
+
+        /* e */
+        {
+            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6);
+        }
+
+        /* o */
+        /*{
+            m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1);//src[1]*36
+            m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);//src[1]*83
+            m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3);//src[3]*83
+            m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1);//src[3]*36
+        }*/
+        {
+            m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 5);
+            m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 2);
+            m_temp_reg_12 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1);
+            //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36
+
+            m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 6);
+            m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 4);
+            m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_23, 1);
+            m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_23);
+            m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2);
+            m_temp_reg_13 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4);
+            //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83
+
+            m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 6);
+            m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 4);
+            m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_22, 1);
+            m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_22);
+            m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2);
+            m_temp_reg_14 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4);
+            //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83
+
+            m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 5);
+            m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 2);
+            m_temp_reg_15 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1);
+            //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36
+        }
+
+        /* e */
+        {
+            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6);
+        }
+
+        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+        /* e1 stored in m_temp_reg_31 */
+        {
+            m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
+        }
+
+        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+        /* e0 stored in m_temp_reg_30 */
+        {
+            m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+        }
+
+        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+        m_count = _mm_cvtsi32_si128(i4_shift);
+
+        /* o1 stored in m_temp_reg_33 */
+        {
+            m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
+        }
+
+        /* e1 + add */
+        {
+            m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+        }
+
+        /* e0 + add */
+        {
+            m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+        }
+
+        /* o0 stored in m_temp_reg_34 */
+        {
+            m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
+        }
+
+        /* Stage 2 outputs */
+        {
+            m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
+            m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
+            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
+            m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
+
+            m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+            m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+            m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+            m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+
+            m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+            m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+            m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+            m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+            m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+            m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+            m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+            m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+        }
+
+        /* Recon and store */
+        {
+            UWORD32 *pu4_dst = (UWORD32 *)pu1_dst;
+
+            m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+            pu1_pred += pred_strd;
+            m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+            //m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
+
+            //m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
+            //m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
+
+            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero);
+            m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero);
+            m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero);
+            m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero);
+
+            m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
+            m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
+
+            m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
+            m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
+
+            m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
+
+            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
+            m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
+            m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
+            m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
+            pu1_dst += dst_strd;
+            pu4_dst = (UWORD32 *)(pu1_dst);
+
+            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+            pu1_dst += dst_strd;
+            pu4_dst = (UWORD32 *)(pu1_dst);
+
+            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+            pu1_dst += dst_strd;
+            pu4_dst = (UWORD32 *)(pu1_dst);
+
+            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+        }
+    }
+}
+
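+/* Illustrative call (hypothetical buffer names): following the pointer
+ * arithmetic above, src_strd is counted in WORD16 coefficients while
+ * pred_strd and dst_strd are counted in UWORD8 pixels.
+ *
+ *     WORD16 ai2_coeffs[4 * 4]; // dequantized 4x4 coefficients
+ *     WORD16 ai2_tmp[4 * 4];    // scratch buffer, unused by this variant
+ *
+ *     ihevc_itrans_recon_4x4_ssse3(ai2_coeffs, ai2_tmp, pu1_pred, pu1_dst,
+ *                                  4, pred_strd, dst_strd, 0, 0);
+ */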
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform (DCT) and reconstruction
+ * for an 8x8 input block
+ *
+ * @par Description:
+ *  Performs inverse transform, adds the prediction data and
+ * clips the output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 8x8 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 8x8 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ *  Output 8x8 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_itrans_recon_8x8_ssse3(WORD16 *pi2_src,
+                                  WORD16 *pi2_tmp,
+                                  UWORD8 *pu1_pred,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 src_strd,
+                                  WORD32 pred_strd,
+                                  WORD32 dst_strd,
+                                  WORD32 zero_cols,
+                                  WORD32 zero_rows)
+{
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_5;
+    __m128i m_temp_reg_6;
+    __m128i m_temp_reg_7;
+    __m128i m_temp_reg_4;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_15;
+    __m128i m_temp_reg_16;
+    __m128i m_temp_reg_17;
+    __m128i m_temp_reg_20;
+    __m128i m_temp_reg_21;
+    __m128i m_temp_reg_22;
+    __m128i m_temp_reg_23;
+    __m128i m_temp_reg_24;
+    __m128i m_temp_reg_25;
+    __m128i m_temp_reg_26;
+    __m128i m_temp_reg_27;
+    __m128i m_temp_reg_30;
+    __m128i m_temp_reg_31;
+    __m128i m_temp_reg_32;
+    __m128i m_temp_reg_33;
+    __m128i m_temp_reg_34;
+    __m128i m_temp_reg_35;
+    __m128i m_temp_reg_36;
+    __m128i m_temp_reg_37;
+    __m128i m_temp_reg_40;
+    __m128i m_temp_reg_41;
+    __m128i m_temp_reg_42;
+    __m128i m_temp_reg_43;
+    __m128i m_temp_reg_44;
+    __m128i m_temp_reg_45;
+    __m128i m_temp_reg_46;
+    __m128i m_temp_reg_47;
+    __m128i m_temp_reg_50;
+    __m128i m_temp_reg_51;
+    __m128i m_temp_reg_52;
+    __m128i m_temp_reg_53;
+    __m128i m_temp_reg_54;
+    __m128i m_temp_reg_55;
+    __m128i m_temp_reg_56;
+    __m128i m_temp_reg_57;
+    __m128i m_temp_reg_60;
+    __m128i m_temp_reg_61;
+    __m128i m_temp_reg_62;
+    __m128i m_temp_reg_63;
+    __m128i m_temp_reg_64;
+    __m128i m_temp_reg_65;
+    __m128i m_temp_reg_66;
+    __m128i m_temp_reg_67;
+    __m128i m_temp_reg_70;
+    __m128i m_temp_reg_71;
+    __m128i m_temp_reg_72;
+    __m128i m_temp_reg_73;
+    __m128i m_temp_reg_74;
+    __m128i m_temp_reg_75;
+    __m128i m_temp_reg_76;
+    __m128i m_temp_reg_77;
+    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+
+    WORD32 check_row_stage_1;   /* set when any of input rows 4-7 is non-zero */
+    WORD32 check_row_stage_2;   /* set when any of input columns 4-7 is non-zero */
+
+    __m128i m_rdng_factor;
+    //__m128i m_count;
+    WORD32 i4_shift = IT_SHIFT_STAGE_1;
+    UNUSED(pi2_tmp);
+
+    check_row_stage_1   = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
+    check_row_stage_2   = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
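+
+    /* NOTE: each set bit of zero_rows/zero_cols marks an all-zero row or
+     * column of pi2_src (inferred from the mask tests above).  When bits
+     * 4-7 are all set, only the first four rows (stage 1) or columns
+     * (stage 2, which runs on the transposed data) carry coefficients, so
+     * half of the butterfly work can be skipped.  For example, assuming
+     * only the DC coefficient is non-zero:
+     *
+     *     zero_rows == 0xFE and zero_cols == 0xFE
+     *     => check_row_stage_1 == 0 and check_row_stage_2 == 0
+     */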
+
+    m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+
+    m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src);
+    pi2_src += src_strd;
+    m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src);
+
+    if(!check_row_stage_2)
+    {
+        if(!check_row_stage_1)
+        {
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                /* Interleave rows 0 and 4 (low halves) */
+                /* coef2 for m_temp_reg_12 and m_temp_reg_13, coef1 for m_temp_reg_10 and m_temp_reg_11 */
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+            }
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            /* since the upper 8 bytes are zero, m_temp_reg_15 and m_temp_reg_17 are not used */
+            {
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+                /* Instructions combined so that they can be eliminated based on zero_rows */
+                /* Interleave rows 2 and 6 (low halves) */
+                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+
+                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+
+                /* e */
+
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+            }
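+
+            /* NOTE: _mm_madd_epi16 multiplies adjacent 16-bit lanes and
+             * sums each product pair into one 32-bit lane.  With rows 0 and
+             * 4 interleaved, a coefficient pair {64, 64} yields
+             * 64*(s0 + s4) per lane and {64, -64} yields 64*(s0 - s4);
+             * {83, 36} and {36, -83} produce eo0 and eo1 from rows 2 and 6
+             * the same way (coefficient layout assumed from the comments
+             * above).
+             */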
+
+            /* o */
+            {
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+
+                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    //o0:1B*89+3B*75,5B*50+7B*18
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+
+
+                    /* Column 0 of destination computed here */
+                    /* It is stored in m_temp_reg_50 */
+                    /* Column 7 of destination computed here */
+                    /* It is stored in m_temp_reg_57 */
+                    /* Upper 8 bytes of both registers are zero due to zero_cols*/
+
+
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_setzero_si128();
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o1:1B*75-3B*18,5B*89+7B*50
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+                    /* Loading coeff for computing o2  in the next block */
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+
+                    /* Column 1 of destination computed here */
+                    /* It is stored in m_temp_reg_51 */
+                    /* Column 6 of destination computed here */
+                    /* It is stored in m_temp_reg_56 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o2:1B*50-3B*89,5B*18+7B*75
+                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+
+
+                    /* Column 2 of destination computed here */
+                    /* It is stored in m_temp_reg_52 */
+                    /* Column 5 of destination computed here */
+                    /* It is stored in m_temp_reg_55 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o3:1B*18-3B*50,5B*75-7B*89
+                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+
+                    /* Column 3 of destination computed here */
+                    /* It is stored in m_temp_reg_53 */
+                    /* Column 4 of destination computed here */
+                    /* It is stored in m_temp_reg_54 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+                }
+            }
+
+            /* Transpose of the destination 8x8 matrix done here */
+            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+            /* respectively */
+            {
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+                //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+                //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+                //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+                //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+                //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+                //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+                //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+                */
+                m_temp_reg_54 = _mm_setzero_si128();
+                m_temp_reg_55 = _mm_setzero_si128();
+                m_temp_reg_56 = _mm_setzero_si128();
+                m_temp_reg_57 = _mm_setzero_si128();
+            }
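+
+            /* NOTE: the unpack cascade above is the usual SSE transpose:
+             * unpacklo_epi16 interleaves row pairs, unpack{lo,hi}_epi32
+             * interleaves those results, and unpack{lo,hi}_epi64 completes
+             * the transpose.  Since columns 4-7 are known to be zero on
+             * this path, only the low halves are gathered and rows 4-7 of
+             * the transpose are simply cleared.
+             */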
+        }
+        else
+        {
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                /* Interleave rows 0 and 4 (low halves) */
+                /* coef2 for m_temp_reg_12 and m_temp_reg_13, coef1 for m_temp_reg_10 and m_temp_reg_11 */
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+            }
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            /* since the upper 8 bytes are zero, m_temp_reg_15 and m_temp_reg_17 are not used */
+            {
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+                /* Instructions combined so that they can be eliminated based on zero_rows */
+                /* Interleave rows 2 and 6 (low halves) */
+                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+
+                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+
+                /* e */
+
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+            }
+
+            /* o */
+            {
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+
+                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                    //o0:1B*89+3B*75,5B*50+7B*18
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+
+
+
+                    /* Column 0 of destination computed here */
+                    /* It is stored in m_temp_reg_50 */
+                    /* Column 7 of destination computed here */
+                    /* It is stored in m_temp_reg_57 */
+                    /* Upper 8 bytes of both registers are zero due to zero_cols*/
+
+
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_setzero_si128();
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o1:1B*75-3B*18,5B*89+7B*50
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+                    /* Loading coeff for computing o2  in the next block */
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+
+
+
+                    /* Column 1 of destination computed here */
+                    /* It is stored in m_temp_reg_51 */
+                    /* Column 6 of destination computed here */
+                    /* It is stored in m_temp_reg_56 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o2:1B*50-3B*89,5B*18+7B*75
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+
+
+                    /* Column 2 of destination computed here */
+                    /* It is stored in m_temp_reg_52 */
+                    /* Column 5 of destination computed here */
+                    /* It is stored in m_temp_reg_55 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o3:1B*18-3B*50,5B*75-7B*89
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+
+
+                    /* Column 3 of destination computed here */
+                    /* It is stored in m_temp_reg_53 */
+                    /* Column 4 of destination computed here */
+                    /* It is stored in m_temp_reg_54 */
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+                }
+            }
+
+            /* Transpose of the destination 8x8 matrix done here */
+            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+            /* respectively */
+            {
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+                //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+                //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+                //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+                //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+                //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+                //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+                //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+                */
+                m_temp_reg_54 = _mm_setzero_si128();
+                m_temp_reg_55 = _mm_setzero_si128();
+                m_temp_reg_56 = _mm_setzero_si128();
+                m_temp_reg_57 = _mm_setzero_si128();
+            }
+        }
+
+        /* Stage 2 */
+        i4_shift = IT_SHIFT_STAGE_2;
+        {
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
+            }
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            {
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
+
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+                /* Loading coeff for computing o0 in the next block */
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
+
+
+
+                /* e */
+
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+            }
+
+            /* o */
+            {
+
+                //m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55,m_temp_reg_57);
+                //m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55,m_temp_reg_57);
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+                    //o0:1B*89+3B*75,1T*89+3T*75
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+                    /* Loading coeff for computing o1 in the next block */
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+
+
+
+                    /* Column 0 of destination computed here */
+                    /* It is stored in m_temp_reg_50 */
+                    /* Column 7 of destination computed here */
+                    /* It is stored in m_temp_reg_57 */
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    //o1:1B*75-3B*18,1T*75-3T*18
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+                    /* Loading coeff for computing o2  in the next block */
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+
+
+
+                    /* Column 1 of destination computed here */
+                    /* It is stored in m_temp_reg_51 */
+                    /* Column 6 of destination computed here */
+                    /* It is stored in m_temp_reg_56 */
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    //o2:1B*50-3B*89,1T*50-3T*89
+                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+
+
+                    /* Column 2 of destination computed here */
+                    /* It is stored in m_temp_reg_52 */
+                    /* Column 5 of destination computed here */
+                    /* It is stored in m_temp_reg_55 */
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    //o3:1B*18-3B*50,1T*18-3T*50
+                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+                    /* Column 3 of destination computed here */
+                    /* It is stored in m_temp_reg_53 */
+                    /* Column 4 of destination computed here */
+                    /* It is stored in m_temp_reg_54 */
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
+                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
+                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
+
+                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
+                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
+                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
+                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+                }
+            }
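+
+            /* For each destination column k in 0..3 the blocks above compute,
+             * in illustrative scalar form with rnd = 1 << (i4_shift - 1):
+             *   dst[k]     = sat16((e[k] + o[k] + rnd) >> i4_shift);
+             *   dst[7 - k] = sat16((e[k] - o[k] + rnd) >> i4_shift);
+             * where sat16() is the signed 32- to 16-bit saturation performed
+             * by _mm_packs_epi32. */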
+
+            /* Transpose of the destination 8x8 matrix done here; the result */
+            /* is finally stored in registers m_temp_reg_10 to m_temp_reg_17 */
+            {
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+            }
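+
+            /* The transpose is the usual three-stage unpack network: epi16
+             * unpacks pair adjacent rows, epi32 unpacks merge 2x2 tiles and
+             * epi64 unpacks assemble whole rows. Minimal sketch of one
+             * quadrant (hypothetical rows r0..r3):
+             *   t0 = _mm_unpacklo_epi16(r0, r1);
+             *   t1 = _mm_unpacklo_epi16(r2, r3);
+             *   q0 = _mm_unpacklo_epi32(t0, t1);  // cols 0-1 of rows 0-3
+             *   q1 = _mm_unpackhi_epi32(t0, t1);  // cols 2-3 of rows 0-3
+             */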
+
+            /* Recon and store */
+            {
+                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+                m_temp_reg_50 = _mm_setzero_si128();
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
+                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
+                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
+                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
+                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
+                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
+                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
+                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
+
+                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
+                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
+                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
+                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
+                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
+                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
+                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
+                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
+
+                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
+                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
+                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
+                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
+                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
+                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
+                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
+                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
+
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
+                pu1_dst += dst_strd;
+            }
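+
+            /* Per pixel the recon above is, in illustrative scalar form:
+             *   dst[x] = clip_u8(pred[x] + resi[x]);
+             * _mm_unpacklo_epi8 against zero widens pred to 16 bit,
+             * _mm_add_epi16 adds the residue and _mm_packus_epi16 performs
+             * the unsigned clip to [0, 255] before each 8-byte store. */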
+        }
+    }
+    else
+    {
+
+        if(!check_row_stage_1)
+        {
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                //Interleaving rows 0 and 4 into temp regs 0 and 1 - Rishab
+                /* coeff2 for m_temp_reg_12 and m_temp_reg_13, coeff1 for m_temp_reg_10 and m_temp_reg_11 */
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+
+                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+            }
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            {
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+                /* Instructions combined here so they can be eliminated based on zero_rows - Lokesh */
+                //Interleaving rows 2 and 6 into temp regs 4 and 5 - Rishab
+                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
+                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+
+
+                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+
+            }
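+
+            /* In this branch the zero-coefficient check lets the products
+             * against m_temp_reg_75/m_temp_reg_77 (rows 5 and 7) be skipped,
+             * so only rows 1 and 3 feed the odd outputs o0..o3 below. */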
+
+            /* e */
+            {
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+            }
+
+            /* o */
+            {
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+
+                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+                    //o0:1B*89+3B*75,1T*89+3T*75
+                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+                }
+
+                /* Column 0 of destination computed here */
+                /* It is stored in m_temp_reg_50 */
+                /* Column 7 of destination computed here */
+                /* It is stored in m_temp_reg_57 */
+                {
+
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o1:1B*75-3B*18,1T*75-3T*18
+                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+                    /* Loading coeff for computing o2  in the next block */
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+
+                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+                }
+
+                /* Column 1 of destination computed here */
+                /* It is stored in m_temp_reg_51 */
+                /* Column 6 of destination computed here */
+                /* It is stored in m_temp_reg_56 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o2:1B*50-3B*89,1T*50-3T*89
+                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+
+                }
+
+                /* Column 2 of destination computed here */
+                /* It is stored in m_temp_reg_52 */
+                /* Column 5 of destination computed here */
+                /* It is stored in m_temp_reg_55 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o3:1B*18-3B*50,1T*18-3T*50
+                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+                }
+
+                /* Column 3 of destination computed here */
+                /* It is stored in m_temp_reg_53 */
+                /* Column 4 of destination computed here */
+                /* It is stored in m_temp_reg_54 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+                }
+            }
+
+            /* Transpose of the destination 8x8 matrix done here; the result */
+            /* is finally stored in registers m_temp_reg_50 to m_temp_reg_57 */
+            {
+
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+            }
+        }
+        else
+        {
+
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                //Interleaving rows 0 and 4 into temp regs 0 and 1 - Rishab
+                /* coeff2 for m_temp_reg_12 and m_temp_reg_13, coeff1 for m_temp_reg_10 and m_temp_reg_11 */
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+
+                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+            }
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            {
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+                /* Instructions combined here so they can be eliminated based on zero_rows - Lokesh */
+                //Interleaving rows 2 and 6 into temp regs 4 and 5 - Rishab
+                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
+                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+
+
+                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+            }
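+
+            /* Each row of g_ai2_ihevc_trans_intr_odd_8 holds one interleaved
+             * 16-bit coefficient pair repeated across the register, so that
+             * _mm_madd_epi16 on interleaved sample rows yields
+             * a*c0 + b*c1 in every 32-bit lane. */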
+
+            /* e */
+            {
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+            }
+
+            /* o */
+            {
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+
+                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+                    m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
+
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+                }
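+
+                /* Note: the _mm_cvtsi32_si128 + _mm_shuffle_epi32 pair above
+                 * is a 32-bit broadcast, equivalent to
+                 *   m_rdng_factor = _mm_set1_epi32(1 << (i4_shift - 1)); */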
+
+                /* Column 0 of destination computed here */
+                /* It is stored in m_temp_reg_50 */
+                /* Column 7 of destination computed here */
+                /* It is stored in m_temp_reg_57 */
+                {
+
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+                    /* Loading coeff for computing o2  in the next block */
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
+                }
+
+                /* Column 1 of destination computed here */
+                /* It is stored in m_temp_reg_51 */
+                /* Column 6 of destination computed here */
+                /* It is stored in m_temp_reg_56 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+                }
+
+                /* Column 2 of destination computed here */
+                /* It is stored in m_temp_reg_52 */
+                /* Column 5 of destination computed here */
+                /* It is stored in m_temp_reg_55 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
+                }
+
+                /* Column 3 of destination computed here */
+                /* It is stored in m_temp_reg_53 */
+                /* Column 4 of destination computed here */
+                /* It is stored in m_temp_reg_54 */
+                {
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+                }
+            }
+
+            /* Transpose of the destination 8x8 matrix done here; the result */
+            /* is finally stored in registers m_temp_reg_50 to m_temp_reg_57 */
+            {
+
+
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+            }
+        }
+        /* Stage 2 */
+
+        i4_shift = IT_SHIFT_STAGE_2;
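+
+        /* Stage 2 operates on the transposed stage-1 output and uses the
+         * larger inverse-transform shift; with the usual HEVC 8-bit values
+         * (assumed here) IT_SHIFT_STAGE_1 == 7 and IT_SHIFT_STAGE_2 == 12,
+         * and each stage rounds with (1 << (i4_shift - 1)) before the
+         * arithmetic right shift. */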
+
+        {
+
+            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+            {
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
+
+                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
+            }
+
+
+            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+            {
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
+
+
+                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+                /* Loading coeff for computing o0 in the next block */
+                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
+                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
+
+            }
+
+            /* e */
+            {
+                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+            }
+
+            /* o */
+            {
+                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
+                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
+
+                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+                {
+                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+                    /* Loading coeff for computing o1 in the next block */
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+                }
+
+                /* Column 0 of destination computed here */
+                /* It is stored in m_temp_reg_50 */
+                /* Column 7 of destination computed here */
+                /* It is stored in m_temp_reg_57 */
+                {
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
+
+                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+                    /* Loading coeff for computing o2  in the next block */
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
+                }
+
+                /* Column 1 of destination computed here */
+                /* It is stored in m_temp_reg_51 */
+                /* Column 6 of destination computed here */
+                /* It is stored in m_temp_reg_56 */
+                {
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
+                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+                    /* Loading coeff for computing o3  in the next block */
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+                }
+
+                /* Column 2 of destination computed here */
+                /* It is stored in m_temp_reg_52 */
+                /* Column 5 of destination computed here */
+                /* It is stored in m_temp_reg_55 */
+                {
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
+                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
+                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
+
+                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+
+                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
+                }
+
+                /* Column 3 of destination computed here */
+                /* It is stored in m_temp_reg_53 */
+                /* Column 4 of destination computed here */
+                /* It is stored in m_temp_reg_54 */
+                {
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
+                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
+                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
+                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
+
+                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
+                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
+                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
+                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
+
+                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+                }
+            }
+
+            /* Transpose of the destination 8x8 matrix done here; the result */
+            /* is finally stored in registers m_temp_reg_10 to m_temp_reg_17 */
+            {
+                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+            }
+
+            /* Recon and store */
+            {
+                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                pu1_pred += pred_strd;
+                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+
+                m_temp_reg_50 = _mm_setzero_si128();
+                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
+                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
+                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
+                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
+                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
+                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
+                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
+                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
+
+                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
+                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
+                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
+                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
+                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
+                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
+                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
+                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
+
+                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
+                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
+                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
+                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
+                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
+                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
+                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
+                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
+
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
+                pu1_dst += dst_strd;
+                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
+                pu1_dst += dst_strd;
+
+            }
+
+
+        }
+
+
+    }
+}
+
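The three unpack stages in the transpose above are the standard SSE2 idiom for transposing an 8x8 block of 16-bit values. A minimal self-contained sketch of the same pattern, assuming only <emmintrin.h> (the function name and in-place array convention here are illustrative, not part of this change):

    #include <emmintrin.h>

    /* Illustrative 8x8 transpose of 16-bit lanes, in place over r[0..7]. */
    /* Mirrors the unpacklo/unpackhi 16 -> 32 -> 64 cascade used above.   */
    static void transpose_8x8_epi16(__m128i r[8])
    {
        /* Stage 1: interleave 16-bit lanes of adjacent rows */
        __m128i a0 = _mm_unpacklo_epi16(r[0], r[1]);
        __m128i a1 = _mm_unpackhi_epi16(r[0], r[1]);
        __m128i a2 = _mm_unpacklo_epi16(r[2], r[3]);
        __m128i a3 = _mm_unpackhi_epi16(r[2], r[3]);
        __m128i a4 = _mm_unpacklo_epi16(r[4], r[5]);
        __m128i a5 = _mm_unpackhi_epi16(r[4], r[5]);
        __m128i a6 = _mm_unpacklo_epi16(r[6], r[7]);
        __m128i a7 = _mm_unpackhi_epi16(r[6], r[7]);

        /* Stage 2: interleave 32-bit pairs */
        __m128i b0 = _mm_unpacklo_epi32(a0, a2);
        __m128i b1 = _mm_unpackhi_epi32(a0, a2);
        __m128i b2 = _mm_unpacklo_epi32(a1, a3);
        __m128i b3 = _mm_unpackhi_epi32(a1, a3);
        __m128i b4 = _mm_unpacklo_epi32(a4, a6);
        __m128i b5 = _mm_unpackhi_epi32(a4, a6);
        __m128i b6 = _mm_unpacklo_epi32(a5, a7);
        __m128i b7 = _mm_unpackhi_epi32(a5, a7);

        /* Stage 3: interleave 64-bit halves; r[i] now holds column i */
        r[0] = _mm_unpacklo_epi64(b0, b4);
        r[1] = _mm_unpackhi_epi64(b0, b4);
        r[2] = _mm_unpacklo_epi64(b1, b5);
        r[3] = _mm_unpackhi_epi64(b1, b5);
        r[4] = _mm_unpacklo_epi64(b2, b6);
        r[5] = _mm_unpackhi_epi64(b2, b6);
        r[6] = _mm_unpacklo_epi64(b3, b7);
        r[7] = _mm_unpackhi_epi64(b3, b7);
    }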
diff --git a/common/x86/ihevc_mem_fns_ssse3_intr.c b/common/x86/ihevc_mem_fns_ssse3_intr.c
new file mode 100644
index 0000000..ca0b77a
--- /dev/null
+++ b/common/x86/ihevc_mem_fns_ssse3_intr.c
@@ -0,0 +1,168 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_mem_fns_ssse3_intr.c
+ *
+ * @brief
+ *  Functions used for memory operations
+ *
+ * @author
+ *  Ittiam
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_mem_fns.h"
+
+#include <immintrin.h>
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *   memcpy of a multiple of 8 bytes (8, 16 or 32)
+ *
+ * @par Description:
+ *   Copies 8-bit data from source to destination; num_bytes is expected to be a multiple of 8
+ *
+ * @param[in] pu1_dst
+ *  UWORD8 pointer to the destination
+ *
+ * @param[in] pu1_src
+ *  UWORD8 pointer to the source
+ *
+ * @param[in] num_bytes
+ *  number of bytes to copy
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+
+
+void ihevc_memcpy_mul_8_ssse3(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
+{
+    int col;
+    for(col = num_bytes; col >= 8; col -= 8)
+    {
+        __m128i src_temp16x8b;
+        src_temp16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+        pu1_src += 8;
+        _mm_storel_epi64((__m128i *)(pu1_dst), src_temp16x8b);
+        pu1_dst += 8;
+    }
+}
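The loop above only moves whole 8-byte groups, so num_bytes is expected to be a multiple of 8. A hypothetical caller sketch (copy_bytes is made up; UWORD8/UWORD32 come from ihevc_typedefs.h) showing how an arbitrary length could be handled with a scalar tail:

    #include <string.h>

    /* Hypothetical caller: SIMD-copy the whole 8-byte groups, then let */
    /* plain memcpy pick up the 0..7 byte remainder.                    */
    static void copy_bytes(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
    {
        UWORD32 n8 = num_bytes & ~(UWORD32)7;   /* largest multiple of 8 */
        ihevc_memcpy_mul_8_ssse3(pu1_dst, pu1_src, n8);
        memcpy(pu1_dst + n8, pu1_src + n8, num_bytes - n8);
    }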
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *   memset of a multiple of 8 bytes (8, 16 or 32)
+ *
+ * @par Description:
+ *   Sets 8-bit data; num_bytes is expected to be a multiple of 8
+ *
+ * @param[in] pu1_dst
+ *  UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ *  UWORD8 value used for memset
+ *
+ * @param[in] num_bytes
+ *  number of bytes to set
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_memset_mul_8_ssse3(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes)
+{
+    int col;
+    __m128i src_temp16x8b;
+    src_temp16x8b = _mm_set1_epi8(value);
+    for(col = num_bytes; col >= 8; col -= 8)
+    {
+        _mm_storel_epi64((__m128i *)(pu1_dst), src_temp16x8b);
+        pu1_dst += 8;
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *   memset of 16-bit data for a multiple of 8 words (8, 16 or 32)
+ *
+ * @par Description:
+ *   Sets 16-bit data; num_words is expected to be a multiple of 8
+ *
+ * @param[in] pu2_dst
+ *  UWORD16 pointer to the destination
+ *
+ * @param[in] value
+ *  UWORD16 value used for memset
+ *
+ * @param[in] num_words
+ *  number of words to set
+ * @returns
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_memset_16bit_mul_8_ssse3(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words)
+{
+    int col;
+    __m128i src_temp16x8b;
+    src_temp16x8b = _mm_set1_epi16((WORD16)value);
+    for(col = num_words; col >= 8; col -= 8)
+    {
+        _mm_storeu_si128((__m128i *)(pu2_dst), src_temp16x8b);
+        pu2_dst += 8;
+    }
+}
+
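A hypothetical usage sketch for the 16-bit variant: num_words counts 16-bit words, eight of which are written per unaligned 16-byte store, so the count is again expected to be a multiple of 8.

    /* Fill a 16-bit scratch buffer with a sentinel value (names made up). */
    static void fill_sentinel(void)
    {
        UWORD16 au2_scratch[64];   /* 64 words: a multiple of 8 */
        ihevc_memset_16bit_mul_8_ssse3(au2_scratch, 0x8000, 64);
    }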
diff --git a/common/x86/ihevc_padding_ssse3_intr.c b/common/x86/ihevc_padding_ssse3_intr.c
new file mode 100644
index 0000000..42ee5ac
--- /dev/null
+++ b/common/x86/ihevc_padding_ssse3_intr.c
@@ -0,0 +1,334 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_padding_ssse3_intr.c
+*
+* @brief
+*  Contains function definitions for Padding
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*   - ihevc_pad_left_luma_ssse3()
+*   - ihevc_pad_left_chroma_ssse3()
+*   - ihevc_pad_right_luma_ssse3()
+*   - ihevc_pad_right_chroma_ssse3()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#include <string.h>
+#include <assert.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_debug.h"
+
+#include <immintrin.h>
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Padding (luma block) at the left of a 2d array
+*
+* @par Description:
+*   The left column of a 2d array is replicated for pad_size times at the left
+*
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_left_luma_ssse3(UWORD8 *pu1_src,
+                               WORD32 src_strd,
+                               WORD32 ht,
+                               WORD32 pad_size)
+{
+    WORD32 row;
+    WORD32 i;
+    UWORD8 *pu1_dst;
+    __m128i const0_16x8b;
+
+    const0_16x8b = _mm_setzero_si128();
+
+    ASSERT(pad_size % 8 == 0);
+
+    for(row = 0; row < ht; row++)
+    {
+        __m128i src_temp0_16x8b;
+
+        src_temp0_16x8b =  _mm_loadu_si128((__m128i *)pu1_src);
+        pu1_dst = pu1_src - pad_size;
+        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+        for(i = 0; i < pad_size; i += 8)
+        {
+            _mm_storel_epi64((__m128i *)(pu1_dst + i), src_temp0_16x8b);
+        }
+        pu1_src += src_strd;
+    }
+
+}
+
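The shuffle above uses an all-zero control register, which broadcasts byte 0, i.e. the left-edge pixel, across all 16 lanes before the 8-byte stores. A scalar reference for the same operation (illustrative only; pad_size keeps the same multiple-of-8 contract):

    /* Scalar equivalent of ihevc_pad_left_luma_ssse3 (illustrative). */
    static void pad_left_luma_ref(UWORD8 *pu1_src, WORD32 src_strd,
                                  WORD32 ht, WORD32 pad_size)
    {
        WORD32 row, col;
        for(row = 0; row < ht; row++)
        {
            for(col = 1; col <= pad_size; col++)
                pu1_src[-col] = pu1_src[0];  /* replicate left-edge pixel */
            pu1_src += src_strd;
        }
    }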
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Padding (chroma block) at the left of a 2d array
+*
+* @par Description:
+*   The left column of a 2d array is replicated for pad_size times at the left
+*
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_left_chroma_ssse3(UWORD8 *pu1_src,
+                                 WORD32 src_strd,
+                                 WORD32 ht,
+                                 WORD32 pad_size)
+{
+    WORD32 row;
+    WORD32 col;
+    UWORD8 *pu1_dst;
+    __m128i const0_16x8b, const1_16x8b;
+    const0_16x8b = _mm_setzero_si128();
+    const1_16x8b = _mm_set1_epi8(1);
+    const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b);
+
+    ASSERT(pad_size % 8 == 0);
+    for(row = 0; row < ht; row++)
+    {
+        __m128i src_temp0_16x8b;
+
+        src_temp0_16x8b =  _mm_loadu_si128((__m128i *)pu1_src);
+        pu1_dst = pu1_src - pad_size;
+        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+
+        for(col = 0; col < pad_size; col += 8)
+        {
+            _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
+        }
+        pu1_src += src_strd;
+    }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (luma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d array is replicated for pad_size times at the right
+*
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_right_luma_ssse3(UWORD8 *pu1_src,
+                                WORD32 src_strd,
+                                WORD32 ht,
+                                WORD32 pad_size)
+{
+    WORD32 row;
+    WORD32 col;
+    UWORD8 *pu1_dst;
+    __m128i const0_16x8b;
+
+    const0_16x8b = _mm_setzero_si128();
+
+    ASSERT(pad_size % 8 == 0);
+
+    for(row = 0; row < ht; row++)
+    {
+        __m128i src_temp0_16x8b;
+
+        src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 1));
+        pu1_dst = pu1_src;
+        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+        for(col = 0; col < pad_size; col += 8)
+        {
+            _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
+        }
+        pu1_src += src_strd;
+    }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (chroma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d array is replicated for pad_size times at the right
+*
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[in] src_strd
+*  integer source stride
+*
+* @param[in] ht
+*  integer height of the array
+*
+* @param[in] pad_size
+*  integer padding size of the array
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_right_chroma_ssse3(UWORD8 *pu1_src,
+                                  WORD32 src_strd,
+                                  WORD32 ht,
+                                  WORD32 pad_size)
+{
+    WORD32 row;
+    WORD32 col;
+    UWORD8 *pu1_dst;
+    __m128i const0_16x8b, const1_16x8b;
+    const0_16x8b = _mm_setzero_si128();
+    const1_16x8b = _mm_set1_epi8(1);
+    const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b);
+
+    ASSERT(pad_size % 8 == 0);
+
+    for(row = 0; row < ht; row++)
+    {
+        __m128i src_temp0_16x8b;
+
+        src_temp0_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src - 2));
+        pu1_dst = pu1_src;
+        src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+        for(col = 0; col < pad_size; col += 8)
+        {
+            _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
+        }
+
+        pu1_src += src_strd;
+    }
+}
+
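For chroma the samples are interleaved U,V, so the shuffle control built by unpacking zeros with ones is the byte pattern 0,1,0,1,..., which replicates an edge U,V pair; the right-pad variant loads from pu1_src - 2 so that lanes 0 and 1 hold the last valid pair. A scalar reference (illustrative only):

    /* Scalar equivalent of ihevc_pad_right_chroma_ssse3 (illustrative). */
    static void pad_right_chroma_ref(UWORD8 *pu1_src, WORD32 src_strd,
                                     WORD32 ht, WORD32 pad_size)
    {
        WORD32 row, col;
        for(row = 0; row < ht; row++)
        {
            UWORD8 u = pu1_src[-2];  /* last valid U sample */
            UWORD8 v = pu1_src[-1];  /* last valid V sample */
            for(col = 0; col < pad_size; col += 2)
            {
                pu1_src[col]     = u;
                pu1_src[col + 1] = v;
            }
            pu1_src += src_strd;
        }
    }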
diff --git a/common/x86/ihevc_platform_macros.h b/common/x86/ihevc_platform_macros.h
new file mode 100644
index 0000000..ae688e6
--- /dev/null
+++ b/common/x86/ihevc_platform_macros.h
@@ -0,0 +1,118 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_platform_macros.h
+*
+* @brief
+*  Platform specific Macro definitions used in the codec
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IHEVC_PLATFORM_MACROS_H_
+#define _IHEVC_PLATFORM_MACROS_H_
+
+//#include <immintrin.h>
+
+
+#define CLIP_U8(x) CLIP3((x), 0,     255)
+#define CLIP_S8(x) CLIP3((x), -128,  127)
+
+#define CLIP_U10(x) CLIP3((x), 0,     1023)
+#define CLIP_S10(x) CLIP3((x), -512,  511)
+
+#define CLIP_U12(x) CLIP3((x), 0,     4095)
+#define CLIP_S12(x) CLIP3((x), -2048,  2047)
+
+#define CLIP_U16(x) CLIP3((x), 0,        65535)
+#define CLIP_S16(x) CLIP3((x), -32768,   32767)
+
+
+
+#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0)
+#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0)
+
+#define SHR_NEG(val,shift)  (((shift) > 0) ? ((val) >> (shift)) : ((val) << -(shift)))
+#define SHL_NEG(val,shift)  (((shift) < 0) ? ((val) >> -(shift)) : ((val) << (shift)))
+
+
+#define ITT_BIG_ENDIAN(x)   (((x) << 24)                   |   \
+                            (((x) & 0x0000ff00) << 8)      |   \
+                            (((x) & 0x00ff0000) >> 8)      |   \
+                            ((UWORD32)(x) >> 24))
+
+
+#define NOP(nop_cnt)    {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);}
+
+#define PLD(a)
+#define INLINE inline
+
+/* Count leading zeros; returns 32 for a zero word */
+static INLINE UWORD32 CLZ(UWORD32 u4_word)
+{
+    if(u4_word)
+        return (__builtin_clz(u4_word));
+    else
+        return 32;
+}
+
+/* Count trailing zeros; note that a zero word is mapped to 31 here */
+static INLINE UWORD32 CTZ(UWORD32 u4_word)
+{
+    if(0 == u4_word)
+        return 31;
+    else
+    {
+        unsigned int index;
+        index = __builtin_ctz(u4_word);
+        return (UWORD32)index;
+    }
+}
+
+#define GCC_ENABLE 1
+
+#if GCC_ENABLE
+#define _mm256_loadu2_m128i(X,Y) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((Y))), _mm_loadu_si128((X)), 1)
+
+#define _mm256_storeu2_m128i(X,Y,Z) {_mm_storeu_si128((Y), _mm256_castsi256_si128((Z))); _mm_storeu_si128((X), _mm256_extracti128_si256((Z), 1));}
+
+#define _mm256_set_m128i(X,Y) _mm256_insertf128_si256(_mm256_castsi128_si256((Y)), (X), 1)
+
+#endif
+
+
+#define PREFETCH_ENABLE 1
+
+#if PREFETCH_ENABLE
+#define PREFETCH(ptr, type) _mm_prefetch(ptr, type)
+#else
+#define PREFETCH(ptr, type)
+#endif
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* _IHEVC_PLATFORM_MACROS_H_ */
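Two hedged usage sketches for the helpers above (both wrapper names are made up): SHR_NEG folds a possibly negative shift amount into the shift direction, and CLZ turns into floor(log2(x)) for nonzero x.

    /* val >> shift when shift > 0, val << -shift otherwise. */
    static WORD32 shift_signed(WORD32 val, WORD32 shift)
    {
        return SHR_NEG(val, shift);
    }

    /* floor(log2(x)) for x != 0; CLZ(0) returns 32, so callers must guard. */
    static UWORD32 floor_log2(UWORD32 x)
    {
        return 31 - CLZ(x);
    }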
diff --git a/common/x86/ihevc_sao_ssse3_intr.c b/common/x86/ihevc_sao_ssse3_intr.c
new file mode 100644
index 0000000..cffd2a9
--- /dev/null
+++ b/common/x86/ihevc_sao_ssse3_intr.c
@@ -0,0 +1,5653 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_sao_ssse3_intr.c
+*
+* @brief
+*  Contains function definitions for sample adaptive offset (SAO) used for
+* in-loop filtering
+*
+* @author
+* 100592
+*
+* @par List of Functions:
+*   - ihevc_sao_band_offset_luma_ssse3()
+*   - ihevc_sao_band_offset_chroma_ssse3()
+*   - ihevc_sao_edge_offset_class0_ssse3()
+*   - ihevc_sao_edge_offset_class0_chroma_ssse3()
+*   - ihevc_sao_edge_offset_class1_ssse3()
+*   - ihevc_sao_edge_offset_class1_chroma_ssse3()
+*   - ihevc_sao_edge_offset_class2_ssse3()
+*   - ihevc_sao_edge_offset_class2_chroma_ssse3()
+*   - ihevc_sao_edge_offset_class3_ssse3()
+*   - ihevc_sao_edge_offset_class3_chroma_ssse3()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_tables_x86_intr.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_sao.h"
+
+#include <immintrin.h>
+
+#define NUM_BAND_TABLE  32
+/**
+*******************************************************************************
+*
+* @brief
+* Has two sets of functions: band offset and edge offset, each for luma and chroma.
+* Edge offset has horizontal, vertical, 135 degree and 45 degree variants.
+*
+* @par Description:
+*
+*
+* @param[in-out] pu1_src
+*  Pointer to the source
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in-out] pu1_src_left
+*  source left boundary
+*
+* @param[in-out] pu1_src_top
+* Source top boundary
+*
+* @param[in-out] pu1_src_top_left
+*  Source top left boundary
+*
+* @param[in] pu1_src_top_right
+*  Source top right boundary
+*
+* @param[in] pu1_src_bot_left
+*  Source bottom left boundary
+*
+* @param[in] pu1_avail
+*  boundary availability flags
+*
+* @param[in] pi1_sao_offset_u
+*  Chroma U sao offset values
+*
+* @param[in] pi1_sao_offset_v
+*  Chroma V sao offset values
+*
+* @param[in] pi1_sao_offset
+*  Luma sao offset values
+*
+* @param[in] wd
+*  width of the source
+*
+* @param[in] ht
+*  height of the source
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
+                                      WORD32 src_strd,
+                                      UWORD8 *pu1_src_left,
+                                      UWORD8 *pu1_src_top,
+                                      UWORD8 *pu1_src_top_left,
+                                      WORD32 sao_band_pos,
+                                      WORD8 *pi1_sao_offset,
+                                      WORD32 wd,
+                                      WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_cpy;
+    WORD32 wd_rem;
+    WORD8 offset = 0;
+
+    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
+    __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
+    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
+    __m128i band_pos_16x8b;
+    __m128i sao_offset;
+    __m128i cmp_mask, cmp_store;
+
+    /* Updating left and top-left and top */
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
+    }
+    pu1_src_top_left[0] = pu1_src_top[wd - 1];
+    for(col = 0; col < wd; col += 8)
+    {
+        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
+        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
+        offset += 8;
+    }
+
+    //replicating sao_band_pos << 3 as a 16 bit value 8 times (packed to 8 bit later)
+
+
+    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
+    //value set for sao_offset extraction
+    tmp_set_128i_1  = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
+    tmp_set_128i_2  = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
+    tmp_set_128i_3  = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
+    tmp_set_128i_4  = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
+
+    //loaded sao offset values
+    sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+    //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
+    band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+    band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+    band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+    band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+    //band_position addition
+    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
+    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
+    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
+    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
+    //sao_offset duplication
+    tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+    tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+    tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+    tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+    //setting for comparison
+    cmp_mask = _mm_set1_epi16(16);
+    cmp_store = _mm_set1_epi16(0x00ff);
+
+    //sao_offset addition
+    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
+    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
+    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
+    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
+    //masking upper 8bit values of each  16 bit band table value
+    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
+    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
+    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
+    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
+
+    switch(sao_band_pos)
+    {
+        case 0:
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
+            band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
+            break;
+        case 28:
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
+            band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
+            break;
+        case 29:
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
+            band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
+            band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
+            break;
+        case 30:
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
+            band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
+            band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
+            break;
+        case 31:
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
+            band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
+            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
+            band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
+            break;
+        default:
+            break;
+    }
+    //sao_offset is reused for zero cmp mask.
+    sao_offset = _mm_setzero_si128();
+    tmp_set_128i_1 = _mm_set1_epi8(1);
+    //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
+    cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
+
+    //masking upper 8bit values of each  16 bit band table value
+    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
+    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
+    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
+    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
+
+    //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
+    band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
+    band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);
+
+    band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
+    band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
+    band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31
+
+    cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
+    //  band_pos_16x8b = _mm_or_si128(band_pos_16x8b,cmp_store);
+
+    for(col = wd; col >= 16; col -= 16)
+    {
+        pu1_src_cpy = pu1_src;
+        for(row = ht; row > 0; row -= 2)
+        {
+
+
+            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+            src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            // row = 1
+            src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+
+            //saturating subtract, 8 bit
+            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
+            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
+            //if the values less than 0 put ff
+            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            //if the values greater than 31 put ff
+            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
+            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+
+
+            //row 0 and row1
+            //if the values >16 then put ff ,cmp_mask = dup16(15)
+            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+            //values 16 to 31 for row 0 & 1 but values <16 ==0
+            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
+            // values 0 to 15 for row 0 & 1
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
+            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
+            //row 2 and  row 3
+            //if the values >16 then put ff ,cmp_mask = dup16(15)
+            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+            //values 16 to 31 for row 2 & 3 but values <16 ==0
+            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
+            // values 0 to 15 for row 2 & 3
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
+            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
+
+            //row 0 and row 1
+            //to preserve pixel values in which no offset needs to be added.
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
+
+            //row 2 and row 3
+            //to preserve pixel values in which no offset needs to be added.
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
+
+            //indexing 0 - 15 bandtable indexes
+            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
+            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
+            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
+            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
+            // combining all offsets results
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            // combining results with the pixel values
+            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+
+
+            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+            _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
+            // row = 1
+            _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);
+
+            pu1_src_cpy += (src_strd << 1);
+        }
+        pu1_src += 16;
+    }
+    wd_rem = wd & 0xF;
+    if(wd_rem)
+    {
+        pu1_src_cpy = pu1_src;
+        for(row = ht; row > 0; row -= 4)
+        {
+
+
+            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+            src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+            // row = 1
+            src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+            // row = 2
+            src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+            // row = 3
+            src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+            //row0 and row1 packed and row2 and row3 packed
+
+            src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
+            src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
+
+            //saturating subtract, 8 bit
+            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
+            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
+            //if the values less than 0 put ff
+            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            //if the values greater than 31 put ff
+            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
+            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+
+
+
+            //row 0 and row1
+            //if the values >16 then put ff ,cmp_mask = dup16(15)
+            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+            //values 16 to 31 for row 0 & 1 but values <16 ==0
+            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
+            // values 0 to 15 for row 0 & 1
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
+            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
+            //row 2 and  row 3
+            //if the values >16 then put ff ,cmp_mask = dup16(15)
+            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+            //values 16 to 31 for row 2 & 3 but values <16 ==0
+            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
+            // values 0 to 15 for row 2 & 3
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
+            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
+
+            //row 0 and row 1
+            //to preserve pixel values in which no offset needs to be added.
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
+
+            //row 2 and row 3
+            //to preserve pixel values in which no offset needs to be added.
+            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
+
+            //indexing 0 - 15 bandtable indexes
+            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
+            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
+            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
+            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
+            // combining all offsets results
+            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            // combining results with the pixel values
+            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+
+            //Getting row1 separately
+            src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
+            //Getting row3 separately
+            src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+            _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
+            // row = 1
+            _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
+            // row = 2
+            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
+            // row = 3
+            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);
+
+            pu1_src_cpy += (src_strd << 2);
+
+        }
+        pu1_src += 8;
+    }
+
+
+}
+
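For reference, the classification that the band tables and shuffles above implement: each pixel is binned by its five MSBs (band = pixel >> 3 for 8-bit content), and only the four bands starting at sao_band_pos, wrapping modulo 32, receive offsets pi1_sao_offset[1..4]. A scalar sketch under those assumptions (illustrative; clipping written inline rather than via CLIP_U8):

    /* Scalar equivalent of the band-offset core (illustrative only). */
    static void sao_band_offset_ref(UWORD8 *pu1_src, WORD32 src_strd,
                                    WORD32 sao_band_pos, WORD8 *pi1_sao_offset,
                                    WORD32 wd, WORD32 ht)
    {
        WORD32 row, col;
        for(row = 0; row < ht; row++)
        {
            for(col = 0; col < wd; col++)
            {
                WORD32 band = pu1_src[col] >> 3;           /* 32 bands of width 8 */
                WORD32 idx  = (band - sao_band_pos) & 31;  /* wrap modulo 32      */
                if(idx < 4)
                {
                    WORD32 pel = pu1_src[col] + pi1_sao_offset[idx + 1];
                    pu1_src[col] = (UWORD8)(pel < 0 ? 0 : (pel > 255 ? 255 : pel));
                }
            }
            pu1_src += src_strd;
        }
    }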
+void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
+                                        WORD32 src_strd,
+                                        UWORD8 *pu1_src_left,
+                                        UWORD8 *pu1_src_top,
+                                        UWORD8 *pu1_src_top_left,
+                                        WORD32 sao_band_pos_u,
+                                        WORD32 sao_band_pos_v,
+                                        WORD8 *pi1_sao_offset_u,
+                                        WORD8 *pi1_sao_offset_v,
+                                        WORD32 wd,
+                                        WORD32 ht)
+{
+    WORD32 row, col;
+    WORD8 offset = 0;
+
+
+    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
+    __m128i cmp_msk2;
+    __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
+    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
+    __m128i band_pos_u_16x8b, band_pos_v_16x8b;
+    __m128i sao_offset;
+    __m128i cmp_mask;
+
+
+    /* Updating left and top and top-left */
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
+        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
+    }
+    pu1_src_top_left[0] = pu1_src_top[wd - 2];
+    pu1_src_top_left[1] = pu1_src_top[wd - 1];
+    for(col = 0; col < wd; col += 8)
+    {
+        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
+        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
+        offset += 8;
+    }
+
+    { // band table creation
+        __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
+        // Band table for U component : band_table0_16x8b and band_table2_16x8b
+        //replicating sao_band_pos as 8 bit value 16 times
+        band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
+        //value set for sao_offset extraction
+        tmp_set_128i_1  = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
+        tmp_set_128i_2  = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
+        tmp_set_128i_3  = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
+        tmp_set_128i_4  = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
+
+        //loaded sao offset values
+        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+
+        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
+        band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+        band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+        //band_position addition
+        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
+        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
+        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
+        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
+        //sao_offset duplication
+        temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+        temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+        temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+        temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+
+        //sao_offset addition
+        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
+        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
+        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
+        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
+        //reuse for clipping
+        temp1_8x16b = _mm_set1_epi16(0x00ff);
+        //setting for comparison
+        cmp_mask = _mm_set1_epi16(16);
+
+        //masking upper 8bit values of each  16 bit band table value
+        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
+        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
+        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+
+        //temp1_8x16b reuse for compare storage
+        switch(sao_band_pos_u)
+        {
+            case 0:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
+                band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
+                break;
+            case 28:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
+                break;
+            case 29:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
+                band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
+                break;
+            case 30:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
+                band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
+                break;
+            case 31:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
+                band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
+                break;
+            default:
+                break;
+        }
+        //masking upper 8bit values of each  16 bit band table value
+        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
+        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
+        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+        //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
+        band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
+        band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
+        // Band table for U component over
+
+        // Band table for V component : band_table1_16x8b and band_table3_16x8b
+        // replicating sao_band_pos as 8 bit value 16 times
+        band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));
+
+        //loaded sao offset values
+        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+
+        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
+        temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+        temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+        //band_position addition
+        temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
+        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
+        temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
+        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
+        //sao_offset duplication
+        tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+        tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+        tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+        tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+
+        //sao_offset addition
+        temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
+        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
+        temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
+        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);
+
+        //masking upper 8bit values of 16 bit band table value
+        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
+        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
+        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+        //temp1_8x16b reuse for compare storage
+
+        switch(sao_band_pos_v)
+        {
+            case 0:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
+                temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
+                break;
+            case 28:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
+                break;
+            case 29:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
+                temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
+                break;
+            case 30:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
+                temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
+                break;
+            case 31:
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
+                temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
+                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
+                break;
+            default:
+                break;
+        }
+        //masking upper 8bit values of each  16 bit band table value
+        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
+        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
+        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+        //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
+        band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
+        band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
+        //band table for u and v created
+    }
+    {
+        UWORD8 *pu1_src_cpy;
+        WORD32 wd_rem;
+
+
+        //sao_offset is reused for zero cmp mask.
+        sao_offset = _mm_setzero_si128();
+        tmp_set_128i_1 = _mm_set1_epi8(1);
+        //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
+        cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
+        //to avoid ffff being saturated to 0; it should saturate to ff
+
+        cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
+        band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
+        band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
+        cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31
+
+        cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
+
+        for(col = wd; col >= 16; col -= 16)
+        {
+            pu1_src_cpy = pu1_src;
+            for(row = ht; row > 0; row -= 2)
+            {
+                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+                // row = 1
+                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+                //odd values
+                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+                //even values
+                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
+                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
+                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+                //combining odd values
+                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                //combining even values
+                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
+
+                //saturating subtract, 8 bit
+                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
+                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
+                //if the values less than 0 put ff
+                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+                //if the values greater than 31 put ff
+                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
+                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+                // registers reused to increase performance
+                //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
+                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+                //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
+                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+
+                //values 16 to 31 for row 0 & 1 but values <16 ==0
+                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
+                // values 0 to 15 for row 0 & 1
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
+                //values 16 to 31 for row 2 & 3 but values <16 ==0
+                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
+                // values 0 to 15 for row 2 & 3
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
+
+                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
+                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and  row 3
+                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
+                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
+
+
+                //to choose which pixel values to preserve in row 0 and row 1
+                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+                //to choose which pixel values to preserve in row 2 and row 3
+                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+                //values of all rows to which no offset needs to be added preserved.
+                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
+                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
+
+                //indexing 0 - 15 bandtable indexes
+                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
+                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
+                //indexing 16 -31 bandtable indexes
+                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
+                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
+                // combining all offsets results
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
+                // combining results with the pixel values
+                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+                //reorganising even and odd values
+                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
+                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
+
+
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
+                // row = 1
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);
+
+
+                pu1_src_cpy += (src_strd << 1);
+
+            }
+            pu1_src += 16;
+        }
+
+        wd_rem = wd & 0xF;
+        if(wd_rem)
+        {
+            pu1_src_cpy = pu1_src;
+            for(row = ht; row > 0; row -= 4)
+            {
+                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+                // row = 1
+                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+                // row = 3
+                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+                //rows 0 and 1 packed, rows 2 and 3 packed
+
+                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
+                src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
+                //odd values
+                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+                //even values
+                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
+                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
+                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+                //combining odd values
+                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                //combining even values
+                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
+
+                //subtract band position (8 bit)
+                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
+                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
+                //if the value is less than 0, put ff
+                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+                //if the value is greater than 31, put ff
+                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
+                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+                // registers reused to increase performance
+                //if the value is >15 put ff; cmp_mask = dup16(15), rows 0 and 1
+                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+                //if the value is >15 put ff; cmp_mask = dup16(15), rows 2 and 3
+                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+
+                //values 16 to 31 for rows 0 & 1; values <16 zeroed
+                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
+                // values 0 to 15 for rows 0 & 1; values >15 set to ff
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
+                //values 16 to 31 for rows 2 & 3; values <16 zeroed
+                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
+                // values 0 to 15 for rows 2 & 3; values >15 set to ff
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
+
+                //values 16 to 31 for rows 0 & 1; values <16 masked to ff
+                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+                //values 16 to 31 for rows 2 & 3; values <16 masked to ff
+                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
+                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
+
+
+                //to choose which pixel values to preserve in row 0 and row 1
+                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+                //to choose which pixel values to preserve in row 2 and row 3
+                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+                //values of all rows to which no offset needs to be added are preserved.
+                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
+                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
+
+                //indexing 0 - 15 bandtable indexes
+                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
+                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
+                //indexing 16 - 31 bandtable indexes
+                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
+                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
+                // combining all offset results
+                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
+                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
+                // combining results with the pixel values
+                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+                //reorganising even and odd values
+                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
+                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
+                //Getting row1 separately
+                src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+                //Getting row3 separately
+                src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
+                // row = 2
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
+                // row = 3
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);
+
+                pu1_src_cpy += (src_strd << 2);
+
+            }
+            pu1_src += 16;
+        }
+
+
+    }
+}
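+
+/* For reference, a scalar sketch of what the chroma band-offset kernel above
+ * computes per interleaved U/V pair. This is illustrative only: CLIP3 and the
+ * sao_band_pos_u/sao_band_pos_v names and the offset-indexing convention are
+ * assumptions, not verified definitions from this file.
+ *
+ *     WORD32 band_u = (pu1_src[col]     >> 3) - sao_band_pos_u;
+ *     WORD32 band_v = (pu1_src[col + 1] >> 3) - sao_band_pos_v;
+ *     if(band_u >= 0 && band_u <= 3)
+ *         pu1_src[col]     = CLIP3(pu1_src[col]     + pi1_sao_offset_u[band_u + 1], 0, 255);
+ *     if(band_v >= 0 && band_v <= 3)
+ *         pu1_src[col + 1] = CLIP3(pu1_src[col + 1] + pi1_sao_offset_v[band_v + 1], 0, 255);
+ *
+ * The vector version gets the same effect by subtracting the band position up
+ * front and indexing precomputed band tables with _mm_shuffle_epi8, so all 16
+ * lanes are translated in a handful of instructions.
+ */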
+
+
+
+void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
+                                        WORD32 src_strd,
+                                        UWORD8 *pu1_src_left,
+                                        UWORD8 *pu1_src_top,
+                                        UWORD8 *pu1_src_top_left,
+                                        UWORD8 *pu1_src_top_right,
+                                        UWORD8 *pu1_src_bot_left,
+                                        UWORD8 *pu1_avail,
+                                        WORD8 *pi1_sao_offset,
+                                        WORD32 wd,
+                                        WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
+    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
+    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
+    UWORD8 u1_avail0, u1_avail1;
+    WORD32 wd_rem;
+    WORD32 offset = 0;
+    __m128i src_temp0_16x8b, src_temp1_16x8b;
+    __m128i left0_16x8b, left1_16x8b;
+    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
+    __m128i edge0_16x8b, edge1_16x8b;
+    __m128i au1_mask8x16b;
+    __m128i edge_idx_8x16b, sao_offset_8x16b;
+    __m128i const2_16x8b, const0_16x8b;
+    __m128i left_store_16x8b;
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+
+    au1_mask8x16b = _mm_set1_epi8(0xff);
+
+    /* Update  top and top-left arrays */
+
+    *pu1_src_top_left = pu1_src_top[wd - 1];
+
+    for(col = wd; col >= 16; col -= 16)
+    {
+        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
+        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
+        offset += 16;
+    }
+
+    //setting availability mask to 0xFF for MAX_CTB_SIZE bytes
+    for(col = 0; col < MAX_CTB_SIZE; col += 16)
+        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+    for(row = 0; row < ht; row++)
+    {
+        au1_src_left_tmp[row] = pu1_src_left[row];
+    }
+    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+    //availability mask creation
+    u1_avail0 = pu1_avail[0];
+    u1_avail1 = pu1_avail[1];
+    au1_mask[0] = u1_avail0;
+    au1_mask[wd - 1] = u1_avail1;
+
+    const2_16x8b = _mm_set1_epi8(2);
+    const0_16x8b = _mm_setzero_si128();
+    pu1_src_left_cpy = au1_src_left_tmp;
+    pu1_src_left_str = au1_src_left_tmp1;
+    {
+        au1_mask_cpy = au1_mask;
+        for(col = wd; col >= 16; col -= 16)
+        {
+            pu1_src_cpy = pu1_src;
+            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+            //pu1_src_left_cpy =au1_src_left_tmp;
+            for(row = ht; row > 0; row -= 2)
+            {
+
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+                //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
+                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+                // row = 1
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
+                //row 1 left
+                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
+                //row 0 left
+                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+
+
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+                //row = 0 right
+                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
+                // row = 1 right
+                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+                //combining sign_left and sign_right
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturating pack
+                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+                pu1_src_cpy += (src_strd << 1);
+                pu1_src_left_cpy += 2;
+                pu1_src_left_str += 2;
+            }
+            au1_mask_cpy += 16;
+            pu1_src += 16;
+            pu1_src_left_cpy -= ht;
+            pu1_src_left_str -= ht;
+
+            pu1_left_tmp = pu1_src_left_cpy;
+            pu1_src_left_cpy = pu1_src_left_str;
+            pu1_src_left_str = pu1_left_tmp;
+        }
+
+        wd_rem = wd & 0xF;
+        if(wd_rem)
+        {
+
+            cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
+            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);
+
+            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
+            pu1_src_cpy = pu1_src;
+            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+            //pu1_src_left_cpy =au1_src_left_tmp;
+            for(row = ht; row > 0; row -= 4)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+                // row = 1
+                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+                // row  = 2
+                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+                // row = 3
+                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
+                //row 3 left
+                edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
+                cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+                //row 2 left
+                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+                //row 1 left
+                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
+                cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+                //row 0 left
+                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+
+                // packing rows together for 16-wide SIMD operations
+                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
+                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
+                // packing rows together for 16-wide SIMD operations
+                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
+                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);
+
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+                //row = 0 right
+                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
+                // row = 1 right
+                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
+                // row = 2 right
+                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
+                // row = 3 right
+                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
+                // packing rows together for 16-wide SIMD operations
+                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
+                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);
+
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+                //combining sign_left and sign_right
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturating pack
+                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+                //separating row 1 and row 3
+                cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+                cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
+                // row = 2
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
+                // row = 3
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);
+
+                pu1_src_cpy += (src_strd << 2);
+                pu1_src_left_cpy += 4;
+                pu1_src_left_str += 4;
+            }
+            pu1_src += wd;
+            pu1_src_left_cpy -= ht;
+            pu1_src_left_str -= ht;
+
+            pu1_left_tmp = pu1_src_left_cpy;
+            pu1_src_left_cpy = pu1_src_left_str;
+            pu1_src_left_str = pu1_left_tmp;
+        }
+        for(row = 0; row < ht; row++)
+        {
+            pu1_src_left[row] = pu1_src_left_cpy[row];
+        }
+    }
+}
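+
+/* A scalar sketch of the class-0 (horizontal) edge-offset decision implemented
+ * above, and of the byte-wise sign trick used throughout. Illustrative only:
+ * SIGN and CLIP3 are assumed helper macros, not verified definitions here.
+ *
+ *     WORD32 sign_left  = SIGN(pu1_src[col] - pu1_src[col - 1]);
+ *     WORD32 sign_right = SIGN(pu1_src[col] - pu1_src[col + 1]);
+ *     WORD32 edge_idx   = gi1_table_edge_idx[2 + sign_left + sign_right];
+ *     pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, 255);
+ *
+ * The vector code computes SIGN(a - b) per byte without widening: the two
+ * saturating subtractions _mm_subs_epu8(a, b) and _mm_subs_epu8(b, a) are each
+ * compared against zero, giving 0x00/0xFF masks, and subtracting one mask from
+ * the other yields exactly +1, -1 or 0 in every lane.
+ */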
+
+
+void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_src_left,
+                                               UWORD8 *pu1_src_top,
+                                               UWORD8 *pu1_src_top_left,
+                                               UWORD8 *pu1_src_top_right,
+                                               UWORD8 *pu1_src_bot_left,
+                                               UWORD8 *pu1_avail,
+                                               WORD8 *pi1_sao_offset_u,
+                                               WORD8 *pi1_sao_offset_v,
+                                               WORD32 wd,
+                                               WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
+    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
+    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
+    UWORD8 u1_avail0, u1_avail1;
+    WORD32 wd_rem;
+    WORD32 offset = 0;
+
+    __m128i src_temp0_16x8b, src_temp1_16x8b;
+    __m128i left0_16x8b, left1_16x8b;
+    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+    __m128i edge0_16x8b, edge1_16x8b;
+    __m128i au1_mask8x16b;
+    __m128i edge_idx_8x16b, sao_offset_8x16b;
+    __m128i const2_16x8b, const0_16x8b;
+    __m128i left_store_16x8b;
+    __m128i chroma_offset_8x16b;
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+
+    au1_mask8x16b = _mm_set1_epi8(0xff);
+
+    /* Update  top and top-left arrays */
+    pu1_src_top_left[0] = pu1_src_top[wd - 2];
+    pu1_src_top_left[1] = pu1_src_top[wd - 1];
+
+    for(col = wd; col >= 16; col -= 16)
+    {
+        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
+        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
+        offset += 16;
+    }
+    for(row = 0; row < 2 * ht; row++)
+    {
+        au1_src_left_tmp[row] = pu1_src_left[row];
+    }
+    //setting availability mask to 0xFF for MAX_CTB_SIZE bytes
+    for(col = 0; col < MAX_CTB_SIZE; col += 16)
+        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+
+    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
+    //availability mask creation
+    u1_avail0 = pu1_avail[0];
+    u1_avail1 = pu1_avail[1];
+    au1_mask[0] = u1_avail0;
+    au1_mask[1] = u1_avail0;
+    au1_mask[wd - 1] = u1_avail1;
+    au1_mask[wd - 2] = u1_avail1;
+    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+    const2_16x8b = _mm_set1_epi8(2);
+    const0_16x8b = _mm_setzero_si128();
+
+    {
+        pu1_src_left_cpy = au1_src_left_tmp;
+        pu1_src_left_str = au1_src_left_tmp1;
+        au1_mask_cpy = au1_mask;
+        for(col = wd; col >= 16; col -= 16)
+        {
+            pu1_src_cpy = pu1_src;
+            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+
+            for(row = ht; row > 0; row -= 2)
+            {
+
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+                //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
+                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+                // row = 1
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
+                //row 1 left
+                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
+                //row 0 left
+                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+
+
+                //separating +ve and -ve values, row 0 left
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //separating +ve and -ve values, row 1 left
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+                //row = 0 right
+                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
+                // row = 1 right
+                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+                //separating +ve and -ve values, row 0 right
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //separating +ve and -ve values, row 1 right
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //combining sign_left and sign_right
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturating pack
+                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+                pu1_src_cpy += (src_strd << 1);
+                pu1_src_left_cpy += 4;
+                pu1_src_left_str += 4;
+            }
+            au1_mask_cpy += 16;
+            pu1_src += 16;
+            pu1_src_left_cpy -= 2 * ht;
+            pu1_src_left_str -= 2 * ht;
+
+            pu1_left_tmp = pu1_src_left_cpy;
+            pu1_src_left_cpy = pu1_src_left_str;
+            pu1_src_left_str = pu1_left_tmp;
+        }
+
+        wd_rem = wd & 0xF;
+        if(wd_rem)
+        {
+
+            cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
+            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);
+
+            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
+            pu1_src_cpy = pu1_src;
+            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+
+            for(row = ht; row > 0; row -= 4)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+                // row = 1
+                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+                // row  = 2
+                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+                // row = 3
+                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
+                //row 3 left
+                edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
+                left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+                //row 2 left
+                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+
+
+                // packing rows together for 16-wide SIMD operations
+                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
+                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);
+
+                //row 1 left
+                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
+                edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+                //row 0 left
+                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+                // packing rows together for 16-wide SIMD operations
+                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
+                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);
+
+                //separating +ve and -ve values for rows 2 and 3
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+
+
+
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+                //row = 0 right
+                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
+                // row = 1 right
+                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
+                // row = 2 right
+                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
+                // row = 3 right
+                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
+                // packing rows together for 16-wide SIMD operations
+                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
+                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);
+
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //combining sign_left and sign_right
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturating pack
+                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+                //separating row 1 and row 3
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+                cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                // row = 2
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
+                // row = 3
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+                pu1_src_cpy += (src_strd << 2);
+                pu1_src_left_cpy += 8;
+                pu1_src_left_str += 8;
+            }
+            pu1_src += wd;
+            pu1_src_left_cpy -= 2 * ht;
+            pu1_src_left_str -= 2 * ht;
+
+            pu1_left_tmp = pu1_src_left_cpy;
+            pu1_src_left_cpy = pu1_src_left_str;
+            pu1_src_left_str = pu1_left_tmp;
+        }
+        for(row = 0; row < 2 * ht; row++)
+        {
+            pu1_src_left[row] = pu1_src_left_cpy[row];
+        }
+    }
+
+}
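+
+/* Note on the interleaved-chroma lookup used above: sao_offset_8x16b packs the
+ * U offsets into bytes 0..7 and the V offsets into bytes 8..15 (the
+ * _mm_unpacklo_epi64 during setup). chroma_offset_8x16b, _mm_set1_epi16(0x0800),
+ * is the byte pattern 00,08,00,08,..., so adding it leaves the even (U) lanes'
+ * shuffle indices unchanged and biases the odd (V) lanes' indices by 8. A
+ * single _mm_shuffle_epi8 then fetches U offsets for U samples and V offsets
+ * for V samples in one pass. The scalar equivalent, with a hypothetical
+ * packed_offsets[] = {U[0..7], V[0..7]}, is:
+ *
+ *     offset = packed_offsets[edge_idx + ((col & 1) << 3)];
+ */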
+
+
+void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
+                                        WORD32 src_strd,
+                                        UWORD8 *pu1_src_left,
+                                        UWORD8 *pu1_src_top,
+                                        UWORD8 *pu1_src_top_left,
+                                        UWORD8 *pu1_src_top_right,
+                                        UWORD8 *pu1_src_bot_left,
+                                        UWORD8 *pu1_avail,
+                                        WORD8 *pi1_sao_offset,
+                                        WORD32 wd,
+                                        WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_top_cpy;
+    UWORD8 *pu1_src_cpy;
+    WORD32 wd_rem;
+
+
+    __m128i src_top_16x8b, src_bottom_16x8b;
+    __m128i src_temp0_16x8b, src_temp1_16x8b;
+    __m128i signup0_16x8b, signdwn1_16x8b;
+    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+    __m128i edge0_16x8b, edge1_16x8b;
+    __m128i edge_idx_8x16b, sao_offset_8x16b;
+    __m128i const2_16x8b, const0_16x8b;
+
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+
+
+    /* Updating left and top-left  */
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
+    }
+    *pu1_src_top_left = pu1_src_top[wd - 1];
+
+
+
+    pu1_src_top_cpy = pu1_src_top;
+    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+    /* Update height and source pointers based on the availability flags */
+    if(0 == pu1_avail[2])
+    {
+        pu1_src_top_cpy = pu1_src;
+        pu1_src += src_strd;
+        ht--;
+    }
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+    }
+
+    const2_16x8b = _mm_set1_epi8(2);
+    const0_16x8b = _mm_setzero_si128();
+
+    {
+        WORD32 ht_rem;
+        for(col = wd; col >= 16; col -= 16)
+        {
+            pu1_src_cpy = pu1_src;
+            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+            //row = 0
+            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+            for(row = ht; row >= 2; row -= 2)
+            {
+
+                //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row1-row0
+                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+
+                //row1 -bottom
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //combining sign_up and sign_down
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+                //for the next iteration signup0_16x8b = -signdwn1_16x8b
+                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //copying the next top
+                src_top_16x8b = src_temp1_16x8b;
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+            }
+            ht_rem = ht & 0x1;
+
+            if(ht_rem)
+            {
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                //current row -next row
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom and constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                //copying the next top
+                src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+            }
+            //updating top row buffer
+            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 16;
+        }
+
+        wd_rem = wd & 0xF;
+        if(wd_rem)
+        {
+            pu1_src_cpy = pu1_src;
+            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+            //row = 0
+            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+            for(row = ht; row >= 4; row -= 4)
+            {
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //row1-row0
+                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //row1 -row2
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //packing row 0 and row 1
+                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+                //row = 3
+                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+                // row = 4
+                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
+                //separating +ve and -ve values (2,3)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down)
+                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
+                //separating +ve and -ve values.(3,4)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
+                //pairing the (3-4) and (2-3) sign vectors
+                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
+
+                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+
+                //packing row 2 and row 3
+                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+                //for the next iteration signup0_16x8b = -signdwn1_16x8b
+                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
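+                /* The sum of the two signs lies in [-2, 2]; adding 2 maps it
+                 * to [0, 4], which indexes gi1_table_edge_idx to get the SAO
+                 * edge category, and a second pshufb then picks that
+                 * category's offset from the preloaded offset table. */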
+                //the next top already in src_top_16x8b
+                //src_top_16x8b = src_temp1_16x8b;
+                //convert to 16 bit, then add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
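+                /* Offset application, in scalar terms (a reference sketch):
+                 *   pix = CLIP3(pix + offset, 0, 255);
+                 * cmpgt against zero builds sign bytes so the unpacks
+                 * sign-extend the 8-bit offsets to 16 bit while the pixels
+                 * are zero-extended; the adds are then exact in 16 bit and
+                 * packus saturates the results back to [0, 255]. */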
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
+                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                //row = 2
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+                // row = 3
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+                src_temp0_16x8b = src_temp1_16x8b;
+                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+                pu1_src_cpy += (src_strd << 2);
+
+            }
+            ht_rem = ht & 0x2;
+            if(ht_rem)
+            {
+
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row1-row0
+                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //row1 -row2
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //adding top and bottom subtraction
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                //for the next iteration signup0_16x8b = -signdwn1_16x8b
+                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
+                src_top_16x8b = src_temp1_16x8b;
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //the next top already in src_top_16x8b
+                //convert to 16 bit, then add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+
+            }
+            ht_rem = ht & 0x1;
+            if(ht_rem)
+            {
+
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom subtraction
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, then add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                pu1_src_cpy += (src_strd);
+
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+            }
+            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 8;
+        }
+    }
+}
+
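+/* The chroma variant below follows the luma routine above, with two
+ * differences: samples are interleaved as U0 V0 U1 V1 ..., so the left and
+ * top-left buffers are updated two bytes at a time, and the U and V offset
+ * tables are packed into a single register so that one pshufb serves both
+ * planes (see the chroma_offset note inside the function). */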
+void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_src_left,
+                                               UWORD8 *pu1_src_top,
+                                               UWORD8 *pu1_src_top_left,
+                                               UWORD8 *pu1_src_top_right,
+                                               UWORD8 *pu1_src_bot_left,
+                                               UWORD8 *pu1_avail,
+                                               WORD8 *pi1_sao_offset_u,
+                                               WORD8 *pi1_sao_offset_v,
+                                               WORD32 wd,
+                                               WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_top_cpy;
+    UWORD8 *pu1_src_cpy;
+    WORD32 wd_rem;
+
+
+    __m128i src_top_16x8b, src_bottom_16x8b;
+    __m128i src_temp0_16x8b, src_temp1_16x8b;
+    __m128i signup0_16x8b, signdwn1_16x8b;
+    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+    __m128i edge0_16x8b, edge1_16x8b;
+    __m128i edge_idx_8x16b, sao_offset_8x16b;
+    __m128i const2_16x8b, const0_16x8b;
+    __m128i chroma_offset_8x16b;
+
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+
+    /* Updating left and top and top-left */
+    for(row = 0; row < ht; row++)
+    {
+        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
+        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
+    }
+    pu1_src_top_left[0] = pu1_src_top[wd - 2];
+    pu1_src_top_left[1] = pu1_src_top[wd - 1];
+
+
+
+    pu1_src_top_cpy = pu1_src_top;
+    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
+    /* Update height and source pointers based on the availability flags */
+    if(0 == pu1_avail[2])
+    {
+        pu1_src_top_cpy = pu1_src;
+        pu1_src += src_strd;
+        ht--;
+    }
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+    }
+    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+    const2_16x8b = _mm_set1_epi8(2);
+    const0_16x8b = _mm_setzero_si128();
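+    /* sao_offset_8x16b now holds the U offsets in bytes 0..7 and the V
+     * offsets in bytes 8..15. chroma_offset_8x16b is 0x0800 in every 16-bit
+     * lane, i.e. 0x00 on even (U) bytes and 0x08 on odd (V) bytes; adding it
+     * to the edge indices steers the interleaved V samples to the upper half
+     * of the table, so a single pshufb looks up both planes. */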
+
+
+    {
+        WORD32 ht_rem;
+
+
+
+        for(col = wd; col >= 16; col -= 16)
+        {
+            pu1_src_cpy = pu1_src;
+            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+            //row = 0
+            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+            for(row = ht; row >= 2; row -= 2)
+            {
+
+                //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row1-row0
+                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+
+                //row1 -bottom
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //combining sign_up and sign_down
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+                //for the next iteration signup0_16x8b = -signdwn1_16x8b
+                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //copying the next top
+                src_top_16x8b = src_temp1_16x8b;
+
+
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, then add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+            }
+            ht_rem = ht & 0x1;
+
+            if(ht_rem)
+            {
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                //current row -next row
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom and constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                //copying the next top
+                src_top_16x8b = src_temp0_16x8b;
+
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, then add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+            }
+            //updating the top row buffer
+            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 16;
+        }
+
+        wd_rem = wd & 0xF;
+        if(wd_rem)
+        {
+            pu1_src_cpy = pu1_src;
+            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+            //row = 0
+            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+            for(row = ht; row >= 4; row -= 4)
+            {
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //row1-row0
+                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //row1 -row2
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //packing row 0 and row 1
+                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+                //row = 3
+                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+                // row = 4
+                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
+                //separating +ve and -ve values.(2,3)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down)
+                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
+                //separating +ve and -ve values.(3,4)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
+                //pairing the (3-4) and (2-3) sign vectors
+                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
+
+                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+
+                //packing row 2 and row 3
+                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+                //for the next iteration signup0_16x8b = -signdwn1_16x8b
+                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //the next top already in src_top_16x8b
+                //convert to 16 bit, then add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
+                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                //row = 2
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+                // row = 3
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+                src_temp0_16x8b = src_temp1_16x8b;
+                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+                pu1_src_cpy += (src_strd << 2);
+
+            }
+            ht_rem = ht & 0x2;
+            if(ht_rem)
+            {
+
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row1-row0
+                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //row1 -row2
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //adding top and bottom subtraction
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                //for the next iteration signup0_16x8b = -signdwn1_16x8b
+                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
+                src_top_16x8b = src_temp1_16x8b;
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                //the next top already in src_top_16x8b
+                //convert to 16 bit, then add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+
+            }
+            ht_rem = ht & 0x1;
+            if(ht_rem)
+            {
+
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom subtraction
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                src_top_16x8b = src_temp0_16x8b;
+
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, then add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                pu1_src_cpy += (src_strd);
+
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+            }
+            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 8;
+        }
+    }
+}
+
+/* 135 degree filtering */
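+/* For the 135 degree direction each pixel is compared against its top-left
+ * and bottom-right neighbours; hence the rows loaded at an offset of +1 and
+ * the running left-column register (left_store_16x8b), from which palignr
+ * reconstructs the top-left diagonal without extra unaligned loads. */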
+void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
+                                        WORD32 src_strd,
+                                        UWORD8 *pu1_src_left,
+                                        UWORD8 *pu1_src_top,
+                                        UWORD8 *pu1_src_top_left,
+                                        UWORD8 *pu1_src_top_right,
+                                        UWORD8 *pu1_src_bot_left,
+                                        UWORD8 *pu1_avail,
+                                        WORD8 *pi1_sao_offset,
+                                        WORD32 wd,
+                                        WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
+    UWORD8 *pu1_firstleft;
+    UWORD8 *pu1_src_cpy, *pu1_src_org;
+    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
+    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
+    WORD32 wd_rem;
+    UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
+    WORD32 ht_tmp, ht_0;
+
+    WORD32 bit_depth;
+    UWORD8 u1_avail0, u1_avail1;
+
+    __m128i src_top_16x8b, src_bottom_16x8b;
+    __m128i src_temp0_16x8b, src_temp1_16x8b;
+    __m128i signup0_16x8b, signdwn1_16x8b;
+    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+    __m128i edge0_16x8b, edge1_16x8b;
+    __m128i au1_mask8x16b;
+    __m128i edge_idx_8x16b, sao_offset_8x16b;
+    __m128i const2_16x8b, const0_16x8b;
+    __m128i left_store_16x8b;
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+
+    ht_0 = ht; ht_tmp = ht;
+    au1_mask8x16b = _mm_set1_epi8(0xff);
+
+    //setting availability mask to 0xFF for all MAX_CTB_SIZE bytes
+    for(col = 0; col < MAX_CTB_SIZE; col += 16)
+        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+    for(row = 0; row < ht; row++)
+    {
+        au1_src_left_tmp[row] = pu1_src_left[row];
+    }
+    bit_depth = BIT_DEPTH_LUMA;
+    pu1_src_org = pu1_src;
+    pu1_src_top_cpy = pu1_src_top;
+    pu1_src_left_cpy2 = au1_src_left_tmp;
+    pu1_src_left_cpy = au1_src_left_tmp;
+    pu1_src_left_str2 = au1_src_left_tmp1;
+    pu1_src_left_str = au1_src_left_tmp1;
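+    /* Two scratch copies of the left column are kept and swapped after each
+     * 16-wide column pass: SAO must be computed from unfiltered neighbours,
+     * so the current pass reads the old left pixels from one buffer while
+     * writing the next pass's left pixels into the other. */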
+    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+
+    /* If top-left is available, process separately */
+    if(0 != pu1_avail[4])
+    {
+        WORD8 edge_idx;
+
+        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
+                        SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_0_tmp = pu1_src[0];
+        }
+    }
+    else
+    {
+        u1_pos_0_0_tmp = pu1_src[0];
+    }
+
+    /* If bottom-right is available, process separately */
+    if(0 != pu1_avail[7])
+    {
+        WORD8 edge_idx;
+
+        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
+                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
+        }
+    }
+    else
+    {
+        u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
+    }
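+    /* The (0,0) and (wd-1,ht-1) corner pixels are filtered in scalar code
+     * and saved here, to be written back after the vector loops: their
+     * diagonal neighbours depend on the top-left/bottom-right availability
+     * flags, which the SIMD path does not track per pixel. */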
+    pu1_firstleft = pu1_src_top_left;
+
+    /* Update height and source pointers based on the availability flags */
+    if(0 == pu1_avail[2])
+    {
+        pu1_firstleft = pu1_src_left_cpy2;
+        pu1_src_left_cpy2++;
+        pu1_src_left_str2++;
+        pu1_src_top_cpy = pu1_src;
+        pu1_src += src_strd;
+        ht--;
+    }
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+        ht_0--;
+    }
+    //storing top left in an xmm register
+    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
+    const2_16x8b = _mm_set1_epi8(2);
+    const0_16x8b = _mm_setzero_si128();
+    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
+    //update top-left
+    *pu1_src_top_left = pu1_src_top[wd - 1];
+    //availability mask creation
+    u1_avail0 = pu1_avail[0];
+    u1_avail1 = pu1_avail[1];
+    au1_mask[0] = u1_avail0;
+    au1_mask[wd - 1] = u1_avail1;
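+    /* au1_mask is 0xFF everywhere except possibly its first and last byte,
+     * which take the left/right availability flags (0 or 0xFF as set by the
+     * caller): ANDing the mask with the looked-up edge indices forces
+     * category 0 (zero offset) on the boundary column when the neighbouring
+     * block is unavailable. */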
+    {
+        WORD32 ht_rem;
+
+
+        pu1_src_left_cpy = pu1_src_left_cpy2;
+        pu1_src_left_str = pu1_src_left_str2;
+        au1_mask_cpy = au1_mask;
+        for(col = wd; col >= 16; col -= 16)
+        {
+            pu1_src_cpy = pu1_src;
+            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+            //row = 0
+            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
+            //loading the mask
+            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+            for(row = ht; row >= 2; row -= 2)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 1 right
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+                //to insert left in row 0
+                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //manipulation for row 1 - row 0
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+                //row1-row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                // row = 2 right
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+
+                //row1 -bottom
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                // row = 2
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //combining sign_up and sign_down
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+                //storing the row 1 left for next row.
+                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+
+                //combining sign_up and sign_down
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+                //manipulation for bottom - row 1
+                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
+                //eliminating old left for row 0 and row 1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+                //bottom - row1
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //for the next iteration bottom -row1
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row 1: capture its rightmost pixel as the left pixel for the next block
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //row 0: capture its rightmost pixel as the left pixel for the next block
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+                //copying the next top
+                src_top_16x8b = src_temp1_16x8b;
+                //convert to 16 bit, then add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+                //store left boundary
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+                pu1_src_left_cpy += 2;
+                pu1_src_left_str += 2;
+            }
+            ht_rem = ht & 0x1;
+
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+                //current row -next row
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom and constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                //eliminating old left for row 0 and row 1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //row 0: capture its rightmost pixel as the left pixel for the next block
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+                //copying the next top
+                src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, then add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+                //store left boundary
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                pu1_src_cpy += (src_strd);
+                pu1_src_left_cpy += 1;
+                pu1_src_left_str += 1;
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+                pu1_src_left_str[0] = pu1_src_cpy[15];
+            }
+            if(0 == pu1_avail[2])
+            {
+                pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
+            }
+
+            //for the top left of next part of the block
+            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+            //updating the top row buffer
+            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 16;
+            au1_mask_cpy += 16;
+
+
+            pu1_left_tmp = pu1_src_left_cpy2;
+            pu1_src_left_cpy2 = pu1_src_left_str2;
+            pu1_src_left_str2 = pu1_left_tmp;
+
+            pu1_src_left_cpy = pu1_src_left_cpy2;
+            pu1_src_left_str = pu1_src_left_str2;
+        }
+
+        wd_rem = wd & 0xF;
+        if(wd_rem)
+        {
+            pu1_src_left_cpy = pu1_src_left_cpy2;
+            pu1_src_left_str = pu1_src_left_str2;
+            pu1_src_cpy = pu1_src;
+            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+            //row = 0
+            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
+            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the mask
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //preparing au1_mask
+            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+
+            for(row = ht; row >= 4; row -= 4)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 16 pixel values from 15:0 pos. (8 pixels plus the right neighbour are used)
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+                //right row1
+                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+                //manipulation for row 1 -row 0
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row 0 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row 1 -row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row1-row0
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //right row2
+                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+                //packing row 0 n row 1
+                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+                //row1 -row2
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //manipulation for row 2 -row 1
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
+                //row 1 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+                //row = 3
+                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+                // row = 4
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+                //separating +ve and -ve values (2,1)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+                //manipulation for row 3 -row 2
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row 2 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+                //combining the appropriate sign change
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+                //separating +ve and -ve values (3,2)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //right row3
+                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+                //separating +ve and -ve values (2,3)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+                //right row 4
+                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 1);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                //separating +ve and -ve values (3, bottom)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+                //manipulation for bottom -row 3
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
+                //eliminating old left for row 0,1,2,3
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+                //packing row 2 and row 3
+                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+                //row 3 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
+                //loading row 3 right into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
+                //adding bottom and top values of row 2 and row 3
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+                //separating +ve and -ve values (bottom, 3)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+                //to store right of row 2
+                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+                //storing right of row 2 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+                //to store right of row 0
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                //storing right of row 1 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+
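+                //edge_idx = 2 + sign(cur - top_left) + sign(cur - bot_right),
+                //remapped through gi1_table_edge_idx by the pshufb below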
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+
+                //storing right of row 0 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
+                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                //row = 2
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+                // row = 3
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+                src_temp0_16x8b = src_temp1_16x8b;
+                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+                pu1_src_cpy += (src_strd << 2);
+                pu1_src_left_cpy += 4;
+                pu1_src_left_str += 4;
+            }
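+            //mop up the rows the 4-row loop could not cover: a 2-row pass,
+            //then a single-row pass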
+            ht_rem = ht & 0x2;
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //row 0 -row 1
+                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+                //manipulation for row 1 -row 0
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row 0 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //row1-row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row 1 -bottom
+                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //row1 -bottom
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //manipulation for bottom -row1
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
+                //inserting row 1 left for bottom - row 1
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+                //adding the up and down sign terms
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                //bottom - row 1
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+                //eliminating old left for row 0,1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //for the next iteration signup0_16x8b
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+                //storing right of row 1 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+                //for storing right of row 0
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+                src_top_16x8b = src_temp1_16x8b;
+                //storing right of row 0 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //the next top already in  src_top_16x8b
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+                pu1_src_left_cpy += 2;
+                pu1_src_left_str += 2;
+            }
+            ht_rem = ht & 0x1;
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                //left store manipulation 1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+                //row 0 -row1
+                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding the up and down sign terms
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                //for row 0 right to put into left store
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+                //filling the left boundary value
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                pu1_src_cpy += (src_strd);
+                pu1_src_left_cpy += 1;
+                pu1_src_left_str += 1;
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+                pu1_src_left_str[0] = pu1_src_cpy[7];
+            }
+
+            if(0 == pu1_avail[2])
+            {
+                pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
+            }
+
+            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 8;
+            au1_mask_cpy += 16;
+
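+            //ping-pong the left-column buffers: the one written this strip
+            //becomes the one read by the next 8-pixel strip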
+            pu1_left_tmp = pu1_src_left_cpy2;
+            pu1_src_left_cpy2 = pu1_src_left_str2;
+            pu1_src_left_str2 = pu1_left_tmp;
+
+            pu1_src_left_cpy = pu1_src_left_cpy2;
+            pu1_src_left_str = pu1_src_left_str2;
+        }
+        pu1_src_org[0] = u1_pos_0_0_tmp;
+        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
+        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
+        for(row = 0; row < ht_tmp; row++)
+        {
+            pu1_src_left[row] = pu1_src_left_cpy[row];
+        }
+    }
+
+}
+
+/* 135 degree filtering */
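+/* Chroma is interleaved U,V: horizontal steps below move in 2-byte pairs,
+ * the left/top arrays hold U,V pairs, and separate U and V offset tables
+ * are selected per lane via chroma_offset_8x16b (see below). */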
+void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_src_left,
+                                               UWORD8 *pu1_src_top,
+                                               UWORD8 *pu1_src_top_left,
+                                               UWORD8 *pu1_src_top_right,
+                                               UWORD8 *pu1_src_bot_left,
+                                               UWORD8 *pu1_avail,
+                                               WORD8 *pi1_sao_offset_u,
+                                               WORD8 *pi1_sao_offset_v,
+                                               WORD32 wd,
+                                               WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
+    UWORD8 *pu1_firstleft;
+    UWORD8 *pu1_src_cpy, *pu1_src_org;
+    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
+    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
+    WORD32 wd_rem;
+    UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
+    WORD32 ht_tmp;
+    WORD32 ht_0;
+
+    WORD32 bit_depth;
+    UWORD8 u1_avail0, u1_avail1;
+
+    __m128i src_temp0_16x8b, src_temp1_16x8b;
+    __m128i signup0_16x8b, signdwn1_16x8b;
+    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+    __m128i edge0_16x8b, edge1_16x8b;
+    __m128i src_top_16x8b, src_bottom_16x8b;
+    __m128i au1_mask8x16b;
+    __m128i edge_idx_8x16b, sao_offset_8x16b;
+    __m128i const2_16x8b, const0_16x8b;
+    __m128i left_store_16x8b;
+    __m128i chroma_offset_8x16b;
+
+    UNUSED(pu1_src_top_right);
+    UNUSED(pu1_src_bot_left);
+
+    ht_0 = ht; ht_tmp = ht;
+    au1_mask8x16b = _mm_set1_epi8(0xff);
+    /* Copy the left column (interleaved U,V) to a scratch buffer */
+    for(row = 0; row < 2 * ht; row++)
+    {
+        au1_src_left_tmp[row] = pu1_src_left[row];
+    }
+    //setting availability mask to 0xFF for MAX_CTB_SIZE bytes
+    for(col = 0; col < MAX_CTB_SIZE; col += 16)
+        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+    bit_depth = BIT_DEPTH_LUMA;
+    pu1_src_org = pu1_src;
+    pu1_src_top_cpy = pu1_src_top;
+    pu1_src_left_cpy2 = au1_src_left_tmp;
+    pu1_src_left_cpy = au1_src_left_tmp;
+    pu1_src_left_str2 = au1_src_left_tmp1;
+    pu1_src_left_str = au1_src_left_tmp1;
+    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
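+    //sao_offset_8x16b will hold the U offsets in its low 8 bytes and the V
+    //offsets in its high 8 bytes (unpacklo_epi64 below); chroma_offset_8x16b
+    //is the byte pattern 00,08,00,08,... which biases the shuffle indices of
+    //the odd (V) lanes into the upper half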
+
+    /* If top-left is available, process separately */
+    if(0 != pu1_avail[4])
+    {
+        WORD32 edge_idx;
+
+        /* U */
+        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
+                        SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_0_tmp_u = pu1_src[0];
+        }
+
+        /* V */
+        edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
+                        SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_0_tmp_v = pu1_src[1];
+        }
+    }
+    else
+    {
+        u1_pos_0_0_tmp_u = pu1_src[0];
+        u1_pos_0_0_tmp_v = pu1_src[1];
+    }
+
+    /* If bottom-right is available, process separately */
+    if(0 != pu1_avail[7])
+    {
+        WORD32 edge_idx;
+
+        /* U */
+        edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
+                        SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+        }
+
+        /* V */
+        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
+                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+        }
+    }
+    else
+    {
+        u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+        u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+    }
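+    //the corner results stay in u1_pos_0_0_tmp_u/v and u1_pos_wd_ht_tmp_u/v
+    //and are written back after the vector loops, which do not handle the
+    //diagonal neighbours across the block boundary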
+    pu1_firstleft = pu1_src_top_left;
+
+    /* Update height and source pointers based on the availability flags */
+    if(0 == pu1_avail[2])
+    {
+        pu1_firstleft = pu1_src_left_cpy2;
+        pu1_src_left_cpy2 += 2;
+        pu1_src_left_str2 += 2;
+        pu1_src_top_cpy = pu1_src;
+        pu1_src += src_strd;
+        ht--;
+    }
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+        ht_0--;
+    }
+    //storing top left in an xmm register
+    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
+    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+    const2_16x8b = _mm_set1_epi8(2);
+    const0_16x8b = _mm_setzero_si128();
+    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+
+    //availability mask creation
+    u1_avail0 = pu1_avail[0];
+    u1_avail1 = pu1_avail[1];
+    au1_mask[0] = u1_avail0;
+    au1_mask[1] = u1_avail0;
+    au1_mask[wd - 1] = u1_avail1;
+    au1_mask[wd - 2] = u1_avail1;
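+    //two mask bytes per side because each pixel is a U,V pair; a masked-out
+    //index lands on entry 0 of the offset table, i.e. no offset is applied
+    //at unavailable boundaries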
+
+    /* save the top row's last U,V pair as the next top-left */
+    pu1_src_top_left[0] = pu1_src_top[wd - 2];
+    pu1_src_top_left[1] = pu1_src_top[wd - 1];
+    {
+        WORD32 ht_rem;
+        au1_mask_cpy = au1_mask;
+
+        pu1_src_left_cpy = pu1_src_left_cpy2;
+        pu1_src_left_str = pu1_src_left_str2;
+        for(col = wd; col >= 16; col -= 16)
+        {
+            pu1_src_cpy = pu1_src;
+            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+            //row = 0
+            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
+            //loading the mask
+            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+            for(row = ht; row >= 2; row -= 2)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 1 right
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+                //to insert left in row 0
+                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //manipulation for row 1 - row 0
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+                //row1-row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                 // row = 2 right
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+
+                //row1 -bottom
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                // row = 2
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //combining the up and down sign terms
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+                //storing the row 1 left for next row.
+                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+
+                //combining the up and down sign terms
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+                //manipulation for bottom - row 1
+                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
+                //eliminating old left for row 0 and row 1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+                //bottom - row1
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //for the next iteration bottom -row1
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row 1: its right U,V pair goes into left for the next iteration
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
+                //copying the next top
+                src_top_16x8b = src_temp1_16x8b;
+                //row 0: its right U,V pair goes into left for the next iteration
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+                //store left boundary
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+                pu1_src_left_cpy += 4;
+                pu1_src_left_str += 4;
+            }
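+            //odd height: one last row, reusing the sign_up carried out of
+            //the loop above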
+            ht_rem = ht & 0x1;
+
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+                //current row -next row
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom sign terms and constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+                //eliminating old left for row 0
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+                //copying the next top
+                src_top_16x8b = src_temp0_16x8b;
+                //row 0: its right U,V pair goes into left for the next block
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                pu1_src_cpy += (src_strd);
+                pu1_src_left_cpy += 2;
+                pu1_src_left_str += 2;
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+                pu1_src_left_str[1] = pu1_src_cpy[15];
+                pu1_src_left_str[0] = pu1_src_cpy[14];
+            }
+            if(0 == pu1_avail[2])
+            {
+                pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
+                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
+            }
+
+            //for the top left of next part of the block
+            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+            //updating top flag
+            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 16;
+            au1_mask_cpy += 16;
+
+            pu1_left_tmp = pu1_src_left_cpy2;
+            pu1_src_left_cpy2 = pu1_src_left_str2;
+            pu1_src_left_str2 = pu1_left_tmp;
+
+            pu1_src_left_cpy = pu1_src_left_cpy2;
+            pu1_src_left_str = pu1_src_left_str2;
+        }
+        wd_rem = wd & 0xF;
+        if(wd_rem)
+        {
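+            //residual 8-byte column (4 U,V pairs): two rows per 128-bit
+            //register, four rows per iteration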
+            pu1_src_left_cpy = pu1_src_left_cpy2;
+            pu1_src_left_str = pu1_src_left_str2;
+            pu1_src_cpy = pu1_src;
+            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+            //row = 0
+            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
+            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the availability mask (low 8 bytes)
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //preparing au1_mask
+            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+
+            for(row = ht; row >= 4; row -= 4)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+                //right row1
+                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+                //manipulation for row 1 -row 0
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row 0 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row 1 -row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row1-row0
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //right row2
+                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+                //packing row 0 and row 1
+                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+                //row1 -row2
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //manipulation for row 2 -row 1
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
+                //row 1 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+                //row = 3
+                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+                // row = 4
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+                //separating +ve and -ve values (2,1)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+                //manipulation for row 3 -row 2
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row 2 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+                //combining the appropriate sign change
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+                //separating +ve and -ve values (3,2)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //right row3
+                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+                //separating +ve and -ve values (2,3)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+                //right row 4
+                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 2);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                //separating +ve and -ve values (3, bottom)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+                //manipulation for bottom -row 3
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
+                //eliminating old left for row 0,1,2,3
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
+                //packing row 2 and row 3
+                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+                //row 3 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
+
+                //adding bottom and top values of row 2 and row 3
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+                //separating +ve and -ve values (bottom, 3)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+                //to store right of row 2
+                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+                //loading row 3 right into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
+                //storing right of row 2 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+                //to store right of row 0
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                //storing right of row 1 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+                //storing right of row 0 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
+                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                //row = 2
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+                // row = 3
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+                src_temp0_16x8b = src_temp1_16x8b;
+                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+                pu1_src_cpy += (src_strd << 2);
+                pu1_src_left_cpy += 8;
+                pu1_src_left_str += 8;
+            }
+            ht_rem = ht & 0x2;
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //row 0 -row 1
+                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+                //manipulation for row 1 -row 0
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row 0 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //row1-row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row 1 -bottom
+                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //row1 -bottom
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //manipulation for bottom -row1
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
+                //eliminating old left for row 0,1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+                //inserting row 1 left for bottom - row 1
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+                //adding the up and down sign terms
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                //bottom - row 1
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+                //shifting row 1
+                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //for the next iteration signup0_16x8b
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+                //storing right of row 1 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+                //for storing right of row 0
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                //the next top  in  src_top_16x8b
+                src_top_16x8b = src_temp1_16x8b;
+                //storing right of row 0 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                //the next top is already in src_top_16x8b
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+                pu1_src_left_cpy += 4;
+                pu1_src_left_str += 4;
+            }
+            ht_rem = ht & 0x1;
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+                //row 0 -row1
+                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding the top and bottom sign differences
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+                //for row 0 right to put into left store
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                //left store manipulation 1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+                src_top_16x8b = src_temp0_16x8b;
+                //filling the left boundary value
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);  //zero the upper 8 bytes:
+                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);  //only one row is valid here
+
+
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                pu1_src_cpy += (src_strd);
+                pu1_src_left_cpy += 2;
+                pu1_src_left_str += 2;
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+                pu1_src_left_str[1] = pu1_src_cpy[7];
+                pu1_src_left_str[0] = pu1_src_cpy[6];
+            }
+
+            if(0 == pu1_avail[2])
+            {
+                pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
+                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
+            }
+
+            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 8;
+
+            pu1_left_tmp = pu1_src_left_cpy2;
+            pu1_src_left_cpy2 = pu1_src_left_str2;
+            pu1_src_left_str2 = pu1_left_tmp;
+
+            pu1_src_left_cpy = pu1_src_left_cpy2;
+            pu1_src_left_str = pu1_src_left_str2;
+        }
+        pu1_src_org[0] = u1_pos_0_0_tmp_u;
+        pu1_src_org[1] = u1_pos_0_0_tmp_v;
+        pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
+        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
+        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
+        for(row = 0; row < 2 * ht_tmp; row++)
+        {
+            pu1_src_left[row] = pu1_src_left_cpy[row];
+        }
+    }
+
+}
+
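+/* The class 3 (45-degree diagonal) kernels below use the top-right and
+ * bottom-left neighbours. A minimal scalar sketch of the per-pixel rule they
+ * implement, assuming the usual gi1_table_edge_idx mapping {1, 2, 0, 3, 4}:
+ *
+ *     edge_idx = gi1_table_edge_idx[2 + SIGN(cur - top_right)
+ *                                     + SIGN(cur - bot_left)];
+ *     if(edge_idx)
+ *         cur = CLIP3(cur + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ *
+ * The vector code evaluates this for 16 pixels per row at a time. */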
+void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
+                                        WORD32 src_strd,
+                                        UWORD8 *pu1_src_left,
+                                        UWORD8 *pu1_src_top,
+                                        UWORD8 *pu1_src_top_left,
+                                        UWORD8 *pu1_src_top_right,
+                                        UWORD8 *pu1_src_bot_left,
+                                        UWORD8 *pu1_avail,
+                                        WORD8 *pi1_sao_offset,
+                                        WORD32 wd,
+                                        WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
+    UWORD8 *pu1_src_cpy, *pu1_src_org;
+    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
+    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
+    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+    WORD32 wd_rem;
+    UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
+    WORD32 ht_tmp;
+    WORD32 bit_depth;
+    UWORD8 u1_avail0, u1_avail1;
+
+    __m128i src_top_16x8b, src_bottom_16x8b;
+    __m128i src_temp0_16x8b, src_temp1_16x8b;
+    __m128i signup0_16x8b, signdwn1_16x8b;
+    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+    __m128i edge0_16x8b, edge1_16x8b;
+    __m128i au1_mask8x16b;
+    __m128i edge_idx_8x16b, sao_offset_8x16b;
+    __m128i const2_16x8b, const0_16x8b;
+    __m128i left_store_16x8b;
+
+    ht_tmp = ht;
+    au1_mask8x16b = _mm_set1_epi8(0xff);
+
+    au1_src_left_tmp[0] = pu1_src[(wd - 1)];
+    //manipulation for bottom left
+    for(row = 1; row < ht; row++)
+    {
+        au1_src_left_tmp[row] = pu1_src_left[row];
+    }
+    au1_src_left_tmp[ht] = pu1_src_bot_left[0];
+
+    *pu1_src_top_left = pu1_src_top[wd - 1];
+    //setting availability mask to 0xFF for size MAX_CTB_SIZE
+    for(col = 0; col < MAX_CTB_SIZE; col += 16)
+        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+    bit_depth = BIT_DEPTH_LUMA;
+    pu1_src_org = pu1_src;
+    pu1_src_top_cpy = pu1_src_top;
+    pu1_src_left_cpy2 = au1_src_left_tmp;
+    pu1_src_left_cpy = au1_src_left_tmp;
+    pu1_src_left_str2 = au1_src_left_tmp1;
+    pu1_src_left_str = au1_src_left_tmp1;
+    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+    /* If top-right is available, process separately */
+    if(0 != pu1_avail[5])
+    {
+        WORD32 edge_idx;
+
+        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
+                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_0_tmp = pu1_src[wd - 1];
+        }
+    }
+    else
+    {
+        u1_pos_wd_0_tmp = pu1_src[wd - 1];
+    }
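+    /* Illustrative arithmetic for the corner handling above (hypothetical
+     * values): with pu1_src[wd - 1] = 100, top-right = 98 and bottom-left =
+     * 103, edge_idx = 2 + SIGN(2) + SIGN(-3) = 2, which gi1_table_edge_idx
+     * maps to 0 (no edge), so the pixel is left unchanged. */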
+
+    /* If bottom-left is available, process separately */
+    if(0 != pu1_avail[6])
+    {
+        WORD32 edge_idx;
+
+        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
+                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
+        }
+    }
+    else
+    {
+        u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
+    }
+
+
+
+    /* Update height and source pointers based on the availability flags */
+    if(0 == pu1_avail[2])
+    {
+        pu1_src_left_cpy2++;
+        pu1_src_left_str2++;
+        pu1_src_top_cpy = pu1_src;
+        pu1_src += src_strd;
+        ht--;
+    }
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+    }
+
+
+    const2_16x8b = _mm_set1_epi8(2);
+    const0_16x8b = _mm_setzero_si128();
+
+
+    //availability mask creation
+    u1_avail0 = pu1_avail[0];
+    u1_avail1 = pu1_avail[1];
+    au1_mask[0] = u1_avail0;
+    au1_mask[wd - 1] = u1_avail1;
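+    /* The first and last mask bytes are taken from the left/right
+     * availability flags, so when a neighbour CTB is unavailable the AND
+     * with au1_mask zeroes the edge index at that column and the pixel
+     * passes through with the zero offset. */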
+    {
+        WORD32 ht_rem;
+
+        pu1_src_left_cpy = pu1_src_left_cpy2;
+        pu1_src_left_str = pu1_src_left_str2;
+        au1_mask_cpy = au1_mask;
+        for(col = wd; col >= 16; col -= 16)
+        {
+            pu1_src_cpy = pu1_src;
+            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
+            //row = 0
+            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+
+            //loading the mask
+            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
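+            /* SIGN() is computed branchlessly above: _mm_subs_epu8(a, b) is
+             * zero iff a <= b, so comparing both saturating differences with
+             * zero gives 0x00/0xFF masks whose byte-wise difference is
+             * exactly +1, 0 or -1 per pixel. */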
+
+            for(row = ht; row >= 2; row -= 2)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+                //row = 1
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                //to insert left in row 1
+                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+                // row = 0 right
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
+
+                //manipulation for row 1 - row 0
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+                //combining sign-left and sign_right
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+                //row1-row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+                // row = 2
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+                // row = 1 right
+                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+                //bottom - row1
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //for the next iteration bottom -row1
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //to insert left in row 1
+                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
+                //manipulation for row 1 - bottom
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+
+                //row1 -bottom
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //combining sign-left and sign_right
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+                //eliminating old left for row 0 and row 1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+
+                //row 1: saving the right edge for the left of the next block
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //row 0: saving the right edge for the left of the next block
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+                //copying the next top
+                src_top_16x8b = src_temp1_16x8b;
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+                //store left boundary
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+                pu1_src_left_cpy += 2;
+                pu1_src_left_str += 2;
+            }
+            ht_rem = ht & 0x1;
+
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                //to insert left in row 1
+                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+                //manipulation for row 1 - row 0
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+
+                //current row -next row
+                //separating +ve and and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom and constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                //eliminating old left for row 0 and row 1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //row 0: saving the right edge for the left of the next block
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+                //copying the next top
+                src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+                //store left boundary
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                pu1_src_cpy += (src_strd);
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_left_cpy++;
+                pu1_src_left_str++;
+            }
+            {   //for bottom right
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+            }
+            //for the top left of next part of the block
+            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+            //updating top flag
+            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 16;
+            au1_mask_cpy += 16;
+
+            pu1_left_tmp = pu1_src_left_cpy2;
+            pu1_src_left_cpy2 = pu1_src_left_str2;
+            pu1_src_left_str2 = pu1_left_tmp;
+
+            pu1_src_left_cpy = pu1_src_left_cpy2;
+            pu1_src_left_str = pu1_src_left_str2;
+        }
+
+        wd_rem = wd & 0xF;
+        if(wd_rem)
+        {
+            pu1_src_cpy = pu1_src;
+            pu1_src_left_cpy = pu1_src_left_cpy2;
+            pu1_src_left_str = pu1_src_left_str2;
+            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
+            //row = 0
+            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the mask (lower 8 bytes)
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //preparing au1_mask
+            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+
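+            /* In this 8-pixel-wide remainder path four rows are handled per
+             * iteration: pixel rows and their sign vectors are packed into
+             * the low/high 8-byte halves of one register (slli/alignr/
+             * unpacklo_epi64) so the 16-byte edge lookup and offset shuffle
+             * are reused unchanged. */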
+            for(row = ht; row >= 4; row -= 4)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+                //manipulation for row 0 -row 1
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
+                //row 1 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //manipulation for row 1 - row 0
+                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row 1 -row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row1-row0
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //manipulation for row 1 -row 2
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
+                //row 2 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+                //packing row 0 n row 1
+                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+                //row1 -row2
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+                //row 1 right
+                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+                //row = 3
+                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+                // row = 4
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+                //separating +ve and -ve values. (2,1)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row 2 right
+                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+                //combining the appropriate sign change
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+                //separating +ve and -ve values. (3,2)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //manipulation for row 2 -row 3
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
+                //row 3 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+                //separating +ve and -ve values. (2,3)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+                //manipulation for row 3 -bottom
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 11);
+                //bottom left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                //separating +ve and -ve values. (3,bottom)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+
+                //eliminating old left for row 0,1,2,3
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+                //packing row 2 n row 3
+                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+                //row 3 right
+                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
+                //loading row 3 right into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
+                //adding bottom and top values of row 2 and row 3
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+                //separating +ve and -ve values. (bottom,3)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+                //to store right of row 2
+                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+                //storing right of row 2 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+                //to store right of row 0
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                //storing right of row 1 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+
+                //storing right of row 0 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
+                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                //row = 2
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+                // row = 3
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+                src_temp0_16x8b = src_temp1_16x8b;
+                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+                pu1_src_cpy += (src_strd << 2);
+                pu1_src_left_cpy += 4;
+                pu1_src_left_str += 4;
+            }
+            ht_rem = ht & 0x2;
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //manipulation for row 0 -row 1
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
+                //bottom left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //manipulation for row 1 - row 0
+                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //row1-row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //manipulation for row 1 -bottom
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
+                //bottom left
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //row1 -bottom
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //manipulation for bottom- row 1 (row 1 right)
+                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+                //adding the top and bottom sign differences
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                //bottom - row 1
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+                //eliminating old left for row 0,1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //for the next iteration signup0_16x8b
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+                //storing right of row 1 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+                //for storing right of row 1
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+                src_top_16x8b = src_temp1_16x8b;
+                //storing right of row 0 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //the next top is already in src_top_16x8b
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+                pu1_src_left_cpy += 2;
+                pu1_src_left_str += 2;
+            }
+            ht_rem = ht & 0x1;
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+                //manipulation for row 0 -bottom
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
+                //bottom left
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding the top and bottom sign differences
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                //for row 0 right to put into left store
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);  //zero the upper 8 bytes:
+                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);  //only one row is valid here
+                //left store manipulation 1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+                //filling the left boundary value
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                pu1_src_cpy += (src_strd);
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_left_cpy++;
+                pu1_src_left_str++;
+            }
+            {   //for bottom right
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+            }
+            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 8;
+
+            pu1_left_tmp = pu1_src_left_cpy2;
+            pu1_src_left_cpy2 = pu1_src_left_str2;
+            pu1_src_left_str2 = pu1_left_tmp;
+
+            pu1_src_left_cpy = pu1_src_left_cpy2;
+            pu1_src_left_str = pu1_src_left_str2;
+
+        }
+        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
+        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
+        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
+        pu1_src_left[0] = au1_src_left_tmp[0];
+        for(row = 1; row < ht_tmp; row++)
+        {
+            pu1_src_left[row] = pu1_src_left_cpy[row];
+        }
+    }
+
+}
+
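+/* The chroma variant below works on interleaved U/V samples: every
+ * horizontal step moves in 2-byte (one U/V pair) units and the left/top
+ * arrays carry two bytes per row. The U and V offset tables are packed into
+ * one register and selected per byte with _mm_shuffle_epi8; see the
+ * chroma_offset_8x16b note inside the function. */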
+void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
+                                               WORD32 src_strd,
+                                               UWORD8 *pu1_src_left,
+                                               UWORD8 *pu1_src_top,
+                                               UWORD8 *pu1_src_top_left,
+                                               UWORD8 *pu1_src_top_right,
+                                               UWORD8 *pu1_src_bot_left,
+                                               UWORD8 *pu1_avail,
+                                               WORD8 *pi1_sao_offset_u,
+                                               WORD8 *pi1_sao_offset_v,
+                                               WORD32 wd,
+                                               WORD32 ht)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+    UWORD8 *pu1_src_cpy, *pu1_src_org;
+    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
+    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+    WORD32 wd_rem;
+    UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
+    WORD32 ht_tmp;
+    WORD32 bit_depth;
+    UWORD8 u1_avail0, u1_avail1;
+
+    __m128i src_top_16x8b, src_bottom_16x8b;
+    __m128i src_temp0_16x8b, src_temp1_16x8b;
+    __m128i signup0_16x8b, signdwn1_16x8b;
+    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+    __m128i edge0_16x8b, edge1_16x8b;
+    __m128i au1_mask8x16b;
+    __m128i edge_idx_8x16b, sao_offset_8x16b;
+    __m128i left_store_16x8b;
+    __m128i const0_16x8b, const2_16x8b;
+    __m128i chroma_offset_8x16b;
+
+    ht_tmp = ht;
+    au1_mask8x16b = _mm_set1_epi8(0xff);
+
+
+    au1_src_left_tmp[0] = pu1_src[(wd - 2)];
+    au1_src_left_tmp[1] = pu1_src[(wd - 1)];
+    //manipulation for bottom left
+    for(row = 2; row < 2 * ht; row++)
+    {
+        au1_src_left_tmp[row] = pu1_src_left[row];
+    }
+    au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
+    au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];
+
+    pu1_src_top_left[0] = pu1_src_top[wd - 2];
+    pu1_src_top_left[1] = pu1_src_top[wd - 1];
+    //setting availability mask to 0xFF for size MAX_CTB_SIZE
+    for(col = 0; col < MAX_CTB_SIZE; col += 16)
+        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+    bit_depth = BIT_DEPTH_LUMA;
+    pu1_src_org = pu1_src;
+    pu1_src_top_cpy = pu1_src_top;
+    pu1_src_left_cpy2 = au1_src_left_tmp;
+    pu1_src_left_cpy = au1_src_left_tmp;
+    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
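+    /* sao_offset_8x16b will hold the U offsets in its low 8 bytes and the V
+     * offsets in its high 8 bytes (combined by the _mm_unpacklo_epi64()
+     * below); 0x0800 per 16-bit lane adds 8 to every odd (V) byte index, so
+     * the offset shuffle reads from the V half for V samples. */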
+    /* If top-right is available, process separately */
+    if(0 != pu1_avail[5])
+    {
+        WORD32 edge_idx;
+
+        /* U */
+        edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
+                        SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+        }
+
+        /* V */
+        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
+                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+        }
+    }
+    else
+    {
+        u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+        u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+    }
+
+    /* If bottom-left is available, process separately */
+    if(0 != pu1_avail[6])
+    {
+        WORD32 edge_idx;
+
+        /* U */
+        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
+                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+        }
+
+        /* V */
+        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
+                        SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
+
+        edge_idx = gi1_table_edge_idx[edge_idx];
+
+        if(0 != edge_idx)
+        {
+            u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+        }
+        else
+        {
+            u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+        }
+    }
+    else
+    {
+        u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+        u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+    }
+
+
+
+    /* Update height and source pointers based on the availability flags */
+    if(0 == pu1_avail[2])
+    {
+        pu1_src_left_cpy2 += 2;
+        pu1_src_top_cpy = pu1_src;
+        pu1_src += src_strd;
+        ht--;
+    }
+    if(0 == pu1_avail[3])
+    {
+        ht--;
+    }
+
+    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+    const2_16x8b = _mm_set1_epi8(2);
+    const0_16x8b = _mm_setzero_si128();
+
+
+    //availability mask creation
+    u1_avail0 = pu1_avail[0];
+    u1_avail1 = pu1_avail[1];
+    au1_mask[0] = u1_avail0;
+    au1_mask[1] = u1_avail0;
+    au1_mask[wd - 1] = u1_avail1;
+    au1_mask[wd - 2] = u1_avail1;
+    {
+        WORD32 ht_rem;
+        au1_mask_cpy = au1_mask;
+        for(col = wd; col >= 16; col -= 16)
+        {
+            pu1_src_cpy = pu1_src;
+            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
+            //row = 0
+            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+
+            //loading the mask
+            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+            pu1_src_left_cpy = pu1_src_left_cpy2;
+
+            for(row = ht; row >= 2; row -= 2)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+                //row = 1
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                //to insert left in row 1
+                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+                // row = 0 right
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
+
+                //manipulation for row 1 - row 0
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+                //combining sign-left and sign_right
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+                //row1-row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+                // row = 2
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+                // row = 1 right
+                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+                //bottom - row1
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //for the next iteration bottom -row1
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //to insert left in row 1
+                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+                //manipulation for row 1 - bottom
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+                //row1 -bottom
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //combining sign-left and sign_right
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+                //eliminating old left for row 0 and row 1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+                //row 1: storing its right edge as left for the next block
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
+                //row 0: storing its right edge as left for the next block
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+                //copying the next top
+                src_top_16x8b = src_temp1_16x8b;
+
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+                //store left boundary
+                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+                pu1_src_left_cpy += 4;
+            }
+            ht_rem = ht & 0x1;
+
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                //to insert left in row 1
+                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+                //manipulation for row 1 - row 0
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+                //current row -next row
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom and constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                //eliminating old left for row 0 and row 1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+                //row 0: storing its right edge as left for the next block
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+                //copying the next top
+                src_top_16x8b = src_temp0_16x8b;
+
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                //store left boundary
+                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+
+                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                pu1_src_cpy += (src_strd);
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_left_cpy += 2;
+            }
+            {   //for bottom right
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+            }
+            //for the top left of next part of the block
+            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+            //updating top flag
+            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 16;
+            au1_mask_cpy += 16;
+        }
+        pu1_src_left_cpy = pu1_src_left_cpy2;
+        wd_rem = wd & 0xF;
+        if(wd_rem)
+        {
+            pu1_src_cpy = pu1_src;
+            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
+            //row = 0
+            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the availability mask
+            //separating +ve and -ve values.
+            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+            //creating mask 00 for +ve and -ve values and FF for zero.
+            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+            //preparing au1_mask
+            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+            //combining the appropriate sign change
+            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+            pu1_src_left_cpy = pu1_src_left_cpy2;
+            for(row = ht; row >= 4; row -= 4)
+            {
+                left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+                //manipulation for row 0 -row 1
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
+                //row 1 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+                //row 0 -row1
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //manipulation for row 1 - row 0
+                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row 1 -row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row1-row0
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //manipulation for row 1 -row 2
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
+                //row 2 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+                //packing row 0 and row 1
+                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+                //row1 -row2
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+                //row 1 right
+                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+                //row = 3
+                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+                // row = 4
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+                //separating +ve and -ve values (2,1)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //row 2 right
+                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+                //combining the appropriate sign change
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+                //separating +ve and -ve values (3,2)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //manipulation for row 2 -row 3
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
+                //row 3 left
+                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+                //separating +ve and -ve values (2,3)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+                //manipulation for row 3 -bottom
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 6);
+                //bottom left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                //separating +ve and -ve values (3,bottom)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+
+                //eliminating old left for row 0,1,2,3
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
+                //packing row 2 and row 3
+                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+                //row 3 right
+                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
+                //loading row 3 right into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
+                //adding bottom and top values of row 2 and row 3
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+                //separating +ve and -ve values (bottom,3)
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+                //to store right of row 2
+                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+                //storing right of row 2 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+                //to store right of row 0
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                //storing right of row 1 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+                //storing right of row 0 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
+                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                //row = 2
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+                // row = 3
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+                src_temp0_16x8b = src_temp1_16x8b;
+                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+                pu1_src_cpy += (src_strd << 2);
+                pu1_src_left_cpy += 8;
+            }
+            ht_rem = ht & 0x2;
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+                // row = 2
+                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //manipulation for row 0 -row 1
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
+                //bottom left
+                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //manipulation for row 1 - row 0
+                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //row1-row0
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //manipulation for row 1 -bottom
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
+                //bottom left
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+                //row1 -bottom
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+                //manipulation for bottom - row 1 (row 1 right)
+                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+                //adding the up and down sign differences
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                //bottom - row 1
+                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+                //eliminating old left for row 0,1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //for the next iteration signup0_16x8b
+                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+                //storing right of row 1 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+                //for storing right of row 1
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+                src_top_16x8b = src_temp1_16x8b;
+                //storing right of row 0 into left
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+                //the next top is already in src_top_16x8b
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                // row = 1
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_cpy += (src_strd << 1);
+                pu1_src_left_cpy += 4;
+            }
+            ht_rem = ht & 0x1;
+            if(ht_rem)
+            {
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+                //manipulation for row 0 -bottom
+                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
+                //bottom left
+                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+                //separating +ve and -ve values.
+                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+                //creating mask 00 for +ve and -ve values and FF for zero.
+                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+                //combining the appropriate sign change
+                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding the up and down sign differences
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+                //for row 0 right to put into left store
+                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                //adding constant 2
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+                //left store manipulation 1
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+                //filling the left boundary value
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+                src_top_16x8b = src_temp0_16x8b;
+
+                //shuffle to get sao index
+                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+                //using availability mask
+                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+                //adding chroma offset to access U and V
+                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+                //shuffle to get sao offset
+                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, add, then saturating pack
+                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+                pu1_src_cpy += (src_strd);
+                src_temp0_16x8b = src_bottom_16x8b;
+                pu1_src_left_cpy += 2;
+            }
+            {   //for bottom right
+                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+            }
+            if(0 == pu1_avail[3])
+            {
+                src_top_16x8b = src_bottom_16x8b;
+            }
+
+            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+            pu1_src += 8;
+        }
+        pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
+        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
+        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
+        pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
+        for(row = 0; row < 2 * ht_tmp; row++)
+        {
+            pu1_src_left[row] = au1_src_left_tmp[row];
+        }
+    }
+
+}
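+
+/* Editorial sketch (not part of the original change): the scalar SAO
+ * edge-offset computation that the intrinsics above vectorize.  The
+ * subs_epu8/cmpeq_epi8 pairs compute sign(a - b) branchlessly, and the
+ * widen/add/packus_epi16 sequences implement the final clip to [0, 255].
+ * Types are from ihevc_typedefs.h; the function names and edge-index remap
+ * values below are illustrative, and the per-plane U/V offset selection
+ * (done via chroma_offset_8x16b above) is folded into the offset pointer. */
+#if 0
+/* sign(a - b) for unsigned bytes: subs_epu8(a, b) is zero iff a <= b, so
+   comparing both saturated differences against zero yields -1/0 masks whose
+   difference is exactly the sign */
+static WORD32 sao_sign(UWORD8 a, UWORD8 b)
+{
+    WORD32 gt_mask = ((UWORD8)((a <= b) ? 0 : (a - b)) == 0) ? -1 : 0; /* subs_epu8(a, b) == 0 */
+    WORD32 lt_mask = ((UWORD8)((b <= a) ? 0 : (b - a)) == 0) ? -1 : 0; /* subs_epu8(b, a) == 0 */
+    return gt_mask - lt_mask; /* -1, 0 or +1 */
+}
+
+/* one chroma sample: cur with its two neighbours along the SAO direction */
+static UWORD8 sao_edge_pixel(UWORD8 cur, UWORD8 nbr_up, UWORD8 nbr_dn,
+                             const WORD8 *pi1_sao_offset, UWORD8 avail)
+{
+    const UWORD8 au1_edge_idx[5] = { 1, 2, 0, 3, 4 }; /* illustrative remap */
+    WORD32 idx = 2 + sao_sign(cur, nbr_up) + sao_sign(cur, nbr_dn);
+    WORD32 tmp;
+    idx = au1_edge_idx[idx] & avail; /* availability mask zeroes the index at borders */
+    tmp = cur + pi1_sao_offset[idx];
+    return (UWORD8)((tmp < 0) ? 0 : ((tmp > 255) ? 255 : tmp));
+}
+#endif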
diff --git a/common/x86/ihevc_tables_x86_intr.c b/common/x86/ihevc_tables_x86_intr.c
new file mode 100644
index 0000000..0fc3de2
--- /dev/null
+++ b/common/x86/ihevc_tables_x86_intr.c
@@ -0,0 +1,120 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_tables_x86_intr.c
+*
+* @brief
+*  Contains table definitions used by the x86 intrinsics for intra prediction
+* and deblocking
+*
+*
+* @author
+*  Rishab
+*
+* @par List of Functions:
+*  None
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_tables_x86_intr.h"
+
+// LUMA INTRA PRED
+const UWORD8 IHEVCE_SHUFFLEMASKY1[16] = { 0x03, 0x02, 0x01, 0x00,
+    0x02, 0x03, 0x03, 0x04,
+    0x08, 0x08, 0x08, 0x08,
+    0x08, 0x08, 0x08, 0x08 };
+
+const UWORD8 IHEVCE_SHUFFLEMASKY2[16] = { 0x07, 0x06, 0x05, 0x04,
+    0x03, 0x02, 0x01, 0x00,
+    0x08, 0x08, 0x08, 0x08,
+    0x08, 0x08, 0x08, 0x08 };
+
+const UWORD8 IHEVCE_SHUFFLEMASKY3[16] = { 0x0f, 0x0e, 0x0d, 0x0c,
+    0x0b, 0x0a, 0x09, 0x08,
+    0x07, 0x06, 0x05, 0x04,
+    0x03, 0x02, 0x01, 0x00 };
+
+const UWORD8 IHEVCE_SHUFFLEMASK4[16] = { 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00 };
+
+const UWORD8 IHEVCE_SHUFFLEMASK5[16] = { 0x00, 0x01, 0x08, 0x09,
+    0x0f, 0x0f, 0x0f, 0x0f,
+    0x0f, 0x0f, 0x0f, 0x0f,
+    0x0f, 0x0f, 0x0f, 0x0f };
+// CHROMA INTRA PRED
+const UWORD8 IHEVCE_SHUFFLEMASKY7[16] = { 0x06, 0x07, 0x04, 0x05,
+    0x02, 0x03, 0x00, 0x01,
+    0x08, 0x08, 0x08, 0x08,
+    0x08, 0x08, 0x08, 0x08 };
+
+const UWORD8 IHEVCE_SHUFFLEMASKY8[16] = { 0x0e, 0x0f, 0x0c, 0x0d,
+    0x0a, 0x0b, 0x08, 0x09,
+    0x06, 0x07, 0x04, 0x05,
+    0x02, 0x03, 0x00, 0x01 };
+
+const UWORD8 IHEVCE_SHUFFLEMASKY9[16] = { 0x00, 0x01, 0x04, 0x05,
+    0x08, 0x09, 0x0c, 0x0d,
+    0x02, 0x03, 0x06, 0x07,
+    0x0a, 0x0b, 0x0e, 0x0f };
+
+const UWORD8 IHEVCE_SHUFFLEMASKY11[16] = { 0x01, 0x00, 0x02, 0x01,
+    0x03, 0x02, 0x04, 0x03,
+    0x05, 0x04, 0x06, 0x05,
+    0x07, 0x06, 0x08, 0x07 };
+//INTRAPRED
+const UWORD8 inv_angle_shuffle[7][32] =
+{
+    { 3, 0x80, 0x80, 0x80, 1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0x80, 0x80, 0x80, 0, 1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15 },
+    { 6, 0x80, 0x80, 0x80, 0x80, 0x80, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1, 2, 4, 5, 7, 8, 10, 11, 13, 14 },
+    { 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2, 4, 6, 8, 9, 11, 13, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1, 3, 5, 7, 8, 10, 12, 14 },
+    { 10, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2, 5, 7, 10, 12, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1, 4, 6, 9, 11, 14 },
+    { 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 4, 7, 11, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2, 5, 9, 12 },
+    { 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 6, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 3, 10 },
+    { 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0 }
+};
+
+// DEBLOCK TABLES
+const WORD8 coef_d[16] = { 0, 1, -2, 1, 1, -2, 1, 0, 0, 1, -2, 1, 1, -2, 1, 0 };
+const WORD8 coef_de1[16] = { 3, -9, 9, -3, 3, -9, 9, -3, 3, -9, 9, -3, 3, -9, 9, -3 };
+const WORD8 coef_dep1[16] = { -2, 1, 1, -2, -2, 1, 1, -2, -2, 1, 1, -2, -2, 1, 1, -2 };
+const WORD32 shuffle_d[4] = { 0x80800403, 0x80800c0b, 0x03000704, 0x0b080f0c };
+const WORD32 shuffle0[2] = { 0x80098001, 0x800e8006 };
+const WORD32 shuffle1[4] = { 0x05040100, 0x0d0c0908, 0x07060302, 0x0f0e0b0a };
+const WORD32 shuffle2[4] = { 0x80808080, 0x03020100, 0x07060504, 0x80808080 };
+const WORD32 shuffle3[4] = { 0x80808080, 0x0b0a0908, 0x0f0e0d0c, 0x80808080 };
+
+const WORD8 delta0[16] =  { 1, -4, 1, -4, 1, -4, 1, -4, 1, -4, 1, -4, 1, -4, 1, -4 };
+const WORD8 delta1[16] =  { 4, -1, 4, -1, 4, -1, 4, -1, 4, -1, 4, -1, 4, -1, 4, -1 };
+const WORD32 shuffle_uv[4] = { 0x03010200, 0x0b090a08, 0x07050604, 0x0f0d0e0c };
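+
+/* Editorial sketch (not part of the original change): how byte-shuffle tables
+ * like the ones above are typically consumed.  _mm_shuffle_epi8 copies source
+ * byte (ctl[i] & 0x0F) into output lane i and writes zero whenever ctl[i] has
+ * its high bit set, which is why the unused slots above are 0x80.  Each
+ * 32-byte inv_angle_shuffle row holds two such 16-byte controls.  The helper
+ * name below is hypothetical. */
+#if 0
+#include <tmmintrin.h>
+static __m128i apply_byte_shuffle(__m128i src, const UWORD8 *pu1_ctl)
+{
+    __m128i ctl = _mm_loadu_si128((const __m128i *)pu1_ctl);
+    return _mm_shuffle_epi8(src, ctl); /* e.g. pu1_ctl = inv_angle_shuffle[k] */
+}
+#endif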
diff --git a/common/x86/ihevc_weighted_pred_sse42_intr.c b/common/x86/ihevc_weighted_pred_sse42_intr.c
new file mode 100644
index 0000000..94a3f6d
--- /dev/null
+++ b/common/x86/ihevc_weighted_pred_sse42_intr.c
@@ -0,0 +1,2115 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_weighted_pred_sse42_intr.c
+*
+* @brief
+*  Contains function definitions for weighted prediction used in inter
+* prediction
+*
+* @author
+*
+*
+* @par List of Functions:
+*   - ihevc_weighted_pred_uni_sse42()
+*   - ihevc_weighted_pred_bi_sse42()
+*   - ihevc_weighted_pred_bi_default_sse42()
+*   - ihevc_weighted_pred_chroma_uni_sse42()
+*   - ihevc_weighted_pred_chroma_bi_sse42()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <assert.h>
+
+#include "ihevc_debug.h"
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_weighted_pred.h"
+#include "ihevc_inter_pred.h"
+
+#include <immintrin.h>
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does uni-weighted prediction on the array pointed to by pi2_src and stores
+* the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) + off0
+* (see the scalar reference sketch after this function)
+*
+* @param[in] pi2_src
+*  Pointer to the source
+*
+* @param[out] pu1_dst
+*  Pointer to the destination
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight to be multiplied to the source
+*
+* @param[in] off0
+*  offset to be added after rounding and shifting
+*
+* @param[in] shift
+*  (14 Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_uni_sse42(WORD16 *pi2_src,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 src_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 wgt0,
+                                   WORD32 off0,
+                                   WORD32 shift,
+                                   WORD32 lvl_shift,
+                                   WORD32 ht,
+                                   WORD32 wd)
+{
+    WORD32 row, col, temp;
+    WORD32 dst0, dst1, dst2, dst3;
+
+    /* all 128 bit registers are named with a suffix mxnb, where m is the */
+    /* number of n bits packed in the register                            */
+    __m128i src_temp0_4x32b, src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b;
+    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;
+
+    ASSERT(wd % 4 == 0); /* checking assumption*/
+    ASSERT(ht % 4 == 0); /* checking assumption*/
+
+    temp = 1 << (shift - 1);
+
+    // setting values in registers
+    const_temp_4x32b = _mm_set1_epi32(temp);
+    lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
+    wgt0_4x32b = _mm_set1_epi32(wgt0);
+    off0_4x32b = _mm_set1_epi32(off0);
+
+    if(0 == (wd & 7)) /* wd multiple of 8 case */
+    {
+        __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
+
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 4)
+        {
+            for(col = 0; col < wd; col += 8)
+            {   /* for row =0 ,1,2,3*/
+
+                /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
+                /* row = 1 */
+                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+                /* row = 2 */
+                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
+                /* row = 3 */
+                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
+
+                /* row = 0 */ /* Last 4 pixels */
+                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
+                /* row = 1 */
+                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
+                /* row = 2 */
+                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 4));
+                /* row = 3 */
+                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 4));
+
+                /* sign extend 16 bit to 32 bit */ /* First 4 pixels */
+                src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
+                src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
+                src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
+                src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
+
+                /* (pi2_src[col] + lvl_shift)*/ /* First 4 pixels */
+                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
+                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
+
+                /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* First 4 pixels */
+                src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
+                src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+                src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
+                src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+
+                /* sign extend 16 bit to 32 bit */ /* Last 4 pixels */
+                src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
+                src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
+                src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
+                src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
+
+                /* (pi2_src[col] + lvl_shift)*/ /* Last 4 pixels */
+                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
+                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
+                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
+                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
+
+                /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* Last 4 pixels */
+                src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
+                src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
+                src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
+                src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
+
+                /* i4_tmp += 1 << (shift - 1) */ /* First 4 pixels */
+                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+
+                /* (i4_tmp >> shift) */ /* First 4 pixels */
+                src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
+                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+                src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
+                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+                /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
+                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
+                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
+                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
+
+                /* (i4_tmp >> shift) */ /* Last 4 pixels */
+                src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
+                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
+                src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
+                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
+
+                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
+                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
+                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
+
+                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
+                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
+                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
+                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);
+
+                src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp4_4x32b);
+                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
+                src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp6_4x32b);
+                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
+                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
+                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+                src_temp2_4x32b = _mm_packus_epi16(src_temp2_4x32b, src_temp2_4x32b);
+                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
+
+                /* store eight 8-bit output values  */
+                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp2_4x32b); /* row = 2*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp3_4x32b); /* row = 3*/
+
+                /* To update pointer */
+                pi2_src += 8;
+                pu1_dst += 8;
+
+            } /* inner loop ends here(4-output values in single iteration) */
+
+            pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
+            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+        }
+    }
+    else  /* wd multiple of 4 case */
+    {
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 4)
+        {
+            for(col = 0; col < wd; col += 4)
+            {   /* for row =0 ,1,2,3*/
+
+                /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
+                /* row = 1 */
+                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+                /* row = 2 */
+                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
+                /* row = 3 */
+                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
+
+                /* sign extend 16 bit to 32 bit */
+                src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
+                src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
+                src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
+                src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
+
+                /* (pi2_src[col] + lvl_shift)*/
+                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
+                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
+
+                /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
+                src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+                src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
+                src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+
+                /* i4_tmp += 1 << (shift - 1) */
+                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+
+                /* (i4_tmp >> shift) */
+                src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
+                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+                src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
+                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+                /*i4_tmp = (i4_tmp >> shift) + off0; */
+                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
+                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
+
+                src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
+                src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp3_4x32b);
+
+                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp2_4x32b);
+
+                dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
+                /* dst row = 1 to 3 */
+                src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
+                src_temp2_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 2);
+                src_temp3_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 3);
+
+                /* store four 8-bit output values  */
+                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
+                dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
+                dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);
+
+                /* row = 1 to row = 3 */
+                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+                *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
+                *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
+
+                /* To update pointer */
+                pi2_src += 4;
+                pu1_dst += 4;
+
+            } /* inner loop ends here(4-output values in single iteration) */
+
+            pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
+            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+        }
+    }
+}
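+
+/* Editorial sketch (not part of the original change): a scalar reference for
+ * the computation vectorized above, following the formula in the function
+ * header.  The function name is hypothetical and CLIP_U8 is assumed to come
+ * from ihevc_macros.h. */
+#if 0
+static void ihevc_weighted_pred_uni_ref(WORD16 *pi2_src, UWORD8 *pu1_dst,
+                                        WORD32 src_strd, WORD32 dst_strd,
+                                        WORD32 wgt0, WORD32 off0, WORD32 shift,
+                                        WORD32 lvl_shift, WORD32 ht, WORD32 wd)
+{
+    WORD32 row, col;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            WORD32 i4_tmp = (pi2_src[col] + lvl_shift) * wgt0; /* weight */
+            i4_tmp += 1 << (shift - 1);                        /* round   */
+            i4_tmp = (i4_tmp >> shift) + off0;                 /* shift, offset */
+            pu1_dst[col] = CLIP_U8(i4_tmp);                    /* clip to 8 bit */
+        }
+        pi2_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+#endif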
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma uni-weighted prediction on the array pointed to by pi2_src and
+* stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) + off0,
+* applied alternately with the Cb and Cr weights (see the scalar sketch below)
+*
+* @param[in] pi2_src
+*  Pointer to the source
+*
+* @param[out] pu1_dst
+*  Pointer to the destination
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight to be multiplied to the source
+*
+* @param[in] off0
+*  offset to be added after rounding and shifting
+*
+* @param[in] shift
+*  (14 Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
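+
+/* Editorial sketch (not part of the original change): a scalar reference for
+ * the interleaved Cb/Cr computation below.  The function name is hypothetical
+ * and CLIP_U8 is assumed to come from ihevc_macros.h. */
+#if 0
+static void ihevc_weighted_pred_chroma_uni_ref(WORD16 *pi2_src, UWORD8 *pu1_dst,
+                                               WORD32 src_strd, WORD32 dst_strd,
+                                               WORD32 wgt0_cb, WORD32 wgt0_cr,
+                                               WORD32 off0_cb, WORD32 off0_cr,
+                                               WORD32 shift, WORD32 lvl_shift,
+                                               WORD32 ht, WORD32 wd)
+{
+    WORD32 row, col;
+    for(row = 0; row < ht; row++)
+    {
+        /* wd counts per-component pixels; the buffer interleaves Cb and Cr */
+        for(col = 0; col < 2 * wd; col += 2)
+        {
+            WORD32 i4_tmp = (pi2_src[col] + lvl_shift) * wgt0_cb;
+            i4_tmp += 1 << (shift - 1);
+            pu1_dst[col] = CLIP_U8((i4_tmp >> shift) + off0_cb);
+
+            i4_tmp = (pi2_src[col + 1] + lvl_shift) * wgt0_cr;
+            i4_tmp += 1 << (shift - 1);
+            pu1_dst[col + 1] = CLIP_U8((i4_tmp >> shift) + off0_cr);
+        }
+        pi2_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+#endif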
+
+void ihevc_weighted_pred_chroma_uni_sse42(WORD16 *pi2_src,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 src_strd,
+                                          WORD32 dst_strd,
+                                          WORD32 wgt0_cb,
+                                          WORD32 wgt0_cr,
+                                          WORD32 off0_cb,
+                                          WORD32 off0_cr,
+                                          WORD32 shift,
+                                          WORD32 lvl_shift,
+                                          WORD32 ht,
+                                          WORD32 wd)
+{
+    WORD32 row, col, temp, wdx2;
+    /* All 128-bit registers are named with a suffix mxnb, where m is the */
+    /* number of n-bit values packed in the register                      */
+
+    __m128i src_temp0_4x32b, src_temp1_4x32b;
+    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;
+
+    ASSERT(wd % 2 == 0); /* checking assumption */
+    ASSERT(ht % 2 == 0); /* checking assumption */
+
+    temp = 1 << (shift - 1);
+    wdx2 = 2 * wd;
+
+    // setting values in registers
+    const_temp_4x32b = _mm_set1_epi32(temp);
+    lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
+    wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
+    off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);
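+
+    /* Chroma samples are stored interleaved (Cb, Cr, Cb, Cr, ...), so the
+     * weight and offset registers are built with the lane pattern
+     * {cb, cr, cb, cr}: each 32-bit lane then applies to the matching
+     * colour component of the loaded pixels. This is also why the loops
+     * below iterate over wdx2 = 2 * wd columns. */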
+
+#if 0 /* Enable this for the ht % 4 == 0 case: it degraded performance for smaller sizes while improving it for larger ones. */
+    if( 0 == (ht & 3)) /* ht multiple of 4 case */
+    {
+        if( 0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
+        {
+            __m128i src_temp2_4x32b, src_temp3_4x32b;
+            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
+            __m128i src_temp8_4x32b, src_temp9_4x32b, src_temp10_4x32b, src_temp11_4x32b;
+            __m128i src_temp12_4x32b, src_temp13_4x32b, src_temp14_4x32b, src_temp15_4x32b;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row +=4)
+            {
+                for(col = 0; col < wdx2; col +=16)
+                {
+                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
+                    /* row = 1 */
+                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
+                    /* row = 0 */ /* Second 4 pixels */
+                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
+                    /* row = 1 */
+                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
+                    /* row = 0 */ /* Third 4 pixels */
+                    src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+8));
+                    /* row = 1 */
+                    src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+8));
+                    /* row = 0 */ /* Last 4 pixels */
+                    src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+12));
+                    /* row = 1 */
+                    src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+12));
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */
+                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
+                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
+                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
+                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
+                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
+                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
+                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
+                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
+                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp4_4x32b  = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
+                    src_temp5_4x32b  = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
+                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
+                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
+                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp6_4x32b  = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
+                    src_temp7_4x32b  = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);
+
+                    /* i4_tmp += 1 << (shift - 1) */
+                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
+                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
+                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
+                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
+                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
+                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
+                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
+                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
+                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);
+
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
+                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
+                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
+                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
+                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);
+
+                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp2_4x32b);
+                    src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp3_4x32b);
+                    src_temp4_4x32b = _mm_packs_epi32 (src_temp4_4x32b, src_temp6_4x32b);
+                    src_temp5_4x32b = _mm_packs_epi32 (src_temp5_4x32b, src_temp7_4x32b);
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp4_4x32b);
+                    src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp5_4x32b);
+
+                    /* store 16 8-bit output values  */
+                    _mm_storeu_si128((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/
+                    _mm_storeu_si128((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/
+
+                    /* row = 2 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src_temp8_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
+                    /* row = 3 */
+                    src_temp9_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
+                    /* row = 2 */ /* Second 4 pixels */
+                    src_temp10_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
+                    /* row = 3 */
+                    src_temp11_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));
+                    /* row = 2 */ /* Third 4 pixels */
+                    src_temp12_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+8));
+                    /* row = 3 */
+                    src_temp13_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+8));
+                    /* row = 2 */ /* Last 4 pixels */
+                    src_temp14_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+12));
+                    /* row = 3 */
+                    src_temp15_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+12));
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */
+                    src_temp8_4x32b  = _mm_cvtepi16_epi32(src_temp8_4x32b);
+                    src_temp9_4x32b  = _mm_cvtepi16_epi32(src_temp9_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, lvl_shift_4x32b);
+                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp8_4x32b  = _mm_mullo_epi32 (src_temp8_4x32b, wgt0_4x32b);
+                    src_temp9_4x32b  = _mm_mullo_epi32 (src_temp9_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
+                    src_temp10_4x32b  = _mm_cvtepi16_epi32(src_temp10_4x32b);
+                    src_temp11_4x32b  = _mm_cvtepi16_epi32(src_temp11_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, lvl_shift_4x32b);
+                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp10_4x32b  = _mm_mullo_epi32 (src_temp10_4x32b, wgt0_4x32b);
+                    src_temp11_4x32b  = _mm_mullo_epi32 (src_temp11_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
+                    src_temp12_4x32b  = _mm_cvtepi16_epi32(src_temp12_4x32b);
+                    src_temp13_4x32b  = _mm_cvtepi16_epi32(src_temp13_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, lvl_shift_4x32b);
+                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp12_4x32b  = _mm_mullo_epi32 (src_temp12_4x32b, wgt0_4x32b);
+                    src_temp13_4x32b  = _mm_mullo_epi32 (src_temp13_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
+                    src_temp14_4x32b  = _mm_cvtepi16_epi32(src_temp14_4x32b);
+                    src_temp15_4x32b  = _mm_cvtepi16_epi32(src_temp15_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, lvl_shift_4x32b);
+                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp14_4x32b  = _mm_mullo_epi32 (src_temp14_4x32b, wgt0_4x32b);
+                    src_temp15_4x32b  = _mm_mullo_epi32 (src_temp15_4x32b, wgt0_4x32b);
+
+                    /* i4_tmp += 1 << (shift - 1) */
+                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, const_temp_4x32b);
+                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp8_4x32b = _mm_srai_epi32(src_temp8_4x32b,  shift);
+                    src_temp9_4x32b = _mm_srai_epi32(src_temp9_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
+                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, const_temp_4x32b);
+                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp10_4x32b = _mm_srai_epi32(src_temp10_4x32b,  shift);
+                    src_temp11_4x32b = _mm_srai_epi32(src_temp11_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
+                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, const_temp_4x32b);
+                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp12_4x32b = _mm_srai_epi32(src_temp12_4x32b,  shift);
+                    src_temp13_4x32b = _mm_srai_epi32(src_temp13_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, const_temp_4x32b);
+                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp14_4x32b = _mm_srai_epi32(src_temp14_4x32b,  shift);
+                    src_temp15_4x32b = _mm_srai_epi32(src_temp15_4x32b,  shift);
+
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, off0_4x32b);
+                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
+                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, off0_4x32b);
+                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
+                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, off0_4x32b);
+                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, off0_4x32b);
+                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, off0_4x32b);
+
+                    src_temp8_4x32b = _mm_packs_epi32 (src_temp8_4x32b, src_temp10_4x32b);
+                    src_temp9_4x32b = _mm_packs_epi32 (src_temp9_4x32b, src_temp11_4x32b);
+                    src_temp12_4x32b = _mm_packs_epi32 (src_temp12_4x32b, src_temp14_4x32b);
+                    src_temp13_4x32b = _mm_packs_epi32 (src_temp13_4x32b, src_temp15_4x32b);
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                    src_temp8_4x32b = _mm_packus_epi16 (src_temp8_4x32b, src_temp12_4x32b);
+                    src_temp9_4x32b = _mm_packus_epi16 (src_temp9_4x32b, src_temp13_4x32b);
+
+                    /* store 16 8-bit output values  */
+                    _mm_storeu_si128((__m128i*)(pu1_dst+2*dst_strd), src_temp8_4x32b); /* row = 2*/
+                    _mm_storeu_si128((__m128i*)(pu1_dst+3*dst_strd), src_temp9_4x32b); /* row = 3*/
+
+                    pi2_src += 16;  /* Pointer update */
+                    pu1_dst += 16; /* Pointer update */
+
+                } /* inner loop ends here (16 columns per iteration) */
+                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
+                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
+            }
+        }
+        else if( 0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
+        {
+            __m128i src_temp2_4x32b,src_temp3_4x32b;
+            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row +=4)
+            {
+                for(col = 0; col < wdx2; col +=8)
+                {
+                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
+                    /* row = 1 */
+                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
+                    /* row = 2 */
+                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
+                    /* row = 3 */
+                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
+
+                    /* row = 0 */ /* Last 4 pixels */
+                    src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
+                    /* row = 1 */
+                    src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
+                    /* row = 2 */
+                    src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
+                    /* row = 3 */
+                    src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */
+                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
+                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
+                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
+                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
+                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
+                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */
+                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
+                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
+                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp4_4x32b  = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
+                    src_temp5_4x32b  = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */
+                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
+                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
+                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp6_4x32b  = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
+                    src_temp7_4x32b  = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);
+
+                    /* i4_tmp += 1 << (shift - 1) */
+                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
+                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
+                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */
+                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
+                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
+                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */
+                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
+                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
+                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);
+
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
+                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
+                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);
+
+                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp4_4x32b);
+                    src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp5_4x32b);
+                    src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp6_4x32b);
+                    src_temp3_4x32b = _mm_packs_epi32 (src_temp3_4x32b, src_temp7_4x32b);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp0_4x32b);
+                    src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp1_4x32b);
+                    src_temp2_4x32b = _mm_packus_epi16 (src_temp2_4x32b, src_temp2_4x32b);
+                    src_temp3_4x32b = _mm_packus_epi16 (src_temp3_4x32b, src_temp3_4x32b);
+
+                    /* store eight 8-bit output values  */
+                    _mm_storel_epi64((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/
+                    _mm_storel_epi64((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/
+                    _mm_storel_epi64((__m128i*)(pu1_dst+2*dst_strd), src_temp2_4x32b); /* row = 2*/
+                    _mm_storel_epi64((__m128i*)(pu1_dst+3*dst_strd), src_temp3_4x32b); /* row = 3*/
+
+                    pi2_src += 8;   /* Pointer update */
+                    pu1_dst += 8; /* Pointer update */
+
+                } /* inner loop ends here (8 columns per iteration) */
+                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
+                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
+            }
+        }
+        else /* 2*wd multiple of 4 case */
+        {
+            WORD32 dst0, dst1, dst2, dst3;
+            __m128i src_temp2_4x32b,src_temp3_4x32b;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row +=4)
+            {
+                for(col = 0; col < wdx2; col +=4)
+                {
+                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
+                    /* row = 1 */
+                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+1*src_strd));
+                    /* row = 2 */
+                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
+                    /* row = 3 */
+                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */
+                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
+                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
+                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
+                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
+
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
+                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);
+
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
+                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);
+
+                    /* i4_tmp += 1 << (shift - 1) */
+                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
+                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
+
+                    /* i4_tmp += 1 << (shift - 1) */
+                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
+                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
+
+                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp1_4x32b);
+                    src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp3_4x32b);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp2_4x32b);
+
+                    dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
+                    /* dst row = 1 to 3 */
+                    src_temp1_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 1);
+                    src_temp2_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 2);
+                    src_temp3_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 3);
+
+                    /* store four 8-bit output values  */
+                    *(WORD32 *) (&pu1_dst[0*dst_strd]) = dst0;
+
+                    dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
+                    dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
+                    dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);
+                    /* row = 1 */
+                    *(WORD32 *) (&pu1_dst[1*dst_strd]) = dst1;
+                    /* row = 2 */
+                    *(WORD32 *) (&pu1_dst[2*dst_strd]) = dst2;
+                    /* row = 3 */
+                    *(WORD32 *) (&pu1_dst[3*dst_strd]) = dst3;
+
+                    pi2_src += 4;   /* Pointer update */
+                    pu1_dst += 4; /* Pointer update */
+
+                } /* inner loop ends here (4 columns per iteration) */
+                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
+                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
+            }
+        }
+    }
+    else /* ht multiple of 2 case */
+#endif
+
+    {
+        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
+        {
+            __m128i src_temp2_4x32b, src_temp3_4x32b;
+            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 16)
+                {
+                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
+                    /* row = 1 */
+                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+
+                    /* row = 0 */ /* Second 4 pixels */
+                    src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
+                    /* row = 1 */
+                    src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
+                    /* row = 0 */ /* Third 4 pixels */
+                    src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
+                    /* row = 1 */
+                    src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
+                    /* row = 0 */ /* Last 4 pixels */
+                    src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 12));
+                    /* row = 1 */
+                    src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 12));
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */
+                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
+                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
+                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
+                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
+                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
+                    src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
+                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
+                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
+                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
+                    src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
+                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
+                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
+                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
+                    src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
+
+                    /* i4_tmp += 1 << (shift - 1) */
+                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
+                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
+                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
+                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
+                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
+                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
+                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
+                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
+                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);
+
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
+                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
+                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
+                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
+                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);
+
+                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
+                    src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
+                    src_temp4_4x32b = _mm_packs_epi32(src_temp4_4x32b, src_temp6_4x32b);
+                    src_temp5_4x32b = _mm_packs_epi32(src_temp5_4x32b, src_temp7_4x32b);
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp4_4x32b);
+                    src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp5_4x32b);
+
+                    /* store 16 8-bit output values  */
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
+
+                    pi2_src += 16;  /* Pointer update */
+                    pu1_dst += 16; /* Pointer update */
+
+                } /* inner loop ends here (16 columns per iteration) */
+                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
+                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+            }
+        }
+        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
+        {
+            __m128i src_temp2_4x32b, src_temp3_4x32b;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 8)
+                {
+                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
+                    /* row = 1 */
+                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+
+                    /* row = 0 */ /* Last 4 pixels */
+                    src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
+                    /* row = 1 */
+                    src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */
+                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
+                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
+                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
+                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
+                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
+                    src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+
+                    /* i4_tmp += 1 << (shift - 1) */
+                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
+                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
+
+                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+                    /* (i4_tmp >> shift) */
+                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
+                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
+
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
+                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
+
+                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
+                    src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
+                    src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+
+                    /* store eight 8-bit output values  */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
+
+                    pi2_src += 8;   /* Pointer update */
+                    pu1_dst += 8; /* Pointer update */
+
+                } /* inner loop ends here (8 columns per iteration) */
+                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
+                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+            }
+        }
+        else /* 2*wd multiple of 4 case */
+        {
+            WORD32 dst0, dst1;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 4)
+                {
+                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
+                    /* row = 1 */
+                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+
+                    /* considering pix. 4:0 by converting 16-into 32 bit */
+                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
+                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
+
+                    /* (pi2_src[col] + lvl_shift)*/
+                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
+
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
+                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+
+                    /* i4_tmp += 1 << (shift - 1) */
+                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+
+                    /* (i4_tmp >> shift) */
+                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
+                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
+
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
+                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
+
+                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
+
+                    dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
+                    /* dst row = 1 */
+                    src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
+
+                    /* store four 8-bit output values  */
+                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                    dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
+                    /* row = 1 */
+                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+                    pi2_src += 4;   /* Pointer update */
+                    pu1_dst += 4; /* Pointer update */
+
+                } /* inner loop ends here (4 columns per iteration) */
+                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
+                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+            }
+        }
+    }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
+* ((off0 + off1 + 1) << (shift - 1))) >> shift
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight to be multiplied to source 1
+*
+* @param[in] off0
+*  offset 0
+*
+* @param[in] wgt1
+*  weight to be multiplied to source 2
+*
+* @param[in] off1
+*  offset 1
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
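+/* For reference, a scalar sketch of the bi-weighted per-pixel operation the
+ * SIMD code below vectorizes. The helper name is hypothetical and the body
+ * simply restates the formula documented above; it is not part of the
+ * library:
+ *
+ *     static inline UWORD8 weighted_px_bi(WORD16 src1, WORD16 src2,
+ *                                         WORD32 wgt0, WORD32 wgt1,
+ *                                         WORD32 off0, WORD32 off1,
+ *                                         WORD32 shift,
+ *                                         WORD32 lvl_shift1,
+ *                                         WORD32 lvl_shift2)
+ *     {
+ *         WORD32 i4_tmp = (src1 + lvl_shift1) * wgt0     // weighted source 1
+ *                       + (src2 + lvl_shift2) * wgt1;    // weighted source 2
+ *         i4_tmp += (off0 + off1 + 1) << (shift - 1);    // offsets + rounding
+ *         i4_tmp >>= shift;
+ *         if(i4_tmp < 0)   i4_tmp = 0;                   // CLIP_U8
+ *         if(i4_tmp > 255) i4_tmp = 255;
+ *         return (UWORD8)i4_tmp;
+ *     }
+ */
+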
+void ihevc_weighted_pred_bi_sse42(WORD16 *pi2_src1,
+                                  WORD16 *pi2_src2,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 src_strd1,
+                                  WORD32 src_strd2,
+                                  WORD32 dst_strd,
+                                  WORD32 wgt0,
+                                  WORD32 off0,
+                                  WORD32 wgt1,
+                                  WORD32 off1,
+                                  WORD32 shift,
+                                  WORD32 lvl_shift1,
+                                  WORD32 lvl_shift2,
+                                  WORD32 ht,
+                                  WORD32 wd)
+{
+    WORD32 row, col, temp;
+
+    __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
+    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;
+
+
+    ASSERT(wd % 4 == 0); /* checking assumption */
+    ASSERT(ht % 2 == 0); /* checking assumption */
+
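+    /* off0, off1 and the rounding term are folded into one constant below,
+     * so a single vector add applies all three to each group of pixels. */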
+    temp = (off0 + off1 + 1) << (shift - 1);
+
+    // setting values in registers
+    const_temp_4x32b = _mm_set1_epi32(temp);
+    lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
+    lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
+    wgt0_4x32b = _mm_set1_epi32(wgt0);
+    wgt1_4x32b = _mm_set1_epi32(wgt1);
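+    /* Unlike the chroma kernels, one weight/offset pair applies to every
+     * lane here, so plain _mm_set1_epi32 broadcasts are sufficient. */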
+
+    if(0 == (wd & 7)) /* wd multiple of 8 case */
+    {
+        __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 2)
+        {
+            for(col = 0; col < wd; col += 8)
+            {
+                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+                /* Next 4 pixels */
+                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
+                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
+                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
+                src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */
+
+                /* considering pix. 4:0 by converting 16-into 32 bit */
+                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+                /* (pi2_src1[col] + lvl_shift1) */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
+                /* (pi2_src2[col] + lvl_shift2) */
+                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
+                /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
+                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+                /*(pi2_src2[col] + lvl_shift2) * wgt1 */
+                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
+
+                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
+                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
+                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
+
+                /* Next 4 Pixels */
+                src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
+                src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
+                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
+                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
+                src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
+                src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
+                src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
+                src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
+                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
+                src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
+                src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
+                src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);
+
+                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
+                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+                /* (i4_tmp >> shift) */
+                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
+                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
+
+                /* Next 4 Pixels */
+                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
+                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
+                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
+                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
+                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
+                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);
+
+                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
+                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
+
+                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
+
+                /* store eight 8-bit output values  */
+                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/
+
+                pi2_src1 += 8;  /* Pointer update */
+                pi2_src2 += 8;  /* Pointer update */
+                pu1_dst  += 8;  /* Pointer update */
+
+            } /* inner loop ends here (8 columns per iteration) */
+
+            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
+            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
+            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
+
+        } /* outer loop ends */
+    }
+    else /* wd multiple of 4 case */
+    {
+        WORD32 dst0, dst1;
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 2)
+        {
+            for(col = 0; col < wd; col += 4)
+            {
+                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+                /* consider pixels 3:0 by converting 16-bit to 32-bit */
+                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+                /* (pi2_src1[col] + lvl_shift1) */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
+                /* (pi2_src2[col] + lvl_shift2) */
+                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
+                /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
+                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+                /*(pi2_src2[col] + lvl_shift2) * wgt1 */
+                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
+
+                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
+                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
+                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
+
+                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
+
+                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+
+                /* (i4_tmp >> shift) */
+                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
+                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
+
+                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
+
+                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+
+                dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);
+
+                /* dst row = 1 */
+                src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);
+
+                /* store four 8-bit output values  */
+                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);
+
+                /* row = 1 */
+                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+                pi2_src1 += 4;  /* Pointer update */
+                pi2_src2 += 4;  /* Pointer update */
+                pu1_dst  += 4;  /* Pointer update */
+
+            } /* inner loop ends here(4-output values in single iteration) */
+
+            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
+            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
+            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
+
+        } /* outer loop ends */
+    }
+
+}
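+
+/* A minimal scalar sketch of the per-pixel computation that the SSE4.2 loops
+ * above vectorise. It is illustrative only (not one of the decoder's
+ * dispatched functions) and assumes just the WORD16/UWORD8/WORD32 typedefs
+ * from ihevc_typedefs.h; the clip helper is local so the sketch stays
+ * self-contained. */
+static UWORD8 clip_u8_sketch(WORD32 x)
+{
+    return (UWORD8)((x < 0) ? 0 : ((x > 255) ? 255 : x));
+}
+
+static void weighted_pred_bi_scalar_sketch(WORD16 *pi2_src1, WORD16 *pi2_src2,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd1, WORD32 src_strd2,
+                                           WORD32 dst_strd,
+                                           WORD32 wgt0, WORD32 off0,
+                                           WORD32 wgt1, WORD32 off1,
+                                           WORD32 shift,
+                                           WORD32 lvl_shift1, WORD32 lvl_shift2,
+                                           WORD32 ht, WORD32 wd)
+{
+    WORD32 row, col;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            /* i4_tmp = (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1
+             *        + ((off0 + off1 + 1) << (shift - 1)) */
+            WORD32 i4_tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1) * wgt0
+                          + (pi2_src2[row * src_strd2 + col] + lvl_shift2) * wgt1
+                          + ((off0 + off1 + 1) << (shift - 1));
+            pu1_dst[row * dst_strd + col] = clip_u8_sketch(i4_tmp >> shift);
+        }
+    }
+}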
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma bi-weighted prediction on the arrays pointed by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed by pu1_dst
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
+* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight to be multiplied to source 1
+*
+* @param[in] off0
+*  offset 0
+*
+* @param[in] wgt1
+*  weight to be multiplied to source 2
+*
+* @param[in] off1
+*  offset 1
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_bi_sse42(WORD16 *pi2_src1,
+                                         WORD16 *pi2_src2,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd1,
+                                         WORD32 src_strd2,
+                                         WORD32 dst_strd,
+                                         WORD32 wgt0_cb,
+                                         WORD32 wgt0_cr,
+                                         WORD32 off0_cb,
+                                         WORD32 off0_cr,
+                                         WORD32 wgt1_cb,
+                                         WORD32 wgt1_cr,
+                                         WORD32 off1_cb,
+                                         WORD32 off1_cr,
+                                         WORD32 shift,
+                                         WORD32 lvl_shift1,
+                                         WORD32 lvl_shift2,
+                                         WORD32 ht,
+                                         WORD32 wd)
+{
+    WORD32 row, col, temp1, temp2;
+    WORD32 wdx2;
+
+    __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
+    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;
+
+
+    ASSERT(wd % 2 == 0); /* checking assumption*/
+    ASSERT(ht % 2 == 0); /* checking assumption*/
+
+    temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
+    temp2 = (off0_cr + off1_cr + 1) << (shift - 1);
+
+    // setting values in registers
+    const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
+    lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
+    lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
+    wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
+    wgt1_4x32b = _mm_set_epi32(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);
+
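+    /* chroma samples are interleaved CbCr, so each row carries 2 * wd
+     * samples; the weight/offset vectors above alternate Cb and Cr values
+     * in their lanes to match that interleaving */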
+    wdx2 = wd * 2;
+
+    if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
+    {
+        __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 2)
+        {
+            for(col = 0; col < wdx2; col += 8)
+            {
+                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+                /* Next 4 pixels */
+                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
+                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
+                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
+                src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */
+
+                /* consider pixels 3:0 by converting 16-bit to 32-bit */
+                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+                /* (pi2_src1[col] + lvl_shift1) */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
+                /* (pi2_src2[col] + lvl_shift2) */
+                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
+                /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
+                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+                /*(pi2_src2[col] + lvl_shift2) * wgt1 */
+                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
+
+                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
+                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
+                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
+
+                /* Next 4 Pixels */
+                src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
+                src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
+                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
+                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
+                src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
+                src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
+                src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
+                src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
+                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
+                src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
+                src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
+                src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);
+
+                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
+                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+                /* (i4_tmp >> shift) */
+                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
+                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
+
+                /* Next 4 Pixels */
+                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
+                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
+                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
+                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
+                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
+                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);
+
+                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
+                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
+
+                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
+
+                /* store eight 8-bit output values */
+                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/
+
+                pi2_src1 += 8;  /* Pointer update */
+                pi2_src2 += 8;  /* Pointer update */
+                pu1_dst  += 8;  /* Pointer update */
+
+            } /* inner loop ends here (8 output values per iteration) */
+
+            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
+            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
+            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
+
+        } /* outer loop ends */
+    }
+    else /* wdx2 multiple of 4 case */
+    {
+        WORD32 dst0, dst1;
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 2)
+        {
+            for(col = 0; col < wdx2; col += 4)
+            {
+                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+                /* consider pixels 3:0 by converting 16-bit to 32-bit */
+                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+                /* (pi2_src1[col] + lvl_shift1) */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
+                /* (pi2_src2[col] + lvl_shift2) */
+                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
+                /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
+                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+                /*(pi2_src2[col] + lvl_shift2) * wgt1 */
+                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
+
+                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
+                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
+                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
+
+                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
+
+                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+
+                /* (i4_tmp >> shift) */
+                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
+                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
+
+                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
+
+                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+
+                dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);
+
+                /* dst row = 1 */
+                src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);
+
+                /* store four 8-bit output values  */
+                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);
+
+                /* row = 1 */
+                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+                pi2_src1 += 4;  /* Pointer update */
+                pi2_src2 += 4;  /* Pointer update */
+                pu1_dst  += 4;  /* Pointer update */
+
+            } /* inner loop ends here(4-output values in single iteration) */
+
+            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
+            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
+            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
+        }
+    }
+
+}
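+
+/* Scalar sketch of the chroma variant above, illustrative only; it reuses
+ * clip_u8_sketch from the luma sketch earlier in this file. Cb and Cr are
+ * interleaved, so even columns take the Cb weight/offset pair and odd
+ * columns the Cr pair, which is why the vector constants alternate Cb/Cr
+ * lanes. */
+static void weighted_pred_chroma_bi_scalar_sketch(WORD16 *pi2_src1,
+                                                  WORD16 *pi2_src2,
+                                                  UWORD8 *pu1_dst,
+                                                  WORD32 src_strd1,
+                                                  WORD32 src_strd2,
+                                                  WORD32 dst_strd,
+                                                  WORD32 wgt0_cb, WORD32 wgt0_cr,
+                                                  WORD32 off0_cb, WORD32 off0_cr,
+                                                  WORD32 wgt1_cb, WORD32 wgt1_cr,
+                                                  WORD32 off1_cb, WORD32 off1_cr,
+                                                  WORD32 shift,
+                                                  WORD32 lvl_shift1,
+                                                  WORD32 lvl_shift2,
+                                                  WORD32 ht, WORD32 wd)
+{
+    WORD32 row, col;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col++) /* 2 * wd interleaved CbCr samples */
+        {
+            WORD32 wgt0 = (col & 1) ? wgt0_cr : wgt0_cb;
+            WORD32 wgt1 = (col & 1) ? wgt1_cr : wgt1_cb;
+            WORD32 off0 = (col & 1) ? off0_cr : off0_cb;
+            WORD32 off1 = (col & 1) ? off1_cr : off1_cb;
+            WORD32 i4_tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1) * wgt0
+                          + (pi2_src2[row * src_strd2 + col] + lvl_shift2) * wgt1
+                          + ((off0 + off1 + 1) << (shift - 1));
+            pu1_dst[row * dst_strd + col] = clip_u8_sketch(i4_tmp >> shift);
+        }
+    }
+}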
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed by pu1_dst
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+* >> shift, where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+* Assumption : ht % 2 == 0, wd % 4 == 0
+* shift == 7, (lvl_shift1 + lvl_shift2) can take {0, 8K, 16K}; in those cases
+* the final result matches even when the intermediate precision is 16 bit.
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_bi_default_sse42(WORD16 *pi2_src1,
+                                          WORD16 *pi2_src2,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 src_strd1,
+                                          WORD32 src_strd2,
+                                          WORD32 dst_strd,
+                                          WORD32 lvl_shift1,
+                                          WORD32 lvl_shift2,
+                                          WORD32 ht,
+                                          WORD32 wd)
+{
+    WORD32 row, col, temp;
+    WORD32 shift;
+
+    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+    __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
+    __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+    ASSERT(wd % 4 == 0); /* checking assumption*/
+    ASSERT(ht % 2 == 0); /* checking assumption*/
+
+    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+    temp = 1 << (shift - 1);
+
+    // setting values in registers
+    lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
+    lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
+    const_temp_8x16b = _mm_set1_epi16(temp);
+
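+    /* lvl_shift1, lvl_shift2 and the rounding term (1 << (shift - 1)) are
+     * folded into a single 16-bit constant below, so every inner loop needs
+     * only one saturating add per vector (see the precision remark in the
+     * function header) */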
+    lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
+    lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
+
+    if(0 == (ht & 3)) /* ht multiple of 4*/
+    {
+        if(0 == (wd & 15)) /* wd multiple of 16 case */
+        {
+            __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
+            __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 4)
+            {
+                for(col = 0; col < wd; col += 16)
+                {
+                    /*load 8 pixel values */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+                    /* row = 2 */
+                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+                    /* row = 3 */
+                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+                    /*load 8 pixel values */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
+                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
+                    /* row = 1 */
+                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
+                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
+                    /* row = 2 */
+                    src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
+                    src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+                    /*load 8 pixel values */ /* Second 8 Values */
+                    /* row = 3 */
+                    src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
+                    src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
+                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
+                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
+                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+                    /* (i4_tmp >> shift) */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
+                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
+                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
+                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
+                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
+                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
+
+                    /* (i4_tmp >> shift) */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
+                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
+                    src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
+                    src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* 16 8-bit values */
+                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
+                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
+                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
+                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);
+
+                    /* store sixteen 8-bit output values per row */
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+                    /* To update pointer */
+                    pi2_src1 += 16;
+                    pi2_src2 += 16;
+                    pu1_dst  += 16;
+
+                } /* inner loop ends here (16 output values per iteration) */
+
+                pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
+                pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
+                pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
+
+            }
+        }
+        else if(0 == (wd & 7)) /* multiple of 8 case */
+        {
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 4)
+            {
+                for(col = 0; col < wd; col += 8)
+                {
+                    /*load 8 pixel values */
+                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+                    /* row = 2 */
+                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+                    /* row = 3 */
+                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+                    /* (i4_tmp >> shift) */
+                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
+                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
+                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
+
+                    /* store eight 8-bit output values per row */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+                    /* To update pointer */
+                    pi2_src1 += 8;
+                    pi2_src2 += 8;
+                    pu1_dst  += 8;
+
+                } /* inner loop ends here(8-output values in single iteration) */
+
+                pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
+                pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
+                pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
+
+            }
+        }
+        else /* wd multiple of 4 case*/
+        {
+            WORD32 dst0, dst1, dst2, dst3;
+
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 4)
+            {
+                for(col = 0; col < wd; col += 4)
+                {
+                    /* load 4 pixel values from 3:0 pos. relative to cur. pos. */
+                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+                    /* row = 2 */
+                    src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
+                    src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
+                    /* row = 3 */
+                    src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
+                    src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+                    /* Pack two rows together */
+                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+                    src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
+                    src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+
+                    /* (i4_tmp >> shift) */
+                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+
+                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+                    /* dst row = 1 to 3 */
+                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+                    src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
+
+                    /* store four 8-bit output values  */
+                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+                    dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
+                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
+
+                    /* row = 1 to row = 3 */
+                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
+                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
+
+                    /* To update pointer */
+                    pi2_src1 += 4;
+                    pi2_src2 += 4;
+                    pu1_dst  += 4;
+
+                } /* inner loop ends here(4-output values in single iteration) */
+
+                pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
+                pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
+                pu1_dst  = pu1_dst  - wd + 4 * dst_strd;  /* Pointer update */
+
+            }
+        }
+    }
+    else /* ht multiple of 2 case and wd multiple of 4 case*/
+    {
+
+        WORD32 dst0, dst1;
+
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 2)
+        {
+            for(col = 0; col < wd; col += 4)
+            {
+                /* load 4 pixel values from 3:0 pos. relative to cur. pos. */
+                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+
+                /* row = 1 */
+                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+
+                /* Pack two rows together */
+                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+
+                /* (pi2_src1[col] + pi2_src2[col]) */
+                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+
+                /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+
+                /* (i4_tmp >> shift) */
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+
+                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+
+                dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+                /* dst row = 1 */
+                src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+
+                /* store four 8-bit output values  */
+                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+
+                /* row = 1 */
+                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+                /* To update pointer */
+                pi2_src1 += 4;
+                pi2_src2 += 4;
+                pu1_dst  += 4;
+
+            } /* inner loop ends here(4-output values in single iteration) */
+
+            pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
+            pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
+            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;  /* Pointer update */
+
+        }
+
+    }
+
+}
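+
+/* Scalar sketch of the default (equal-weight) bi-prediction above,
+ * illustrative only; it reuses clip_u8_sketch defined earlier in this file
+ * and derives shift the same way the function does. */
+static void weighted_pred_bi_default_scalar_sketch(WORD16 *pi2_src1,
+                                                   WORD16 *pi2_src2,
+                                                   UWORD8 *pu1_dst,
+                                                   WORD32 src_strd1,
+                                                   WORD32 src_strd2,
+                                                   WORD32 dst_strd,
+                                                   WORD32 lvl_shift1,
+                                                   WORD32 lvl_shift2,
+                                                   WORD32 ht, WORD32 wd)
+{
+    WORD32 row, col;
+    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            WORD32 i4_tmp = pi2_src1[row * src_strd1 + col]
+                          + pi2_src2[row * src_strd2 + col]
+                          + lvl_shift1 + lvl_shift2 + (1 << (shift - 1));
+            pu1_dst[row * dst_strd + col] = clip_u8_sketch(i4_tmp >> shift);
+        }
+    }
+}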
diff --git a/common/x86/ihevc_weighted_pred_ssse3_intr.c b/common/x86/ihevc_weighted_pred_ssse3_intr.c
new file mode 100644
index 0000000..b8778a3
--- /dev/null
+++ b/common/x86/ihevc_weighted_pred_ssse3_intr.c
@@ -0,0 +1,2386 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevc_weighted_pred_ssse3_intr.c
+*
+* @brief
+*  Contains function definitions for weighted prediction used in inter
+* prediction
+*
+* @author
+*
+*
+* @par List of Functions:
+*   - ihevc_weighted_pred_uni_ssse3()
+*   - ihevc_weighted_pred_bi_ssse3()
+*   - ihevc_weighted_pred_bi_default_ssse3()
+*   - ihevc_weighted_pred_chroma_uni_ssse3()
+*   - ihevc_weighted_pred_chroma_bi_ssse3()
+*   - ihevc_weighted_pred_chroma_bi_default_ssse3()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <assert.h>
+
+#include "ihevc_debug.h"
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_weighted_pred.h"
+#include "ihevc_inter_pred.h"
+
+
+#include <immintrin.h>
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does uni-weighted prediction on the array pointed by pi2_src and stores
+* the result at the location pointed by pu1_dst
+*
+* @par Description:
+*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+* offset
+*
+* @param[in] pi2_src
+*  Pointer to the source
+*
+* @param[out] pu1_dst
+*  Pointer to the destination
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight to be multiplied to the source
+*
+* @param[in] off0
+*  offset to be added after rounding and shifting
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_uni_ssse3(WORD16 *pi2_src,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 src_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 wgt0,
+                                   WORD32 off0,
+                                   WORD32 shift,
+                                   WORD32 lvl_shift,
+                                   WORD32 ht,
+                                   WORD32 wd)
+{
+    WORD32 row, col, temp;
+
+    /* all 128 bit registers are named with a suffix mxnb, where m is the */
+    /* number of n bits packed in the register                            */
+    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
+    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
+    __m128i res_temp0_4x32b, res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b;
+
+    ASSERT(wd % 4 == 0); /* checking assumption*/
+    ASSERT(ht % 4 == 0); /* checking assumption*/
+
+    temp = 1 << (shift - 1);
+
+    // setting values in registers
+    lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
+    wgt0_8x16b = _mm_set1_epi16(wgt0);
+
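+    /* note on the idiom used in this function: before SSE4.1 there is no
+     * single instruction giving all eight 32-bit products of two 16-bit
+     * vectors, so each product is assembled from _mm_mullo_epi16 (low 16
+     * bits) and _mm_mulhi_epi16 (high 16 bits), then the halves are
+     * interleaved into 32-bit lanes with _mm_unpacklo_epi16 /
+     * _mm_unpackhi_epi16 */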
+    /* lvl_shift * wgt0 */
+    res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
+    res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
+
+    const_temp_4x32b = _mm_set1_epi32(temp);
+    off0_4x32b = _mm_set1_epi32(off0);
+
+
+    /* lvl_shift * wgt0 */
+    lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
+    /* lvl_shift * wgt0 + 1 << (shift - 1) */
+    lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
+
+    if(0 == (wd & 7)) /* wd multiple of 8 case */
+    {
+        __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
+
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 4)
+        {
+            for(col = 0; col < wd; col += 8)
+            {   /* for row =0 ,1,2,3*/
+
+                /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
+                /* row = 1 */
+                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+                /* row = 2 */
+                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
+                /* row = 3 */
+                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
+
+                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
+                res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
+                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
+                res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
+
+                /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
+                src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
+                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
+                src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
+
+                /* Get 32 bit Result */
+                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
+                res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
+                res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
+
+                res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
+                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
+                res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
+
+                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
+                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
+                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
+                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
+                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
+                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
+                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
+
+                /* (i4_tmp >> shift) */ /* First 4 pixels */
+                res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
+                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
+                res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
+                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
+
+                /* (i4_tmp >> shift) */ /* Last 4 pixels */
+                res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift);
+                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
+                res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift);
+                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
+
+                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
+                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
+                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
+
+                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
+                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
+                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
+                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
+
+                res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp4_4x32b);
+                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
+                res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp6_4x32b);
+                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
+                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
+                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+                res_temp2_4x32b = _mm_packus_epi16(res_temp2_4x32b, res_temp2_4x32b);
+                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
+
+                /* store eight 8-bit output values per row */
+                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), res_temp2_4x32b); /* row = 2*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), res_temp3_4x32b); /* row = 3*/
+
+                /* To update pointer */
+                pi2_src += 8;
+                pu1_dst += 8;
+
+            } /* inner loop ends here (8 output values per iteration) */
+
+            pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
+            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+        }
+    }
+    else  /* wd multiple of 4 case */
+    {
+        WORD32 dst0, dst1, dst2, dst3;
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 4)
+        {
+            for(col = 0; col < wd; col += 4)
+            {   /* for row =0 ,1,2,3*/
+
+                /* row = 0 */ /* load 4 pixel values from 3:0 pos. relative to cur. pos. */
+                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
+                /* row = 1 */
+                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
+                /* row = 2 */
+                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 2 * src_strd));
+                /* row = 3 */
+                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 3 * src_strd));
+
+                /* 2 rows together */
+                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp2_8x16b);
+                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+
+                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
+                res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
+                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Higher 16 bit */
+                src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
+                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+
+                /* Get 32 bit Result */
+                res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
+                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+
+                res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
+                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+
+                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
+                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
+                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
+
+                /* (i4_tmp >> shift) */
+                res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
+                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
+                res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
+                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
+
+                /*i4_tmp = (i4_tmp >> shift) + off0; */
+                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
+                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
+
+                res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
+                res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp3_4x32b);
+
+                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp2_4x32b);
+
+                dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
+                /* dst row = 1 to 3 */
+                res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
+                res_temp2_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 2);
+                res_temp3_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 3);
+
+                /* store four 8-bit output values  */
+                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
+                dst2 = _mm_cvtsi128_si32(res_temp2_4x32b);
+                dst3 = _mm_cvtsi128_si32(res_temp3_4x32b);
+
+                /* row = 1 to row = 3 */
+                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+                *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
+                *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
+
+                /* To update pointer */
+                pi2_src += 4;
+                pu1_dst += 4;
+
+            } /* inner loop ends here(4-output values in single iteration) */
+
+            pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
+            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+        }
+    }
+}
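+
+/* Scalar sketch of the uni-weighted prediction above, illustrative only and
+ * self-contained apart from the typedefs this file already includes: */
+static void weighted_pred_uni_scalar_sketch(WORD16 *pi2_src,
+                                            UWORD8 *pu1_dst,
+                                            WORD32 src_strd, WORD32 dst_strd,
+                                            WORD32 wgt0, WORD32 off0,
+                                            WORD32 shift, WORD32 lvl_shift,
+                                            WORD32 ht, WORD32 wd)
+{
+    WORD32 row, col;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            WORD32 i4_tmp = (pi2_src[row * src_strd + col] + lvl_shift) * wgt0
+                          + (1 << (shift - 1));
+            i4_tmp = (i4_tmp >> shift) + off0;
+            pu1_dst[row * dst_strd + col] =
+                (UWORD8)((i4_tmp < 0) ? 0 : ((i4_tmp > 255) ? 255 : i4_tmp));
+        }
+    }
+}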
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma uni-weighted prediction on the array pointed by pi2_src and
+* stores the result at the location pointed by pu1_dst
+*
+* @par Description:
+*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+* offset
+*
+* @param[in] pi2_src
+*  Pointer to the source
+*
+* @param[out] pu1_dst
+*  Pointer to the destination
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight to be multiplied to the source
+*
+* @param[in] off0
+*  offset to be added after rounding and shifting
+*
+* @param[in] shift
+*  (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_weighted_pred_chroma_uni_ssse3(WORD16 *pi2_src,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 src_strd,
+                                          WORD32 dst_strd,
+                                          WORD32 wgt0_cb,
+                                          WORD32 wgt0_cr,
+                                          WORD32 off0_cb,
+                                          WORD32 off0_cr,
+                                          WORD32 shift,
+                                          WORD32 lvl_shift,
+                                          WORD32 ht,
+                                          WORD32 wd)
+{
+    WORD32 row, col, temp, wdx2;
+    /* all 128 bit registers are named with a suffix mxnb, where m is the */
+    /* number of n bits packed in the register                            */
+
+    __m128i src_temp0_8x16b, src_temp1_8x16b;
+    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
+    __m128i res_temp0_4x32b, res_temp1_4x32b;
+
+    ASSERT(wd % 2 == 0); /* checking assumption*/
+    ASSERT(ht % 2 == 0); /* checking assumption*/
+
+    temp = 1 << (shift - 1);
+    wdx2 = 2 * wd;
+
+    // setting values in registers
+    lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
+    wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
+
+    /* lvl_shift * wgt0 */
+    res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
+    res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
+
+    const_temp_4x32b = _mm_set1_epi32(temp);
+    off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);
+
+    /* lvl_shift * wgt0 */
+    lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
+    /* lvl_shift * wgt0 + 1 << (shift - 1) */
+    lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
+
+    {
+        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
+        {
+            __m128i src_temp2_8x16b, src_temp3_8x16b;
+            __m128i res_temp2_4x32b, res_temp3_4x32b;
+            __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
+
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 16)
+                {
+                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
+                    /* row = 1 */
+                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+                    /* row = 0 */ /* Next 8 pixels */
+                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
+
+                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
+                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
+                    res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+                    res_temp4_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
+                    res_temp5_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
+
+                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
+                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
+                    src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+                    src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
+                    src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
+
+                    /* Get 32 bit Result */
+                    res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
+                    res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                    res_temp6_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp2_8x16b);
+                    res_temp7_4x32b = _mm_unpackhi_epi16(res_temp5_4x32b, src_temp3_8x16b);
+
+                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
+                    res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                    res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp2_8x16b);
+                    res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, src_temp3_8x16b);
+
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
+                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
+                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
+                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
+                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
+                    res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
+                    res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
+                    res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
+                    res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
+
+                    /* (i4_tmp >> shift) */
+                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
+                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
+                    res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b,  shift);
+                    res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
+                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
+                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
+                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
+
+                    /* (i4_tmp >> shift) */
+                    res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b,  shift);
+                    res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
+                    res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b,  shift);
+                    res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
+                    res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
+                    res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+                    res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
+                    res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
+
+                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
+                    res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
+                    res_temp4_4x32b = _mm_packs_epi32(res_temp4_4x32b, res_temp6_4x32b);
+                    res_temp5_4x32b = _mm_packs_epi32(res_temp5_4x32b, res_temp7_4x32b);
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp4_4x32b);
+                    res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp5_4x32b);
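+
+                    /* _mm_packs_epi32 narrows 32->16 with signed saturation   */
+                    /* and _mm_packus_epi16 narrows 16->8 with unsigned        */
+                    /* saturation; together they implement the CLIP_U8() of    */
+                    /* the reference C code                                    */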
+
+                    /* store 16 8-bit output values  */
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
+                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
+
+                    pi2_src += 16;  /* Pointer update */
+                    pu1_dst += 16; /* Pointer update */
+
+                } /* inner loop ends here (16 output values per row per iteration) */
+                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
+                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+            }
+        }
+        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
+        {
+            __m128i res_temp2_4x32b, res_temp3_4x32b;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 8)
+                {
+                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
+                    /* row = 1 */
+                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+
+                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
+                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
+                    res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
+                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
+                    src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+
+                    /* Get 32 bit Result */
+                    res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
+                    res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+
+                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
+                    res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
+                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
+                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
+                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
+                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
+
+                    /* (i4_tmp >> shift) */
+                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
+                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
+                    res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b,  shift);
+                    res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
+
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
+                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
+                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
+
+                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
+                    res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
+                    res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+
+                    /* store eight 8-bit output values  */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
+
+                    pi2_src += 8;   /* Pointer update */
+                    pu1_dst += 8; /* Pointer update */
+
+                } /* inner loop ends here (8 output values per row per iteration) */
+                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
+                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+            }
+        }
+        else /* 2*wd multiple of 4 case */
+        {
+            WORD32 dst0, dst1;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 4)
+                {
+                    /* row = 0 */ /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+                    src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
+                    /* row = 1 */
+                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
+
+                    /* 2 rows together */
+                    src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
+
+                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
+                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
+                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
+                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
+
+                    /* Get 32 bit Result */
+                    res_temp1_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
+                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
+
+                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
+                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
+                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
+
+                    /* (i4_tmp >> shift) */
+                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
+                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
+
+                    /*i4_tmp = (i4_tmp >> shift) + off0; */
+                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
+                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
+
+                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
+
+                    dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
+                    /* dst row = 1 */
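+                    /* _mm_shuffle_epi32(x, 1) places lane 1 of x (the four    */
+                    /* packed bytes of row 1) into lane 0, where               */
+                    /* _mm_cvtsi128_si32 can read it out                       */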
+                    res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
+
+                    /* store four 8-bit output values  */
+                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                    dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
+                    /* row = 1 */
+                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+                    pi2_src += 4;   /* Pointer update */
+                    pu1_dst += 4; /* Pointer update */
+
+                } /* inner loop ends here(4-output values in single iteration) */
+                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
+                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+            }
+        }
+    }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
+* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0
+*  weight to be multiplied with source 1
+*
+* @param[in] off0
+*  offset 0
+*
+* @param[in] wgt1
+*  weight to be multiplied with source 2
+*
+* @param[in] off1
+*  offset 1
+*
+* @param[in] shift
+*  (14 - Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
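+
+/* For reference, a plain-C sketch (illustration only, not compiled) of the
+* computation the SSSE3 code below vectorizes, following the description
+* above; CLIP_U8 is the codec's clip-to-[0,255] macro:
+*
+*     for(row = 0; row < ht; row++)
+*     {
+*         for(col = 0; col < wd; col++)
+*         {
+*             WORD32 i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0
+*                           + (pi2_src2[col] + lvl_shift2) * wgt1
+*                           + ((off0 + off1 + 1) << (shift - 1));
+*             pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+*         }
+*         pi2_src1 += src_strd1; pi2_src2 += src_strd2; pu1_dst += dst_strd;
+*     }
+*/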
+
+
+void ihevc_weighted_pred_bi_ssse3(WORD16 *pi2_src1,
+                                  WORD16 *pi2_src2,
+                                  UWORD8 *pu1_dst,
+                                  WORD32 src_strd1,
+                                  WORD32 src_strd2,
+                                  WORD32 dst_strd,
+                                  WORD32 wgt0,
+                                  WORD32 off0,
+                                  WORD32 wgt1,
+                                  WORD32 off1,
+                                  WORD32 shift,
+                                  WORD32 lvl_shift1,
+                                  WORD32 lvl_shift2,
+                                  WORD32 ht,
+                                  WORD32 wd)
+{
+    WORD32 row, col, temp;
+
+    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
+    __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
+
+    ASSERT(wd % 4 == 0); /* checking assumption*/
+    ASSERT(ht % 4 == 0); /* checking assumption*/
+
+    temp = (off0 + off1 + 1) << (shift - 1);
+
+    // setting values in registers
+    lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
+    wgt0_8x16b = _mm_set1_epi16(wgt0);
+    lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
+    wgt1_8x16b = _mm_set1_epi16(wgt1);
+
+    /* lvl_shift1 * wgt0 */
+    res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
+    res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
+    /* lvl_shift2 * wgt1 */
+    res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
+    res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
+
+    const_temp_4x32b = _mm_set1_epi32(temp);
+
+    /* lvl_shift1 * wgt0 */
+    lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
+    /* lvl_shift2 * wgt1 */
+    lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
+
+    if(0 == (wd & 7)) /* wd multiple of 8 case */
+    {
+        __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 2)
+        {
+            for(col = 0; col < wd; col += 8)
+            {
+                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
+                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
+                res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
+                res_temp4_4x32b  = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
+                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
+                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
+                src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
+                src_temp4_8x16b  = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
+
+                /* Get 32 bit Result */
+                res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
+                res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
+                res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
+
+                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
+                res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
+                res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
+
+                /* (pi2_src[col] + lvl_shift) * wgt */
+                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
+                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
+                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
+                res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
+                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
+                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
+
+                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
+                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
+                /* (i4_tmp >> shift) */
+                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
+                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
+
+                /* Next 4 Pixels */
+                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
+                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
+                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
+                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
+                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
+                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
+
+                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
+                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
+
+                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
+
+                /* store eight 8-bit output values  */
+                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
+
+                pi2_src1 += 8;  /* Pointer update */
+                pi2_src2 += 8;  /* Pointer update */
+                pu1_dst  += 8;  /* Pointer update */
+
+            } /* inner loop ends here (8 output values per row per iteration) */
+
+            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
+            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
+            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
+
+        } /* outer loop ends */
+    }
+    else /* wd multiple of 4 case */
+    {
+        WORD32 dst0, dst1;
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 2)
+        {
+            for(col = 0; col < wd; col += 4)
+            {
+                /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
+                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
+                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+                /* 2 rows together */
+                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+
+                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
+                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
+                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
+                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
+
+                /* Get 32 bit Result */
+                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
+
+                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
+
+                /* (pi2_src[col] + lvl_shift) * wgt */
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
+                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
+                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
+
+                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
+
+                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
+
+                /* (i4_tmp >> shift) */
+                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
+                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
+
+                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
+
+                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+
+                dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
+
+                /* dst row = 1 */
+                res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
+
+                /* store four 8-bit output values  */
+                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
+
+                /* row = 1 */
+                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+                pi2_src1 += 4;  /* Pointer update */
+                pi2_src2 += 4;  /* Pointer update */
+                pu1_dst  += 4;  /* Pointer update */
+
+            } /* inner loop ends here(4-output values in single iteration) */
+
+            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
+            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
+            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
+
+        } /* outer loop ends */
+    }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1
+* and pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
+* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] wgt0_cb
+*  weight to be multiplied with cb samples of source 1
+*
+* @param[in] wgt0_cr
+*  weight to be multiplied with cr samples of source 1
+*
+* @param[in] off0_cb
+*  offset 0 for cb
+*
+* @param[in] off0_cr
+*  offset 0 for cr
+*
+* @param[in] wgt1_cb
+*  weight to be multiplied with cb samples of source 2
+*
+* @param[in] wgt1_cr
+*  weight to be multiplied with cr samples of source 2
+*
+* @param[in] off1_cb
+*  offset 1 for cb
+*
+* @param[in] off1_cr
+*  offset 1 for cr
+*
+* @param[in] shift
+*  (14 - Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
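+
+/* For reference, a plain-C sketch (illustration only, not compiled): cb and
+* cr samples alternate along a chroma row (even columns cb, odd columns cr),
+* so the weight/offset pair is chosen by column parity over 2*wd samples:
+*
+*     for(row = 0; row < ht; row++)
+*     {
+*         for(col = 0; col < 2 * wd; col++)
+*         {
+*             WORD32 wgt0 = (col & 1) ? wgt0_cr : wgt0_cb;
+*             WORD32 wgt1 = (col & 1) ? wgt1_cr : wgt1_cb;
+*             WORD32 off  = (col & 1) ? (off0_cr + off1_cr + 1)
+*                                     : (off0_cb + off1_cb + 1);
+*             WORD32 i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0
+*                           + (pi2_src2[col] + lvl_shift2) * wgt1
+*                           + (off << (shift - 1));
+*             pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+*         }
+*         pi2_src1 += src_strd1; pi2_src2 += src_strd2; pu1_dst += dst_strd;
+*     }
+*/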
+
+
+void ihevc_weighted_pred_chroma_bi_ssse3(WORD16 *pi2_src1,
+                                         WORD16 *pi2_src2,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd1,
+                                         WORD32 src_strd2,
+                                         WORD32 dst_strd,
+                                         WORD32 wgt0_cb,
+                                         WORD32 wgt0_cr,
+                                         WORD32 off0_cb,
+                                         WORD32 off0_cr,
+                                         WORD32 wgt1_cb,
+                                         WORD32 wgt1_cr,
+                                         WORD32 off1_cb,
+                                         WORD32 off1_cr,
+                                         WORD32 shift,
+                                         WORD32 lvl_shift1,
+                                         WORD32 lvl_shift2,
+                                         WORD32 ht,
+                                         WORD32 wd)
+{
+    WORD32 row, col, temp1, temp2;
+    WORD32 wdx2;
+
+    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
+    __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
+
+    ASSERT(wd % 2 == 0); /* checking assumption*/
+    ASSERT(ht % 2 == 0); /* checking assumption*/
+
+    temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
+    temp2 = (off0_cr + off1_cr + 1) << (shift - 1);
+
+    // setting values in registers
+    lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
+    wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
+    lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
+    wgt1_8x16b = _mm_set_epi16(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);
+
+    /* lvl_shift1 * wgt0 */
+    res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
+    res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
+    /* lvl_shift2 * wgt1 */
+    res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
+    res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
+
+    const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
+    wdx2 = wd * 2;
+
+    /* lvl_shift1 * wgt0 */
+    lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
+    /* lvl_shift2 * wgt1 */
+    lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
+
+    if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
+    {
+        __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 2)
+        {
+            for(col = 0; col < wdx2; col += 8)
+            {
+                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
+                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
+                res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
+                res_temp4_4x32b  = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
+                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
+                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
+                src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
+                src_temp4_8x16b  = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
+
+                /* Get 32 bit Result */
+                res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
+                res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
+                res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
+
+                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
+                res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
+                res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
+
+                /* (pi2_src[col] + lvl_shift) * wgt */
+                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
+                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
+                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
+                res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
+                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
+                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
+
+                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
+                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
+                /* (i4_tmp >> shift) */
+                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
+                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
+
+                /* Next 4 Pixels */
+                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
+                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
+                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
+                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
+                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
+                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
+
+                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
+                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
+
+                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
+
+                /* store eight 8-bit output values  */
+                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
+                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
+
+                pi2_src1 += 8;  /* Pointer update */
+                pi2_src2 += 8;  /* Pointer update */
+                pu1_dst  += 8;  /* Pointer update */
+
+            } /* inner loop ends here (8 output values per row per iteration) */
+
+            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
+            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
+            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
+
+        } /* outer loop ends */
+    }
+    else /* wdx2 multiple of 4 case */
+    {
+        WORD32 dst0, dst1;
+        /*  outer for loop starts from here */
+        for(row = 0; row < ht; row += 2)
+        {
+            for(col = 0; col < wdx2; col += 4)
+            {
+                /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
+                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
+                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+                /* 2 rows together */
+                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+
+                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
+                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
+                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
+                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
+
+                /* Get 32 bit Result */
+                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
+
+                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
+
+                /* (pi2_src[col] + lvl_shift) * wgt */
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
+                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
+                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
+
+                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
+
+                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
+                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
+
+                /* (i4_tmp >> shift) */
+                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
+                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
+
+                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
+
+                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+
+                dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
+
+                /* dst row = 1 */
+                res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
+
+                /* store four 8-bit output values  */
+                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
+
+                /* row = 1 */
+                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+                pi2_src1 += 4;  /* Pointer update */
+                pi2_src2 += 4;  /* Pointer update */
+                pu1_dst  += 4;  /* Pointer update */
+
+            } /* inner loop ends here(4-output values in single iteration) */
+
+            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
+            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
+            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
+        }
+    }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does default bi-weighted prediction on the arrays pointed to by pi2_src1
+* and pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+* >> shift  where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source
+*
+* @returns
+*
+* @remarks
+*  None
+*
+* Assumption : ht % 2 == 0, wd % 4 == 0
+* shift == 7, (lvl_shift1 + lvl_shift2) can take values {0, 8K, 16K}; in that
+* case the final result matches even if the intermediate precision is 16 bit.
+*
+*******************************************************************************
+*/
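+
+/* For reference, a plain-C sketch (illustration only, not compiled) of the
+* default, equal-weight case, with shift = SHIFT_14_MINUS_BIT_DEPTH + 1:
+*
+*     for(row = 0; row < ht; row++)
+*     {
+*         for(col = 0; col < wd; col++)
+*         {
+*             WORD32 i4_tmp = pi2_src1[col] + lvl_shift1
+*                           + pi2_src2[col] + lvl_shift2
+*                           + (1 << (shift - 1));
+*             pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+*         }
+*         pi2_src1 += src_strd1; pi2_src2 += src_strd2; pu1_dst += dst_strd;
+*     }
+*/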
+void ihevc_weighted_pred_bi_default_ssse3(WORD16 *pi2_src1,
+                                          WORD16 *pi2_src2,
+                                          UWORD8 *pu1_dst,
+                                          WORD32 src_strd1,
+                                          WORD32 src_strd2,
+                                          WORD32 dst_strd,
+                                          WORD32 lvl_shift1,
+                                          WORD32 lvl_shift2,
+                                          WORD32 ht,
+                                          WORD32 wd)
+{
+#if 1 /* 16-bit saturating-arithmetic path; the #else block below is a disabled 32-bit variant (it uses the SSE4.1 _mm_cvtepi16_epi32) */
+    {
+        WORD32 row, col, temp;
+        WORD32 shift;
+
+        __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+        __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
+        __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+        ASSERT(wd % 4 == 0); /* checking assumption*/
+        ASSERT(ht % 2 == 0); /* checking assumption*/
+
+        shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+        temp = 1 << (shift - 1);
+
+        // setting values in registers
+        lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
+        lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
+        const_temp_8x16b = _mm_set1_epi16(temp);
+
+        lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
+        lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
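+
+        /* everything below stays in 16-bit saturating arithmetic; per the  */
+        /* assumption noted above, (lvl_shift1 + lvl_shift2) plus rounding  */
+        /* is small enough that this matches full-precision arithmetic      */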
+
+        if(0 == (ht & 3)) /* ht multiple of 4*/
+        {
+            if(0 == (wd & 15)) /* wd multiple of 16 case */
+            {
+                __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
+                __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
+                /*  outer for loop starts from here */
+                for(row = 0; row < ht; row += 4)
+                {
+                    for(col = 0; col < wd; col += 16)
+                    {
+                        /*load 8 pixel values */ /* First 8 Values */
+                        src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+                        src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+                        /* row = 1 */
+                        src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+                        src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+                        /* row = 2 */
+                        src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+                        src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+                        /* row = 3 */
+                        src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+                        src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+                        /*load 8 pixel values */ /* Second 8 Values */
+                        src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
+                        src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
+                        /* row = 1 */
+                        src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
+                        src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
+                        /* row = 2 */
+                        src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
+                        src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
+
+                        /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
+                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+                        /*load 8 pixel values */ /* Second 8 Values */
+                        /* row = 3 */
+                        src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
+                        src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
+
+                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
+                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+                        /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
+                        src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
+                        src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
+                        src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
+                        src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+                        /* (i4_tmp >> shift) */ /* First 8 Values */
+                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
+                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
+                        src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
+
+                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
+                        src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
+                        src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
+                        src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
+                        src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
+
+                        /* (i4_tmp >> shift) */ /* Second 8 Values */
+                        src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
+                        src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
+                        src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
+                        src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);
+
+                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* sixteen 8-bit values */
+                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
+                        src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
+                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
+                        src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);
+
+                        /* store sixteen 8-bit output values  */
+                        _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+                        _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+                        _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+                        _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+                        /* To update pointer */
+                        pi2_src1 += 16;
+                        pi2_src2 += 16;
+                        pu1_dst  += 16;
+
+                    } /* inner loop ends here (16 output values per row per iteration) */
+
+                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
+                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
+                    pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
+
+                }
+            }
+            else if(0 == (wd & 7)) /* multiple of 8 case */
+            {
+                /*  outer for loop starts from here */
+                for(row = 0; row < ht; row += 4)
+                {
+                    for(col = 0; col < wd; col += 8)
+                    {
+                        /*load 8 pixel values */
+                        src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+                        src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+                        /* row = 1 */
+                        src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+                        src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+                        /* row = 2 */
+                        src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+                        src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+                        /* row = 3 */
+                        src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+                        src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+                        /* (pi2_src1[col] + pi2_src2[col]) */
+                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+                        /* (i4_tmp >> shift) */
+                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
+                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
+                        src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
+
+                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+                        src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+                        src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
+
+                        /* store eight 8-bit output values  */
+                        _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+                        _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+                        _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+                        _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+                        /* To update pointer */
+                        pi2_src1 += 8;
+                        pi2_src2 += 8;
+                        pu1_dst  += 8;
+
+                    } /* inner loop ends here(8-output values in single iteration) */
+
+                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
+                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
+                    pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
+
+                }
+            }
+            else /* wd multiple of 4 case*/
+            {
+                WORD32 dst0, dst1, dst2, dst3;
+
+                /*  outer for loop starts from here */
+                for(row = 0; row < ht; row += 4)
+                {
+                    for(col = 0; col < wd; col += 4)
+                    {
+                        /* load 4 pixel values from 3:0 pos. relative to cur. pos. */
+                        src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+                        src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+
+                        /* row = 1 */
+                        src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+                        src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+                        /* row = 2 */
+                        src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
+                        src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
+                        /* row = 3 */
+                        src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
+                        src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+                        /* Pack two rows together */
+                        src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+                        src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+                        src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
+                        src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
+
+                        /* (pi2_src1[col] + pi2_src2[col]) */
+                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+
+                        /* (i4_tmp >> shift) */
+                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
+
+                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+
+                        dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+                        /* dst rows 1 and 3 */
+                        src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+                        src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
+
+                        /* store four 8-bit output values  */
+                        *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                        dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+                        dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
+                        dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
+
+                        /* row = 1 to row = 3 */
+                        *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+                        *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
+                        *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
+
+                        /* To update pointer */
+                        pi2_src1 += 4;
+                        pi2_src2 += 4;
+                        pu1_dst  += 4;
+
+                    } /* inner loop ends here(4-output values in single iteration) */
+
+                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
+                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
+                    pu1_dst  = pu1_dst  - wd + 4 * dst_strd;  /* Pointer update */
+
+                }
+            }
+        }
+        else /* ht multiple of 2 case and wd multiple of 4 case*/
+        {
+
+            WORD32 dst0, dst1;
+
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wd; col += 4)
+                {
+                    /* load 4 pixel values from 3:0 pos. relative to cur. pos. */
+                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+
+                    /* Pack two rows together */
+                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+
+                    /* (i4_tmp >> shift) */
+                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+
+                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+                    /* dst row = 1 */
+                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+
+                    /* store four 8-bit output values  */
+                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+
+                    /* row = 1 */
+                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+                    /* To update pointer */
+                    pi2_src1 += 4;
+                    pi2_src2 += 4;
+                    pu1_dst  += 4;
+
+                } /* inner loop ends here(4-output values in single iteration) */
+
+                pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
+                pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
+                pu1_dst  = pu1_dst  - wd + 2 * dst_strd;  /* Pointer update */
+
+            }
+
+        }
+
+    }
+#else
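+    /* 32-bit accumulation variant: inputs are widened with _mm_cvtepi16_epi32
+    and summed with _mm_add_epi32 instead of using saturating 16-bit adds */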
+    {
+        WORD32 row, col, temp;
+        WORD32 shift;
+
+        __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+        __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
+        __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+        ASSERT (wd%4 == 0); /* checking assumption*/
+        ASSERT (ht%4 == 0); /* checking assumption*/
+
+        shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+        temp = 1 << (shift - 1);
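+        /* For 8-bit content SHIFT_14_MINUS_BIT_DEPTH is 6, so shift is 7 and
+        temp is 64, the rounding offset added before the final right shift */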
+
+        // setting values in registers
+        lvl_shift1_8x16b = _mm_set1_epi32 (lvl_shift1);
+        lvl_shift2_8x16b = _mm_set1_epi32 (lvl_shift2);
+        const_temp_8x16b = _mm_set1_epi32 (temp);
+
+        lvl_shift1_8x16b = _mm_add_epi32 (lvl_shift1_8x16b, lvl_shift2_8x16b);
+        lvl_shift1_8x16b = _mm_add_epi32 (lvl_shift1_8x16b, const_temp_8x16b);
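+        /* lvl_shift1_8x16b now holds (lvl_shift1 + lvl_shift2 + temp), so one
+        32-bit add per pixel applies both level shifts and the rounding */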
+
+        if( 0 == (wd & 7)) /* multiple of 8 case */
+        {
+            __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
+            __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
+
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row +=4)
+            {
+                for(col = 0; col < wd; col +=8)
+                {
+                    /*load 4 pixel values */
+                    src_temp1_8x16b = _mm_loadu_si128((__m128i*)(pi2_src1));
+                    src_temp2_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2));
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadu_si128((__m128i*)(pi2_src1+src_strd1));
+                    src_temp4_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2+src_strd2));
+                    /* row = 2 */
+                    src_temp5_8x16b = _mm_loadu_si128((__m128i*)(pi2_src1+2*src_strd1));
+                    src_temp6_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2+2*src_strd2));
+                    /* row = 3 */
+                    src_temp7_8x16b = _mm_loadu_si128((__m128i*)(pi2_src1+3*src_strd1));
+                    src_temp8_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2+3*src_strd2));
+
+                    /* considering pix. 3:0 by converting 16-bit to 32-bit */
+                    src_temp1_8x16b  = _mm_cvtepi16_epi32(src_temp1_8x16b);
+                    src_temp2_8x16b  = _mm_cvtepi16_epi32(src_temp2_8x16b);
+                    /* row = 1 */
+                    src_temp3_8x16b  = _mm_cvtepi16_epi32(src_temp3_8x16b);
+                    src_temp4_8x16b  = _mm_cvtepi16_epi32(src_temp4_8x16b);
+                    /* row = 2 */
+                    src_temp5_8x16b  = _mm_cvtepi16_epi32(src_temp5_8x16b);
+                    src_temp6_8x16b  = _mm_cvtepi16_epi32(src_temp6_8x16b);
+                    /* row = 3 */
+                    src_temp7_8x16b  = _mm_cvtepi16_epi32(src_temp7_8x16b);
+                    src_temp8_8x16b  = _mm_cvtepi16_epi32(src_temp8_8x16b);
+
+                    /* (pi2_src1[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp1_8x16b = _mm_add_epi32 (src_temp1_8x16b, lvl_shift1_8x16b);
+                    src_temp3_8x16b = _mm_add_epi32 (src_temp3_8x16b, lvl_shift1_8x16b);
+                    src_temp5_8x16b = _mm_add_epi32 (src_temp5_8x16b, lvl_shift1_8x16b);
+                    src_temp7_8x16b = _mm_add_epi32 (src_temp7_8x16b, lvl_shift1_8x16b);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp1_8x16b = _mm_add_epi32 (src_temp1_8x16b, src_temp2_8x16b);
+                    src_temp3_8x16b = _mm_add_epi32 (src_temp3_8x16b, src_temp4_8x16b);
+                    src_temp5_8x16b = _mm_add_epi32 (src_temp5_8x16b, src_temp6_8x16b);
+                    src_temp7_8x16b = _mm_add_epi32 (src_temp7_8x16b, src_temp8_8x16b);
+
+                    /* (i4_tmp >> shift) */
+                    src_temp1_8x16b = _mm_srai_epi32(src_temp1_8x16b,  shift);
+                    src_temp3_8x16b = _mm_srai_epi32(src_temp3_8x16b,  shift);
+                    src_temp5_8x16b = _mm_srai_epi32(src_temp5_8x16b,  shift);
+                    src_temp7_8x16b = _mm_srai_epi32(src_temp7_8x16b,  shift);
+
+                    /*load next 4 pixel values */
+                    src_temp9_8x16b  = _mm_loadu_si128((__m128i*)(pi2_src1+4));
+                    src_temp10_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2+4));
+                    /* row = 1 */
+                    src_temp11_8x16b = _mm_loadu_si128((__m128i*)(pi2_src1+1*src_strd1+4));
+                    src_temp12_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2+1*src_strd2+4));
+                    /* row = 2 */
+                    src_temp13_8x16b = _mm_loadu_si128((__m128i*)(pi2_src1+2*src_strd1+4));
+                    src_temp14_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2+2*src_strd2+4));
+                    /* row = 3 */
+                    src_temp15_8x16b = _mm_loadu_si128((__m128i*)(pi2_src1+3*src_strd1+4));
+                    src_temp16_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2+3*src_strd2+4));
+
+                    /* considering pix. 7:4 by converting 16-bit to 32-bit */
+                    src_temp9_8x16b   = _mm_cvtepi16_epi32(src_temp9_8x16b);
+                    src_temp10_8x16b  = _mm_cvtepi16_epi32(src_temp10_8x16b);
+                    /* row = 1 */
+                    src_temp11_8x16b  = _mm_cvtepi16_epi32(src_temp11_8x16b);
+                    src_temp12_8x16b  = _mm_cvtepi16_epi32(src_temp12_8x16b);
+                    /* row = 2 */
+                    src_temp13_8x16b  = _mm_cvtepi16_epi32(src_temp13_8x16b);
+                    src_temp14_8x16b  = _mm_cvtepi16_epi32(src_temp14_8x16b);
+                    /* row = 3 */
+                    src_temp15_8x16b  = _mm_cvtepi16_epi32(src_temp15_8x16b);
+                    src_temp16_8x16b  = _mm_cvtepi16_epi32(src_temp16_8x16b);
+
+                    /* (pi2_src1[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp9_8x16b  = _mm_add_epi32 (src_temp9_8x16b , lvl_shift1_8x16b);
+                    src_temp11_8x16b = _mm_add_epi32 (src_temp11_8x16b, lvl_shift1_8x16b);
+                    src_temp13_8x16b = _mm_add_epi32 (src_temp13_8x16b, lvl_shift1_8x16b);
+                    src_temp15_8x16b = _mm_add_epi32 (src_temp15_8x16b, lvl_shift1_8x16b);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp9_8x16b  = _mm_add_epi32 (src_temp9_8x16b , src_temp10_8x16b);
+                    src_temp11_8x16b = _mm_add_epi32 (src_temp11_8x16b, src_temp12_8x16b);
+                    src_temp13_8x16b = _mm_add_epi32 (src_temp13_8x16b, src_temp14_8x16b);
+                    src_temp15_8x16b = _mm_add_epi32 (src_temp15_8x16b, src_temp16_8x16b);
+
+                    /* (i4_tmp >> shift) */
+                    src_temp9_8x16b  = _mm_srai_epi32(src_temp9_8x16b ,  shift);
+                    src_temp11_8x16b = _mm_srai_epi32(src_temp11_8x16b,  shift);
+                    src_temp13_8x16b = _mm_srai_epi32(src_temp13_8x16b,  shift);
+                    src_temp15_8x16b = _mm_srai_epi32(src_temp15_8x16b,  shift);
+
+                    src_temp1_8x16b = _mm_packs_epi32 (src_temp1_8x16b, src_temp9_8x16b);
+                    src_temp3_8x16b = _mm_packs_epi32 (src_temp3_8x16b, src_temp11_8x16b);
+                    src_temp5_8x16b = _mm_packs_epi32 (src_temp5_8x16b, src_temp13_8x16b);
+                    src_temp7_8x16b = _mm_packs_epi32 (src_temp7_8x16b, src_temp15_8x16b);
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                    src_temp1_8x16b = _mm_packus_epi16 (src_temp1_8x16b, src_temp1_8x16b);
+                    src_temp3_8x16b = _mm_packus_epi16 (src_temp3_8x16b, src_temp3_8x16b);
+                    src_temp5_8x16b = _mm_packus_epi16 (src_temp5_8x16b, src_temp5_8x16b);
+                    src_temp7_8x16b = _mm_packus_epi16 (src_temp7_8x16b, src_temp7_8x16b);
+
+                    /* store four 8-bit output values  */
+                    _mm_storel_epi64((__m128i*)(pu1_dst+0*dst_strd), src_temp1_8x16b); /* row = 0*/
+                    _mm_storel_epi64((__m128i*)(pu1_dst+1*dst_strd), src_temp3_8x16b); /* row = 1*/
+                    _mm_storel_epi64((__m128i*)(pu1_dst+2*dst_strd), src_temp5_8x16b); /* row = 2*/
+                    _mm_storel_epi64((__m128i*)(pu1_dst+3*dst_strd), src_temp7_8x16b); /* row = 3*/
+
+                    /* To update pointer */
+                    pi2_src1 += 8;
+                    pi2_src2 += 8;
+                    pu1_dst  += 8;
+                } /* inner loop ends here(8-output values in single iteration) */
+
+                pi2_src1 = pi2_src1- wd + 4*src_strd1;  /* Pointer update */
+                pi2_src2 = pi2_src2- wd + 4*src_strd2;  /* Pointer update */
+                pu1_dst  = pu1_dst - wd + 4*dst_strd;   /* Pointer update */
+
+            }
+        }
+        else /* wd multiple of 4 case */
+        {
+            WORD32 dst0, dst1, dst2, dst3;
+
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row +=4)
+            {
+                for(col = 0; col < wd; col +=4)
+                {
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src_temp1_8x16b = _mm_loadu_si128((__m128i*)(pi2_src1));
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+                    src_temp2_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2));
+
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadu_si128((__m128i*)(pi2_src1+src_strd1));
+                    src_temp4_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2+src_strd2));
+                    /* row = 2 */
+                    src_temp5_8x16b = _mm_loadu_si128((__m128i*)(pi2_src1+2*src_strd1));
+                    src_temp6_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2+2*src_strd2));
+                    /* row = 3 */
+                    src_temp7_8x16b = _mm_loadu_si128((__m128i*)(pi2_src1+3*src_strd1));
+                    src_temp8_8x16b = _mm_loadu_si128((__m128i*)(pi2_src2+3*src_strd2));
+
+                    /* considering pix. 3:0 by converting 16-bit to 32-bit */
+                    src_temp1_8x16b  = _mm_cvtepi16_epi32(src_temp1_8x16b);
+                    src_temp2_8x16b  = _mm_cvtepi16_epi32(src_temp2_8x16b);
+                    /* row = 1 */
+                    src_temp3_8x16b  = _mm_cvtepi16_epi32(src_temp3_8x16b);
+                    src_temp4_8x16b  = _mm_cvtepi16_epi32(src_temp4_8x16b);
+                    /* row = 2 */
+                    src_temp5_8x16b  = _mm_cvtepi16_epi32(src_temp5_8x16b);
+                    src_temp6_8x16b  = _mm_cvtepi16_epi32(src_temp6_8x16b);
+                    /* row = 3 */
+                    src_temp7_8x16b  = _mm_cvtepi16_epi32(src_temp7_8x16b);
+                    src_temp8_8x16b  = _mm_cvtepi16_epi32(src_temp8_8x16b);
+
+                    /* (pi2_src1[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp1_8x16b = _mm_add_epi32 (src_temp1_8x16b, lvl_shift1_8x16b);
+                    src_temp3_8x16b = _mm_add_epi32 (src_temp3_8x16b, lvl_shift1_8x16b);
+                    src_temp5_8x16b = _mm_add_epi32 (src_temp5_8x16b, lvl_shift1_8x16b);
+                    src_temp7_8x16b = _mm_add_epi32 (src_temp7_8x16b, lvl_shift1_8x16b);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp1_8x16b = _mm_add_epi32 (src_temp1_8x16b, src_temp2_8x16b);
+                    src_temp3_8x16b = _mm_add_epi32 (src_temp3_8x16b, src_temp4_8x16b);
+                    src_temp5_8x16b = _mm_add_epi32 (src_temp5_8x16b, src_temp6_8x16b);
+                    src_temp7_8x16b = _mm_add_epi32 (src_temp7_8x16b, src_temp8_8x16b);
+
+                    /* (i4_tmp >> shift) */
+                    src_temp1_8x16b = _mm_srai_epi32(src_temp1_8x16b,  shift);
+                    src_temp3_8x16b = _mm_srai_epi32(src_temp3_8x16b,  shift);
+                    src_temp5_8x16b = _mm_srai_epi32(src_temp5_8x16b,  shift);
+                    src_temp7_8x16b = _mm_srai_epi32(src_temp7_8x16b,  shift);
+
+                    src_temp1_8x16b = _mm_packs_epi32 (src_temp1_8x16b, src_temp3_8x16b);
+                    src_temp5_8x16b = _mm_packs_epi32 (src_temp5_8x16b, src_temp7_8x16b);
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                    src_temp1_8x16b = _mm_packus_epi16 (src_temp1_8x16b, src_temp5_8x16b);
+
+                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+                    /* dst row = 1 to 3 */
+                    src_temp2_8x16b = _mm_shuffle_epi32 (src_temp1_8x16b, 1);
+                    src_temp3_8x16b = _mm_shuffle_epi32 (src_temp1_8x16b, 2);
+                    src_temp4_8x16b = _mm_shuffle_epi32 (src_temp1_8x16b, 3);
+
+                    /* store four 8-bit output values  */
+                    *(WORD32 *) (&pu1_dst[0*dst_strd]) = dst0;
+
+                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+                    dst2 = _mm_cvtsi128_si32(src_temp3_8x16b);
+                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
+
+                    /* row = 1 to row = 3 */
+                    *(WORD32 *) (&pu1_dst[1*dst_strd]) = dst1;
+                    *(WORD32 *) (&pu1_dst[2*dst_strd]) = dst2;
+                    *(WORD32 *) (&pu1_dst[3*dst_strd]) = dst3;
+
+                    pi2_src1 += 4;
+                    pi2_src2 += 4;
+                    pu1_dst  += 4;
+
+                } /* inner loop ends here(4-output values in single iteration) */
+
+                pi2_src1 = pi2_src1 - wd + 4*src_strd1; /* Pointer update */
+                pi2_src2 = pi2_src2 - wd + 4*src_strd2; /* Pointer update */
+                pu1_dst  = pu1_dst  - wd + 4*dst_strd;  /* Pointer update */
+
+            }
+        }
+
+    }
+#endif
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Does chroma default bi-weighted prediction on the arrays pointed to by
+* pi2_src1 and pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+*  dst = ( (src1 + lvl_shift1) +  (src2 + lvl_shift2) +  (1 << (shift - 1)) )
+* >> shift  where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+*  Pointer to source 1
+*
+* @param[in] pi2_src2
+*  Pointer to source 2
+*
+* @param[out] pu1_dst
+*  Pointer to destination
+*
+* @param[in] src_strd1
+*  Source stride 1
+*
+* @param[in] src_strd2
+*  Source stride 2
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] lvl_shift1
+*  added before shift and offset
+*
+* @param[in] lvl_shift2
+*  added before shift and offset
+*
+* @param[in] ht
+*  height of the source
+*
+* @param[in] wd
+*  width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+*  None
+*
+* Assumptions : ht % 2 == 0, wd % 2 == 0, lvl_shift1 == 0, lvl_shift2 == 0 and
+* shift == 7. (lvl_shift1 + lvl_shift2) can take the values {0, 8K, 16K}; in
+* those cases the final result matches even if the intermediate precision is 16 bit.
+*******************************************************************************
+*/
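+
+/* A minimal scalar reference for the SIMD code below (an illustrative sketch,
+not part of the library):
+
+    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+    for(row = 0; row < ht; row++)
+        for(col = 0; col < 2 * wd; col++)
+        {
+            WORD32 i4_tmp = pi2_src1[row * src_strd1 + col]
+                            + pi2_src2[row * src_strd2 + col]
+                            + (1 << (shift - 1));
+            pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp >> shift);
+        }
+
+lvl_shift1 and lvl_shift2 are taken as 0 per the assumptions above; 2 * wd
+covers the interleaved Cb and Cr samples of each row. */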
+
+void ihevc_weighted_pred_chroma_bi_default_ssse3(WORD16 *pi2_src1,
+                                                 WORD16 *pi2_src2,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 src_strd1,
+                                                 WORD32 src_strd2,
+                                                 WORD32 dst_strd,
+                                                 WORD32 lvl_shift1,
+                                                 WORD32 lvl_shift2,
+                                                 WORD32 ht,
+                                                 WORD32 wd)
+{
+    WORD32 row, col, temp;
+    WORD32 shift, wdx2;
+
+    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+    __m128i lvl_shift1_8x16b;
+    __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+    ASSERT(wd % 2 == 0); /* checking assumption*/
+    ASSERT(ht % 2 == 0); /* checking assumption*/
+    UNUSED(lvl_shift1);
+    UNUSED(lvl_shift2);
+    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+    temp = 1 << (shift - 1);
+    wdx2 = wd * 2;
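+    /* chroma is stored interleaved (Cb/Cr), so each row holds 2 * wd samples */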
+
+    // setting values in registers
+    lvl_shift1_8x16b = _mm_set1_epi16(temp);
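+    /* lvl_shift1 and lvl_shift2 are assumed 0 (see above), so only the
+    rounding offset temp is broadcast into the vector addend */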
+
+    if(0 == (ht & 3)) /* ht multiple of 4 case */
+    {
+        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
+        {
+            __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
+            __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 4)
+            {
+                for(col = 0; col < wdx2; col += 16)
+                {
+                    /*load 8 pixel values */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+                    /* row = 2 */
+                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+                    /* row = 3 */
+                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+                    /*load 8 pixel values */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
+                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
+                    /* row = 1 */
+                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
+                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
+                    /* row = 2 */
+                    src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
+                    src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+                    /*load 8 pixel values */ /* Second 8 Values */
+                    /* row = 3 */
+                    src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
+                    src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
+                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
+                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
+                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+                    /* (i4_tmp >> shift) */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
+                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
+                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
+                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
+                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
+                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
+
+                    /* (i4_tmp >> shift) */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
+                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
+                    src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
+                    src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);
+
+                    /* store four 8-bit output values  */ /* First 8 Values */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
+                    src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
+                    src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, src_temp13_8x16b);
+                    src_temp15_8x16b = _mm_packus_epi16(src_temp15_8x16b, src_temp15_8x16b);
+
+                    /* store four 8-bit output values  */ /* Second 8 Values */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd + 8), src_temp13_8x16b); /* row = 2*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd + 8), src_temp15_8x16b); /* row = 3*/
+
+                    /* To update pointer */
+                    pi2_src1 += 16;
+                    pi2_src2 += 16;
+                    pu1_dst  += 16;
+
+                } /* inner loop ends here(8-output values in single iteration) */
+
+                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;    /* Pointer update */
+                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;    /* Pointer update */
+                pu1_dst  = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
+
+            }
+        }
+        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
+        {
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 4)
+            {
+                for(col = 0; col < wdx2; col += 8)
+                {
+                    /*load 8 pixel values */
+                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+                    /* row = 2 */
+                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+                    /* row = 3 */
+                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+                    /* (i4_tmp >> shift) */
+                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
+                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
+                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
+
+                    /* store four 8-bit output values  */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+                    /* To update pointer */
+                    pi2_src1 += 8;
+                    pi2_src2 += 8;
+                    pu1_dst  += 8;
+
+                } /* inner loop ends here(8-output values in single iteration) */
+
+                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;    /* Pointer update */
+                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;    /* Pointer update */
+                pu1_dst  = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
+
+            }
+        }
+        else /* 2*wd multiple of 4 case */
+        {
+            WORD32 dst0, dst1, dst2, dst3;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 4)
+            {
+                for(col = 0; col < wdx2; col += 4)
+                {
+                    /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+                    /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+                    /* row = 2 */
+                    src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
+                    src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
+                    /* row = 3 */
+                    src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
+                    src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+                    /* Pack two rows together */
+                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+                    src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
+                    src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+
+                    /* (i4_tmp >> shift) */
+                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+
+                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+                    /* dst row = 1 to 3 */
+                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+                    src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
+
+                    /* store four 8-bit output values  */
+                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+                    dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
+                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
+
+                    /* row = 1 to row = 3 */
+                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
+                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
+
+                    /* To update pointer */
+                    pi2_src1 += 4;
+                    pi2_src2 += 4;
+                    pu1_dst  += 4;
+
+                } /* inner loop ends here(4-output values in single iteration) */
+
+                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;   /* Pointer update */
+                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;   /* Pointer update */
+                pu1_dst  = pu1_dst  - wdx2 + 4 * dst_strd;    /* Pointer update */
+
+            }
+        }
+    }
+    else /* ht multiple of 2 case */
+    {
+        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
+        {
+            __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 16)
+                {
+                    /*load 8 pixel values */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+
+                    /*load 8 pixel values */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
+                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
+                    /* row = 1 */
+                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
+                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
+                    src_temp1_8x16b  = _mm_adds_epi16(src_temp1_8x16b,  src_temp2_8x16b);
+                    src_temp3_8x16b  = _mm_adds_epi16(src_temp3_8x16b,  src_temp4_8x16b);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
+                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
+
+                    /* (i4_tmp >> shift) */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
+                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
+                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+
+                    /* (i4_tmp >> shift) */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
+                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
+
+                    /* store four 8-bit output values  */ /* First 8 Values */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
+                    src_temp9_8x16b  = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
+                    src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
+
+                    /* store four 8-bit output values  */ /* Second 8 Values */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1*/
+
+                    /* To update pointer */
+                    pi2_src1 += 16;
+                    pi2_src2 += 16;
+                    pu1_dst  += 16;
+
+                } /* inner loop ends here(8-output values in single iteration) */
+
+                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
+                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
+                pu1_dst  = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+
+            }
+        }
+        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
+        {
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 8)
+                {
+                    /*load 8 pixel values */
+                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+
+                    /* (i4_tmp >> shift) */
+                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
+
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+
+                    /* store four 8-bit output values  */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+
+                    /* To update pointer */
+                    pi2_src1 += 8;
+                    pi2_src2 += 8;
+                    pu1_dst  += 8;
+
+                } /* inner loop ends here(8-output values in single iteration) */
+
+                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
+                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
+                pu1_dst  = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+
+            }
+        }
+        else /* 2*wd multiple of 4 case */
+        {
+            WORD32 dst0, dst1;
+            /*  outer for loop starts from here */
+            for(row = 0; row < ht; row += 2)
+            {
+                for(col = 0; col < wdx2; col += 4)
+                {
+                    /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+                    /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+                    /* row = 1 */
+                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+
+                    /* Pack two rows together */
+                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+
+                    /* (pi2_src1[col] + pi2_src2[col]) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+
+                    /* (i4_tmp >> shift) */
+                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
+                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+
+                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+                    /* dst row = 1 */
+                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+
+                    /* store four 8-bit output values  */
+                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+                    /* row = 1 */
+                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+                    /* To update pointer */
+                    pi2_src1 += 4;
+                    pi2_src2 += 4;
+                    pu1_dst  += 4;
+                } /* inner loop ends here(4-output values in single iteration) */
+
+                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;   /* Pointer update */
+                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;   /* Pointer update */
+                pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;    /* Pointer update */
+
+            }
+        }
+    }
+}
diff --git a/decoder.arm.mk b/decoder.arm.mk
new file mode 100644
index 0000000..903822d
--- /dev/null
+++ b/decoder.arm.mk
@@ -0,0 +1,88 @@
+libhevcd_inc_dir_arm   +=  $(LOCAL_PATH)/decoder/arm
+libhevcd_inc_dir_arm   +=  $(LOCAL_PATH)/common/arm
+
+libhevcd_srcs_c_arm    +=  decoder/arm/ihevcd_function_selector.c
+libhevcd_srcs_c_arm    +=  decoder/arm/ihevcd_function_selector_noneon.c
+libhevcd_cflags_arm    += -DDISABLE_NEONINTR  -DARM -DARMGCC
+
+LOCAL_ARM_MODE         := arm
+
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+libhevcd_srcs_c_arm    +=  decoder/arm/ihevcd_function_selector_a9q.c
+libhevcd_srcs_c_arm    +=  common/arm/ihevc_intra_ref_substitution_a9q.c
+libhevcd_srcs_c_arm    +=  common/arm/ihevc_intra_pred_filters_neon_intr.c
+libhevcd_srcs_c_arm    +=  common/arm/ihevc_weighted_pred_neon_intr.c
+
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_mem_fns.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_itrans_recon_32x32.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_weighted_pred_bi_default.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_weighted_pred_bi.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_weighted_pred_uni.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_deblk_luma_horz.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_deblk_luma_vert.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_deblk_chroma_vert.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_deblk_chroma_horz.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_sao_band_offset_luma.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_sao_band_offset_chroma.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_sao_edge_offset_class0.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_sao_edge_offset_class0_chroma.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_sao_edge_offset_class1.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_sao_edge_offset_class1_chroma.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_sao_edge_offset_class2.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_sao_edge_offset_class2_chroma.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_sao_edge_offset_class3.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_sao_edge_offset_class3_chroma.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_luma_horz_w16out.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_filters_luma_horz.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_filters_luma_vert.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_chroma_horz.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_chroma_horz_w16out.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_chroma_vert.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_chroma_vert_w16out.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_luma_copy_w16out.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_luma_copy.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_chroma_copy.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_inter_pred_chroma_copy_w16out.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_itrans_recon_4x4_ttype1.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_itrans_recon_4x4.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_itrans_recon_8x8.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_itrans_recon_16x16.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_chroma_planar.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_chroma_dc.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_chroma_horz.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_chroma_ver.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_chroma_mode2.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_chroma_mode_18_34.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_luma_planar.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_luma_horz.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_luma_mode2.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_luma_mode_18_34.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_luma_vert.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_luma_dc.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
+libhevcd_srcs_asm_arm   +=  common/arm/ihevc_padding.s
+
+libhevcd_srcs_asm_arm    +=  decoder/arm/ihevcd_itrans_recon_dc_luma.s
+libhevcd_srcs_asm_arm    +=  decoder/arm/ihevcd_itrans_recon_dc_chroma.s
+libhevcd_srcs_asm_arm    +=  decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s
+libhevcd_srcs_asm_arm    +=  decoder/arm/ihevcd_fmt_conv_420sp_to_420sp.s
+libhevcd_srcs_asm_arm    +=  decoder/arm/ihevcd_fmt_conv_420sp_to_rgba8888.s
+libhevcd_cflags_arm += -DDEFAULT_ARCH=D_ARCH_ARM_A9Q
+else
+libhevcd_cflags_arm += -DDISABLE_NEON -DDEFAULT_ARCH=D_ARCH_ARM_NONEON
+endif
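+
+# With NEON, the hand-written .s kernels above are built and selected via
+# ihevcd_function_selector_a9q.c; without it the C fallbacks are used instead.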
+
+LOCAL_SRC_FILES_arm += $(libhevcd_srcs_c_arm) $(libhevcd_srcs_asm_arm)
+LOCAL_C_INCLUDES_arm += $(libhevcd_inc_dir_arm)
+LOCAL_CFLAGS_arm += $(libhevcd_cflags_arm)
diff --git a/decoder.arm64.mk b/decoder.arm64.mk
new file mode 100644
index 0000000..8714aaf
--- /dev/null
+++ b/decoder.arm64.mk
@@ -0,0 +1,97 @@
+libhevcd_cflags_arm64 += -DARMV8
+libhevcd_cflags_arm64 += -DDISABLE_NEONINTR  -DARM -DARMGCC
+
+libhevcd_inc_dir_arm64   +=  $(LOCAL_PATH)/decoder/arm
+libhevcd_inc_dir_arm64   +=  $(LOCAL_PATH)/common/arm
+libhevcd_inc_dir_arm64   +=  $(LOCAL_PATH)/decoder/arm64
+libhevcd_inc_dir_arm64   +=  $(LOCAL_PATH)/common/arm64
+
+libhevcd_srcs_c_arm64    +=  decoder/arm/ihevcd_function_selector.c
+libhevcd_srcs_c_arm64    +=  decoder/arm/ihevcd_function_selector_noneon.c
+
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+libhevcd_srcs_c_arm64    +=  decoder/arm64/ihevcd_function_selector_av8.c
+
+libhevcd_srcs_c_arm64    +=  common/arm/ihevc_intra_pred_filters_neon_intr.c
+libhevcd_srcs_c_arm64    +=  common/arm/ihevc_weighted_pred_neon_intr.c
+
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_mem_fns.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_itrans_recon_32x32.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_weighted_pred_bi_default.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_weighted_pred_bi.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_weighted_pred_uni.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_deblk_luma_horz.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_deblk_luma_vert.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_deblk_chroma_vert.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_deblk_chroma_horz.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_sao_band_offset_luma.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_sao_band_offset_chroma.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_sao_edge_offset_class0.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_sao_edge_offset_class0_chroma.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_sao_edge_offset_class1.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_sao_edge_offset_class1_chroma.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_sao_edge_offset_class2.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_sao_edge_offset_class2_chroma.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_sao_edge_offset_class3.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_sao_edge_offset_class3_chroma.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_luma_horz_w16out.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_filters_luma_horz.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_filters_luma_vert.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_chroma_horz.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_chroma_vert.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_luma_copy_w16out.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_luma_copy.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_chroma_copy.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_itrans_recon_4x4_ttype1.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_itrans_recon_4x4.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_itrans_recon_8x8.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_itrans_recon_16x16.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_chroma_planar.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_chroma_dc.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_chroma_horz.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_chroma_ver.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_chroma_mode2.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_chroma_mode_18_34.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_luma_planar.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_luma_horz.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_luma_mode2.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_luma_mode_18_34.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_luma_vert.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_luma_dc.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s
+libhevcd_srcs_asm_arm64   +=  common/arm64/ihevc_padding.s
+
+
+
+libhevcd_srcs_asm_arm64    +=  decoder/arm64/ihevcd_itrans_recon_dc_luma.s
+libhevcd_srcs_asm_arm64    +=  decoder/arm64/ihevcd_itrans_recon_dc_chroma.s
+libhevcd_srcs_asm_arm64    +=  decoder/arm64/ihevcd_fmt_conv_420sp_to_420p.s
+libhevcd_srcs_asm_arm64    +=  decoder/arm64/ihevcd_fmt_conv_420sp_to_420sp.s
+libhevcd_srcs_asm_arm64    +=  decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
+
+libhevcd_cflags_arm64 += -DDEFAULT_ARCH=D_ARCH_ARMV8_GENERIC
+else
+libhevcd_cflags_arm64 += -DDISABLE_NEON -DDEFAULT_ARCH=D_ARCH_ARM_NONEON
+endif
+
+
+
+
+LOCAL_SRC_FILES_arm64 += $(libhevcd_srcs_c_arm64) $(libhevcd_srcs_asm_arm64)
+LOCAL_C_INCLUDES_arm64 += $(libhevcd_inc_dir_arm64)
+LOCAL_CFLAGS_arm64 += $(libhevcd_cflags_arm64)
diff --git a/decoder.mips.mk b/decoder.mips.mk
new file mode 100644
index 0000000..2aecc09
--- /dev/null
+++ b/decoder.mips.mk
@@ -0,0 +1,13 @@
+libhevcd_inc_dir_mips   +=  $(LOCAL_PATH)/decoder/mips
+libhevcd_inc_dir_mips   +=  $(LOCAL_PATH)/common/mips
+
+libhevcd_srcs_c_mips    +=  decoder/mips/ihevcd_function_selector.c
+libhevcd_srcs_c_mips    +=  decoder/mips/ihevcd_function_selector_mips_generic.c
+
+
+LOCAL_SRC_FILES_mips += $(libhevcd_srcs_c_mips) $(libhevcd_srcs_asm_mips)
+LOCAL_C_INCLUDES_mips += $(libhevcd_inc_dir_mips)
+LOCAL_CFLAGS_mips += $(libhevcd_cflags_mips)
+
+
+
diff --git a/decoder.mips64.mk b/decoder.mips64.mk
new file mode 100644
index 0000000..5ac515e
--- /dev/null
+++ b/decoder.mips64.mk
@@ -0,0 +1,10 @@
+libhevcd_inc_dir_mips64   +=  $(LOCAL_PATH)/decoder/mips
+libhevcd_inc_dir_mips64   +=  $(LOCAL_PATH)/common/mips
+
+libhevcd_srcs_c_mips64    +=  decoder/mips/ihevcd_function_selector.c
+libhevcd_srcs_c_mips64    +=  decoder/mips/ihevcd_function_selector_mips_generic.c
+
+LOCAL_SRC_FILES_mips64 += $(libhevcd_srcs_c_mips64) $(libhevcd_srcs_asm_mips64)
+LOCAL_C_INCLUDES_mips64 += $(libhevcd_inc_dir_mips64)
+LOCAL_CFLAGS_mips64 += $(libhevcd_cflags_mips64)
+
diff --git a/decoder.mk b/decoder.mk
new file mode 100644
index 0000000..38e3654
--- /dev/null
+++ b/decoder.mk
@@ -0,0 +1,96 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+libhevc_source_dir := $(LOCAL_PATH)
+
+## Arch-common settings
+LOCAL_MODULE := libhevcdec
+#LOCAL_32_BIT_ONLY := true
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+
+
+LOCAL_CFLAGS += -D_LIB -DMULTICORE -fPIC
+#TODO -O3 is throwing up an error in aarch64 while linking
+LOCAL_CFLAGS += -O2 -DHM_10DOT0 -DANDROID
+
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/decoder $(LOCAL_PATH)/common
+
+libhevcd_srcs_c   +=  common/ihevc_quant_tables.c
+libhevcd_srcs_c   +=  common/ihevc_inter_pred_filters.c
+libhevcd_srcs_c   +=  common/ihevc_weighted_pred.c
+libhevcd_srcs_c   +=  common/ihevc_padding.c
+libhevcd_srcs_c   +=  common/ihevc_deblk_edge_filter.c
+libhevcd_srcs_c   +=  common/ihevc_deblk_tables.c
+libhevcd_srcs_c   +=  common/ihevc_cabac_tables.c
+libhevcd_srcs_c   +=  common/ihevc_common_tables.c
+libhevcd_srcs_c   +=  common/ihevc_intra_pred_filters.c
+libhevcd_srcs_c   +=  common/ihevc_chroma_intra_pred_filters.c
+libhevcd_srcs_c   +=  common/ihevc_mem_fns.c
+libhevcd_srcs_c   +=  common/ihevc_sao.c
+libhevcd_srcs_c   +=  common/ihevc_trans_tables.c
+libhevcd_srcs_c   +=  common/ihevc_recon.c
+libhevcd_srcs_c   +=  common/ihevc_itrans.c
+libhevcd_srcs_c   +=  common/ihevc_itrans_recon.c
+libhevcd_srcs_c   +=  common/ihevc_iquant_recon.c
+libhevcd_srcs_c   +=  common/ihevc_iquant_itrans_recon.c
+libhevcd_srcs_c   +=  common/ihevc_itrans_recon_32x32.c
+libhevcd_srcs_c   +=  common/ihevc_itrans_recon_16x16.c
+libhevcd_srcs_c   +=  common/ihevc_itrans_recon_8x8.c
+libhevcd_srcs_c   +=  common/ihevc_chroma_itrans_recon.c
+libhevcd_srcs_c   +=  common/ihevc_chroma_iquant_recon.c
+libhevcd_srcs_c   +=  common/ihevc_chroma_iquant_itrans_recon.c
+libhevcd_srcs_c   +=  common/ihevc_chroma_recon.c
+libhevcd_srcs_c   +=  common/ihevc_chroma_itrans_recon_16x16.c
+libhevcd_srcs_c   +=  common/ihevc_chroma_itrans_recon_8x8.c
+libhevcd_srcs_c   +=  common/ihevc_buf_mgr.c
+libhevcd_srcs_c   +=  common/ihevc_disp_mgr.c
+libhevcd_srcs_c   +=  common/ihevc_dpb_mgr.c
+libhevcd_srcs_c   +=  common/ithread.c
+
+
+
+libhevcd_srcs_c   +=  decoder/ihevcd_version.c
+libhevcd_srcs_c   +=  decoder/ihevcd_trace.c
+libhevcd_srcs_c   +=  decoder/ihevcd_api.c
+libhevcd_srcs_c   +=  decoder/ihevcd_decode.c
+libhevcd_srcs_c   +=  decoder/ihevcd_nal.c
+libhevcd_srcs_c   +=  decoder/ihevcd_bitstream.c
+libhevcd_srcs_c   +=  decoder/ihevcd_parse_headers.c
+libhevcd_srcs_c   +=  decoder/ihevcd_parse_slice_header.c
+libhevcd_srcs_c   +=  decoder/ihevcd_parse_slice.c
+libhevcd_srcs_c   +=  decoder/ihevcd_parse_residual.c
+libhevcd_srcs_c   +=  decoder/ihevcd_cabac.c
+libhevcd_srcs_c   +=  decoder/ihevcd_intra_pred_mode_prediction.c
+libhevcd_srcs_c   +=  decoder/ihevcd_process_slice.c
+libhevcd_srcs_c   +=  decoder/ihevcd_utils.c
+libhevcd_srcs_c   +=  decoder/ihevcd_job_queue.c
+libhevcd_srcs_c   +=  decoder/ihevcd_ref_list.c
+libhevcd_srcs_c   +=  decoder/ihevcd_get_mv.c
+libhevcd_srcs_c   +=  decoder/ihevcd_mv_pred.c
+libhevcd_srcs_c   +=  decoder/ihevcd_mv_merge.c
+libhevcd_srcs_c   +=  decoder/ihevcd_iquant_itrans_recon_ctb.c
+libhevcd_srcs_c   +=  decoder/ihevcd_itrans_recon_dc.c
+libhevcd_srcs_c   +=  decoder/ihevcd_common_tables.c
+libhevcd_srcs_c   +=  decoder/ihevcd_boundary_strength.c
+libhevcd_srcs_c   +=  decoder/ihevcd_deblk.c
+libhevcd_srcs_c   +=  decoder/ihevcd_inter_pred.c
+libhevcd_srcs_c   +=  decoder/ihevcd_sao.c
+libhevcd_srcs_c   +=  decoder/ihevcd_ilf_padding.c
+libhevcd_srcs_c   +=  decoder/ihevcd_debug.c
+libhevcd_srcs_c   +=  decoder/ihevcd_ittiam_logo.c
+libhevcd_srcs_c   +=  decoder/ihevcd_statistics.c
+libhevcd_srcs_c   +=  decoder/ihevcd_fmt_conv.c
+
+LOCAL_SRC_FILES := $(libhevcd_srcs_c) $(libhevcd_srcs_asm)
+
+
+# Load the arch-specific settings
+include $(LOCAL_PATH)/decoder.arm.mk
+include $(LOCAL_PATH)/decoder.arm64.mk
+include $(LOCAL_PATH)/decoder.x86.mk
+include $(LOCAL_PATH)/decoder.x86_64.mk
+include $(LOCAL_PATH)/decoder.mips.mk
+include $(LOCAL_PATH)/decoder.mips64.mk
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/decoder.x86.mk b/decoder.x86.mk
new file mode 100644
index 0000000..287ef3a
--- /dev/null
+++ b/decoder.x86.mk
@@ -0,0 +1,44 @@
+libhevcd_cflags_x86 += -DX86 -DDISABLE_AVX2 -m32 -msse4.2 -mno-avx -DDEFAULT_ARCH=D_ARCH_X86_SSE42
+
+libhevcd_inc_dir_x86   +=  $(LOCAL_PATH)/decoder/x86
+libhevcd_inc_dir_x86   +=  $(LOCAL_PATH)/common/x86
+
+libhevcd_srcs_c_x86    +=  decoder/x86/ihevcd_function_selector.c
+libhevcd_srcs_c_x86    +=  decoder/x86/ihevcd_function_selector_generic.c
+libhevcd_srcs_c_x86    +=  decoder/x86/ihevcd_function_selector_ssse3.c
+libhevcd_srcs_c_x86    +=  decoder/x86/ihevcd_function_selector_sse42.c
+
+
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_inter_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_weighted_pred_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_intra_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_itrans_recon_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_sao_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_deblk_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_padding_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_mem_fns_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  decoder/x86/ihevcd_fmt_conv_ssse3_intr.c
+libhevcd_srcs_c_x86    +=  decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c
+
+
+
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_inter_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_weighted_pred_sse42_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_intra_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_16x16_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_32x32_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86    +=  decoder/x86/ihevcd_it_rec_dc_sse42_intr.c
+
+libhevcd_srcs_c_x86    +=  common/x86/ihevc_tables_x86_intr.c
+
+LOCAL_SRC_FILES_x86 += $(libhevcd_srcs_c_x86) $(libhevcd_srcs_asm_x86)
+LOCAL_C_INCLUDES_x86 += $(libhevcd_inc_dir_x86)
+LOCAL_CFLAGS_x86 += $(libhevcd_cflags_x86)
+
+
+
diff --git a/decoder.x86_64.mk b/decoder.x86_64.mk
new file mode 100644
index 0000000..7c53b87
--- /dev/null
+++ b/decoder.x86_64.mk
@@ -0,0 +1,44 @@
+libhevcd_cflags_x86_64 += -DX86 -DDISABLE_AVX2 -m64 -msse4.2 -mno-avx  -DDEFAULT_ARCH=D_ARCH_X86_SSE42
+
+libhevcd_inc_dir_x86_64   +=  $(LOCAL_PATH)/decoder/x86
+libhevcd_inc_dir_x86_64   +=  $(LOCAL_PATH)/common/x86
+
+libhevcd_srcs_c_x86_64    +=  decoder/x86/ihevcd_function_selector.c
+libhevcd_srcs_c_x86_64    +=  decoder/x86/ihevcd_function_selector_generic.c
+libhevcd_srcs_c_x86_64    +=  decoder/x86/ihevcd_function_selector_ssse3.c
+libhevcd_srcs_c_x86_64    +=  decoder/x86/ihevcd_function_selector_sse42.c
+
+
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_inter_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_weighted_pred_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_intra_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_itrans_recon_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_sao_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_deblk_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_padding_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_mem_fns_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  decoder/x86/ihevcd_fmt_conv_ssse3_intr.c
+libhevcd_srcs_c_x86_64    +=  decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c
+
+
+
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_inter_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_weighted_pred_sse42_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_intra_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_16x16_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_32x32_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86_64    +=  decoder/x86/ihevcd_it_rec_dc_sse42_intr.c
+
+libhevcd_srcs_c_x86_64    +=  common/x86/ihevc_tables_x86_intr.c
+
+LOCAL_SRC_FILES_x86_64 += $(libhevcd_srcs_c_x86_64) $(libhevcd_srcs_asm_x86_64)
+LOCAL_C_INCLUDES_x86_64 += $(libhevcd_inc_dir_x86_64)
+LOCAL_CFLAGS_x86_64 += $(libhevcd_cflags_x86_64)
+
+
+
diff --git a/decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s b/decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s
new file mode 100644
index 0000000..c1d09ed
--- /dev/null
+++ b/decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s
@@ -0,0 +1,203 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/*******************************************************************************
+@* @file
+@*  ihevcd_fmt_conv_420sp_to_420p.s
+@*
+@* @brief
+@*  contains function definitions for format conversions
+@*
+@* @author
+@*  ittiam
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************/
+
+
+
+
+
+
+
+
+.text
+
+
+
+
+
+@/*****************************************************************************
+@*                                                                            *
+@*  Function Name    : ihevcd_fmt_conv_420sp_to_420p_a9q()                    *
+@*                                                                            *
+@*  Description      : This function converts the image from YUV420SP color   *
+@*                     space to YUV420P color space (separate U and V planes).*
+@*                                                                            *
+@*  Arguments        : R0           pu1_src_y                                 *
+@*                     R1           pu1_src_uv                                *
+@*                     R2           pu1_dest_y                                *
+@*                     R3           pu1_dest_u                               *
+@*                     [R13 #40]    pu1_dest_v                               *
+@*                     [R13 #44]    u2_width                                 *
+@*                     [R13 #48]    u2_height                                   *
+@*                     [R13 #52]    u2_stridey                                *
+@*                     [R13 #56]    u2_strideuv                               *
+@*                     [R13 #60]    u2_dest_stridey                           *
+@*                     [R13 #64]    u2_dest_strideuv                          *
+@*                     [R13 #68]    is_u_first                                *
+@*                     [R13 #72]    disable_luma_copy                         *
+@*                                                                            *
+@*  Values Returned  : None                                                   *
+@*                                                                            *
+@*  Register Usage   : R0 - R14                                               *
+@*                                                                            *
+@*  Stack Usage      : 40 Bytes                                               *
+@*                                                                            *
+@*  Interruptibility : Interruptible                                          *
+@*                                                                            *
+@*  Known Limitations                                                         *
+@*       Assumptions: Image Width:     Assumed to be a multiple of 2.        *
+@*                     Image Height:    Assumed to be even.                   *
+@*                                                                            *
+@*  Revision History :                                                        *
+@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
+@*         16 05 2012   Naveen SR     draft                                   *
+@*                                                                            *
+@*****************************************************************************/
+
+.globl ihevcd_fmt_conv_420sp_to_420p_a9q
+
+.type ihevcd_fmt_conv_420sp_to_420p_a9q, %function
+
+ihevcd_fmt_conv_420sp_to_420p_a9q:
+    STMFD       sp!,{r4-r12, lr}
+
+    LDR         r5,[sp,#60]                 @//Load u2_dest_stridey
+@   LDR     r6,[sp,#56]             @//Load u2_strideuv
+    LDR         r7,[sp,#52]                 @//Load u2_stridey
+    LDR         r8,[sp,#44]                 @//Load u2_width
+    LDR         r9,[sp,#48]                 @//Load u2_height
+
+    SUB         r10,r7,r8                   @// Src Y increment
+    SUB         r11,r5,r8                   @// Dst Y increment
+
+    LDR         r5,[sp,#72]                 @//Load disable_luma_copy flag
+    CMP         r5,#0                       @//skip luma if disable_luma_copy is non-zero
+    BNE         uv_copy_start
+
+    @/* Copy Y */
+
+    MOV         r4,r9                       @// Copying height
+y_row_loop:
+    MOV         r6,r8                       @// Copying width
+
+y_col_loop:
+
+    SUB         r6,r6,#16
+    vld1.8      {d0,d1},[r0]!
+    vst1.8      {d0,d1},[r2]!
+    CMP         r6,#16
+    BGE         y_col_loop
+    CMP         r6,#0
+    BEQ         y_col_loop_end
+    @//If width is not a multiple of 16, step back a few bytes so a full 16 bytes can be read
+    @//e.g. if width is 162, the loop above processes 160 pixels, and
+    @//both source and destination then point to the 146th pixel; 16 bytes are read
+    @//and written using VLD1 and VST1
+    RSB         r6,r6,#16
+    SUB         r0,r0,r6
+    SUB         r2,r2,r6
+    vld1.8      {d0,d1}, [r0]!
+    vst1.8      {d0,d1}, [r2]!
+
+y_col_loop_end:
+    ADD         r0, r0, r10
+    ADD         r2, r2, r11
+    SUBS        r4, r4, #1
+    BGT         y_row_loop
+
+
+    @/* Copy UV */
+uv_copy_start:
+
+    LDR         r5,[sp,#64]                 @//Load u2_dest_strideuv
+    LDR         r7,[sp,#56]                 @//Load u2_strideuv
+
+    MOV         r9,r9,LSR #1                @// height/2
+@   MOV     r8,r8,LSR #1            @// Width/2
+
+    SUB         r10,r7,r8                   @// Src UV increment
+    MOV         r11,r8,LSR #1
+    SUB         r11,r5,r11                  @// Dst U and V increment
+
+    LDR         r5,[sp,#40]                 @//Load pu1_dest_v
+
+    LDR         r4,[sp,#68]                 @//Load is_u_first_flag
+    CMP         r4,#0                       @//Swap U and V dest if is_u_first_flag is zero
+    MOVEQ       r4,r5
+    MOVEQ       r5,r3
+    MOVEQ       r3,r4
+
+    MOV         r4,r9                       @// Copying height
+uv_row_loop:
+    MOV         r6,r8                       @// Copying width
+
+uv_col_loop:
+
+    SUB         r6,r6,#16
+
+    PLD         [r1,#128]
+    vld2.8      {d0,d1},[r1]!
+    VST1.8      D0,[r3]!
+    VST1.8      D1,[r5]!
+    CMP         r6,#16
+    BGE         uv_col_loop
+    CMP         r6,#0
+    BEQ         uv_col_loop_end
+    @//If width is not a multiple of 16, step back so a full 16 bytes can still be read
+    @//e.g. if width is 162, the loop above processes 160 bytes; the source then steps
+    @//back to the 146th byte (each destination plane by half as much) and the data is
+    @//read and written using VLD2 and VST1
+    RSB         r6,r6,#16
+    SUB         r1,r1,r6
+    SUB         r3,r3,r6,LSR #1
+    SUB         r5,r5,r6,LSR #1
+    vld2.8      {d0,d1}, [r1]!
+    VST1.8      D0, [r3]!
+    VST1.8      D1, [r5]!
+uv_col_loop_end:
+    ADD         r1, r1, r10
+    ADD         r3, r3, r11
+    ADD         r5, r5, r11
+    SUBS        r4, r4, #1
+    BGT         uv_row_loop
+
+exit:
+    LDMFD       sp!,{r4-r12, pc}
+
+
+
+
+
+
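
For readers following the NEON above, the whole transform fits in a few lines of plain C. The sketch below is for reference only: the name and exact signature are illustrative, mirroring the argument list documented in the function header, and are not part of this change.

    #include <stdint.h>
    #include <string.h>

    /* Reference sketch of ihevcd_fmt_conv_420sp_to_420p_a9q: copy the Y
     * plane, then de-interleave UV (or VU) into separate U and V planes. */
    static void fmt_conv_420sp_to_420p_ref(const uint8_t *pu1_src_y,
                                           const uint8_t *pu1_src_uv,
                                           uint8_t *pu1_dst_y,
                                           uint8_t *pu1_dst_u,
                                           uint8_t *pu1_dst_v,
                                           int wd, int ht,
                                           int src_strd_y, int src_strd_uv,
                                           int dst_strd_y, int dst_strd_uv,
                                           int is_u_first, int disable_luma_copy)
    {
        int row, col;

        /* Luma: stride-aware row copy, skipped when disable_luma_copy != 0 */
        if(0 == disable_luma_copy)
        {
            for(row = 0; row < ht; row++)
                memcpy(pu1_dst_y + row * dst_strd_y,
                       pu1_src_y + row * src_strd_y, wd);
        }

        /* Source order is VU when is_u_first == 0: swap the destinations */
        if(0 == is_u_first)
        {
            uint8_t *tmp = pu1_dst_u;
            pu1_dst_u = pu1_dst_v;
            pu1_dst_v = tmp;
        }

        /* Chroma: wd interleaved bytes per row become wd/2 U and wd/2 V */
        for(row = 0; row < ht / 2; row++)
        {
            const uint8_t *uv = pu1_src_uv + row * src_strd_uv;
            for(col = 0; col < wd / 2; col++)
            {
                pu1_dst_u[row * dst_strd_uv + col] = uv[2 * col];
                pu1_dst_v[row * dst_strd_uv + col] = uv[2 * col + 1];
            }
        }
    }
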
diff --git a/decoder/arm/ihevcd_fmt_conv_420sp_to_420sp.s b/decoder/arm/ihevcd_fmt_conv_420sp_to_420sp.s
new file mode 100644
index 0000000..38886ba
--- /dev/null
+++ b/decoder/arm/ihevcd_fmt_conv_420sp_to_420sp.s
@@ -0,0 +1,198 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/*******************************************************************************
+@* @file
+@*  ihevcd_fmt_conv_420sp_to_420sp.s
+@*
+@* @brief
+@*  contains function definitions for format conversions
+@*
+@* @author
+@*  ittiam
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************/
+    .equ DO1STROUNDING, 0
+
+    @ ARM
+    @
+    @ PRESERVE8
+
+.text
+.p2align 2
+
+
+
+
+
+@/*****************************************************************************
+@*                                                                            *
+@*  Function Name    : ihevcd_fmt_conv_420sp_to_420sp()                       *
+@*                                                                            *
+@*  Description      : This function converts the image from YUV420SP color   *
+@*                     space to 420SP color space (UV interleaved).           *
+@*                                                                            *
+@*  Arguments        : R0           pu1_y                                     *
+@*                     R1           pu1_uv                                    *
+@*                     R2           pu1_dest_y                                *
+@*                     R3           pu1_dest_uv                               *
+@*                     [R13 #40]    u2_width                                  *
+@*                     [R13 #44]    u2_height                                 *
+@*                     [R13 #48]    u2_stridey                                *
+@*                     [R13 #52]    u2_stridechroma                           *
+@*                     [R13 #56]    u2_dest_stridey                           *
+@*                     [R13 #60]    u2_dest_stridechroma                      *
+@*                                                                            *
+@*  Values Returned  : None                                                   *
+@*                                                                            *
+@*  Register Usage   : R0 - R14                                               *
+@*                                                                            *
+@*  Stack Usage      : 40 Bytes                                               *
+@*                                                                            *
+@*  Interruptibility : Interruptible                                          *
+@*                                                                            *
+@*  Known Limitations                                                         *
+@*       Assumptions: Image Width:     Assumed to be a multiple of 2.        *
+@*                     Image Height:    Assumed to be even.                   *
+@*                                                                            *
+@*  Revision History :                                                        *
+@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
+@*         16 05 2012   Naveen SR     draft                                   *
+@*                                                                            *
+@*****************************************************************************/
+
+    .global ihevcd_fmt_conv_420sp_to_420sp_a9q
+.type ihevcd_fmt_conv_420sp_to_420sp_a9q, %function
+ihevcd_fmt_conv_420sp_to_420sp_a9q:
+
+    STMFD       sp!,{r4-r12, lr}
+
+
+    LDR         r5,[sp,#56]                 @//Load u2_dest_stridey
+
+    LDR         r7,[sp,#48]                 @//Load u2_stridey
+    LDR         r8,[sp,#40]                 @//Load u2_width
+    LDR         r9,[sp,#44]                 @//Load u2_height
+
+    SUB         r10,r7,r8                   @// Src Y increment
+    SUB         r11,r5,r8                   @// Dst Y increment
+
+    @/* Copy Y */
+
+    MOV         r4,r9                       @// Copying height
+y_row_loop:
+    MOV         r6,r8                       @// Copying width
+
+y_col_loop:
+    PLD         [r0, #128]
+    SUB         r6,r6,#32
+    VLD1.8      D0,[r0]!
+    VLD1.8      D1,[r0]!
+    VLD1.8      D2,[r0]!
+    VLD1.8      D3,[r0]!
+    VST1.8      D0,[R2]!
+    VST1.8      D1,[R2]!
+    VST1.8      D2,[R2]!
+    VST1.8      D3,[R2]!
+    CMP         r6,#32
+    BGE         y_col_loop
+    CMP         r6,#0
+    BEQ         y_col_loop_end
+    @//If width is not a multiple of 32, step back a few bytes so a full 32 bytes can be read
+    @//e.g. if width is 162, the loop above processes 160 pixels, and
+    @//both source and destination then point to the 130th pixel; 32 bytes are read
+    @//and written using VLD1 and VST1
+    RSB         r6,r6,#32
+    SUB         r0,r0,r6
+    SUB         R2,R2,r6
+    VLD1.8      D0,[r0]!
+    VLD1.8      D1,[r0]!
+    VLD1.8      D2,[r0]!
+    VLD1.8      D3,[r0]!
+    VST1.8      D0,[R2]!
+    VST1.8      D1,[R2]!
+    VST1.8      D2,[R2]!
+    VST1.8      D3,[R2]!
+
+y_col_loop_end:
+    ADD         r0, r0, r10
+    ADD         R2, R2, r11
+    SUBS        r4, r4, #1
+    BGT         y_row_loop
+
+
+
+    @/* Copy UV */
+
+    LDR         r5,[sp,#60]                 @//Load u2_dest_stridechroma
+    LDR         r7,[sp,#52]                 @//Load u2_stridechroma
+
+    MOV         r9,r9,LSR #1                @// height/2
+@   MOV     r8,r8,LSR #1            @// Width/2
+
+    MOV         R2,R3                       @pu1_dest_uv
+
+    SUB         r10,r7,r8                   @// Src UV increment
+    SUB         r11,r5,r8                   @// Dst UV increment
+
+    MOV         r4,r9                       @// Copying height
+uv_row_loop:
+    MOV         r6,r8                       @// Copying width
+
+uv_col_loop:
+
+    PLD         [r1, #128]
+    SUB         r6,r6,#16
+    VLD1.8      D0,[r1]!
+    VLD1.8      D1,[r1]!
+    VST1.8      D0,[R2]!
+    VST1.8      D1,[R2]!
+    CMP         r6,#16
+    BGE         uv_col_loop
+    CMP         r6,#0
+    BEQ         u_col_loop_end
+    @//If width is not a multiple of 16, step back a few bytes so a full 16 bytes can be read
+    @//e.g. if width is 162, the loop above processes 160 pixels, and
+    @//both source and destination then point to the 146th pixel; 16 bytes are read
+    @//and written using VLD1 and VST1
+    RSB         r6,r6,#16
+    SUB         r1,r1,r6
+    SUB         R2,R2,r6
+    VLD1.8      D0, [r1]!
+    VLD1.8      D1, [r1]!
+    VST1.8      D0, [R2]!
+    VST1.8      D1, [R2]!
+
+u_col_loop_end:
+    ADD         r1, r1, r10
+    ADD         R2, R2, r11
+    SUBS        r4, r4, #1
+    BGT         uv_row_loop
+
+exit:
+    LDMFD       sp!,{r4-r12, pc}
+
+
+    .section .note.GNU-stack,"",%progbits
+
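
The "step back a few bytes" tail handling in both copy loops above is the standard NEON idiom for widths that are not a multiple of the vector chunk: redo the last chunk anchored at (wd - CHUNK) so the final vector access never runs past the row. A scalar sketch of the same idea (hypothetical helper, assumes wd >= CHUNK):

    #include <stdint.h>
    #include <string.h>

    /* Overlapped-tail row copy: fixed-size chunks, then one final chunk
     * anchored at the end of the row. The tail overlaps bytes that were
     * already copied, which is harmless for a copy. */
    static void copy_row_overlapped(uint8_t *dst, const uint8_t *src, int wd)
    {
        enum { CHUNK = 32 };   /* the Y loop above moves 32 bytes per pass */
        int col = 0;

        for(; col + CHUNK <= wd; col += CHUNK)
            memcpy(dst + col, src + col, CHUNK);

        if(col < wd)
            memcpy(dst + wd - CHUNK, src + wd - CHUNK, CHUNK);
    }
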
diff --git a/decoder/arm/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm/ihevcd_fmt_conv_420sp_to_rgba8888.s
new file mode 100644
index 0000000..a9a75cb
--- /dev/null
+++ b/decoder/arm/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -0,0 +1,454 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/*******************************************************************************
+@* @file
+@*  ihevcd_fmt_conv_420sp_to_rgba8888.s
+@*
+@* @brief
+@*  contains function definitions for format conversions
+@*
+@* @author
+@*  ittiam
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************/
+    .equ DO1STROUNDING, 0
+
+    @ ARM
+    @
+    @ PRESERVE8
+
+.text
+.p2align 2
+
+
+
+
+@/*****************************************************************************
+@*                                                                            *
+@*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888()                    *
+@*                                                                            *
+@*  Description      : This function converts the image from YUV420SP color   *
+@*                     space to RGBA8888 color space. The function can be     *
+@*                     invoked at the MB level.                               *
+@*                                                                            *
+@*  Arguments        : R0           pu1_src_y                                 *
+@*                     R1           pu1_src_uv                                *
+@*                     R2           pu4_dest_rgba                             *
+@*                     R3           u2_width                                  *
+@*                     [R13 #40]    u2_height                                 *
+@*                     [R13 #44]    u2_stride_y                               *
+@*                     [R13 #48]    u2_stride_uv                              *
+@*                     [R13 #52]    u2_stride_rgba (in pixels)                *
+@*                                                                            *
+@*  Values Returned  : None                                                   *
+@*                                                                            *
+@*  Register Usage   : R0 - R14                                               *
+@*                                                                            *
+@*  Stack Usage      : 40 Bytes                                               *
+@*                                                                            *
+@*  Interruptibility : Interruptible                                          *
+@*                                                                            *
+@*  Known Limitations                                                         *
+@*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
+@*                     greater than or equal to 16                *
+@*                     Image Height:    Assumed to be even.                   *
+@*                                                                            *
+@*  Revision History :                                                        *
+@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
+@*         07 06 2010   Varshita        Draft                                 *
+@*         07 06 2010   Naveen Kr T     Completed                             *
+@*         05 08 2013   Naveen K P      Modified for HEVC                     *
+@*****************************************************************************/
+    .global ihevcd_fmt_conv_420sp_to_rgba8888_a9q
+.type ihevcd_fmt_conv_420sp_to_rgba8888_a9q, %function
+ihevcd_fmt_conv_420sp_to_rgba8888_a9q:
+
+    @// push the registers on the stack
+    STMFD       SP!,{R4-R12,LR}
+
+
+    @//R0 - Y PTR
+    @//R1 - UV PTR
+    @//R2 - RGB PTR
+    @//R3 - PIC WIDTH
+    @//R5 - PIC HT
+    @//R6 - STRIDE Y
+    @//R7 - STRIDE UV
+    @//R9 - STRIDE RGB
+
+    @//ONE ROW PROCESSING AT A TIME
+
+    @//THE FOUR CONSTANTS ARE:
+    @//C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
+
+    @PLD        [R0]
+    @PLD        [R1]
+    @PLD        [R2]
+
+
+    @/* can be loaded from a defined const type */
+    MOVW        R10,#0x3311
+    VMOV.16     D0[0],R10                   @//C1
+
+    MOVW        R10,#0xF379
+    VMOV.16     D0[1],R10                   @//C2
+
+    MOVW        R10,#0xE5F8
+    VMOV.16     D0[2],R10                   @//C3
+
+    MOVW        R10,#0x4092
+    VMOV.16     D0[3],R10                   @//C4
+
+    @//LOAD CONSTANT 128 INTO A CORTEX REGISTER
+    MOV         R10,#128
+    VDUP.8      D1,R10
+
+    @//D0 HAS C1-C2-C3-C4
+    @// load other parameters from stack
+    LDR         R5,[sp,#40]                 @// height
+    LDR         R6,[sp,#44]                 @// luma stride
+    LDR         R7,[sp,#48]                 @// uv stride
+    LDR         R9,[sp,#52]                 @// rgb stride (in pixels)
+
+    @// calculate offsets, offset = stride - width
+    SUB         R10,R6,R3                   @// luma offset
+    SUB         R11,R7,R3                   @// uv offset
+    SUB         R14,R9,R3                   @// rgb offset in pixels
+
+    @// calculate height loop count
+    MOV         R5,R5, LSR #1               @// height_cnt = height / 2
+
+    @// create next row pointers for rgb and luma data
+    ADD         R7,R0,R6                    @// luma_next_row = luma + luma_stride
+    ADD         R8,R2,R9,LSL #2             @// rgb_next_row = rgb + rgb_stride
+
+LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:
+
+    @//LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
+    VLD1.8      {D2,D3},[R1]!               @//LOAD 8 VALUES OF UV
+    @//VLD1.8 {D3},[R2]!            @//LOAD 8 VALUES OF V
+
+    @// calculate width loop count
+    MOV         R6,R3, LSR #4               @// width_cnt = width / 16
+
+    @//COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
+    @//LOAD VALUES OF Y 8-BIT VALUES
+    VLD2.8      {D30,D31},[R0]!             @//D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
+                                            @//D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+    VLD2.8      {D28,D29},[R7]!             @//D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
+                                            @//D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+
+    SUBS        R6,R6,#1
+    BEQ         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP
+
+LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
+    @VMOV.I8 Q1,#128
+    VUZP.8      D2,D3
+
+
+    @//NEED TO SUBTRACT (U-128) AND (V-128)
+    @//(D2-D1),(D3-D1)
+    VSUBL.U8    Q2,D2,D1                    @//(U-128)
+    VSUBL.U8    Q3,D3,D1                    @//(V-128)
+
+    @//LOAD VALUES OF U&V for next row
+    VLD1.8      {D2,D3},[R1]!               @//LOAD 8 VALUES OF U
+    @//VLD1.8 {D3},[R2]!            @//LOAD 8 VALUES OF V
+
+    @PLD        [R0]
+    PLD         [R1]
+
+    @//MULTIPLY Q2,Q3 WITH THE COEFFICIENTS
+    VMULL.S16   Q4,D4,D0[3]                 @//(U-128)*C4 FOR B
+    VMULL.S16   Q5,D5,D0[3]                 @//(U-128)*C4 FOR B
+
+    VMULL.S16   Q10,D6,D0[0]                @//(V-128)*C1 FOR R
+    VMULL.S16   Q11,D7,D0[0]                @//(V-128)*C1 FOR R
+
+    VMULL.S16   Q6,D4,D0[1]                 @//(U-128)*C2 FOR G
+    VMLAL.S16   Q6,D6,D0[2]                 @//Q6 = (U-128)*C2 + (V-128)*C3
+    VMULL.S16   Q7,D5,D0[1]                 @//(U-128)*C2 FOR G
+    VMLAL.S16   Q7,D7,D0[2]                 @//Q7 = (U-128)*C2 + (V-128)*C3
+
+    @//NARROW RIGHT SHIFT BY 13 FOR R&B
+    VQSHRN.S32  D8,Q4,#13                   @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
+    VQSHRN.S32  D9,Q5,#13                   @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
+    @//Q4 - WEIGHT FOR B
+
+    @//NARROW RIGHT SHIFT BY 13 FOR R&B
+    VQSHRN.S32  D10,Q10,#13                 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
+    VQSHRN.S32  D11,Q11,#13                 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
+    @//Q5 - WEIGHT FOR R
+
+    @//NARROW RIGHT SHIFT BY 13 FOR G
+    VQSHRN.S32  D12,Q6,#13                  @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+    VQSHRN.S32  D13,Q7,#13                  @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+    @//Q6 - WEIGHT FOR G
+
+    VADDW.U8    Q7,Q4,D30                   @//Q7 - HAS Y + B
+    VADDW.U8    Q8,Q5,D30                   @//Q8 - HAS Y + R
+    VADDW.U8    Q9,Q6,D30                   @//Q9 - HAS Y + G
+
+    VADDW.U8    Q10,Q4,D31                  @//Q10 - HAS Y + B
+    VADDW.U8    Q11,Q5,D31                  @//Q11 - HAS Y + R
+    VADDW.U8    Q12,Q6,D31                  @//Q12 - HAS Y + G
+
+    VQMOVUN.S16 D14,Q7
+    VQMOVUN.S16 D15,Q9
+    VQMOVUN.S16 D16,Q8
+    VMOV.I8     D17,#0
+
+    VZIP.8      D14,D15
+    VZIP.8      D16,D17
+    VZIP.16     Q7,Q8
+
+
+    VQMOVUN.S16 D20,Q10
+    VQMOVUN.S16 D21,Q12
+    VQMOVUN.S16 D22,Q11
+    VMOV.I8     D23,#0
+
+    VZIP.8      D20,D21
+    VZIP.8      D22,D23
+    VZIP.16     Q10,Q11
+
+    VZIP.32     Q7,Q10
+    VZIP.32     Q8,Q11
+
+    VST1.32     D14,[R2]!
+    VST1.32     D15,[R2]!
+    VST1.32     D20,[R2]!
+    VST1.32     D21,[R2]!
+    VST1.32     D16,[R2]!
+    VST1.32     D17,[R2]!
+    VST1.32     D22,[R2]!
+    VST1.32     D23,[R2]!
+
+    @//D14-D23 TOTALLY HAVE 16 RGBA VALUES (ROW 1)
+    @//NOW COMPUTE ROW 2 USING THE SAME CHROMA WEIGHTS
+    VADDW.U8    Q7,Q4,D28                   @//Q7 - HAS Y + B
+    VADDW.U8    Q8,Q5,D28                   @//Q2 - HAS Y + R
+    VADDW.U8    Q9,Q6,D28                   @//Q3 - HAS Y + G
+
+    VADDW.U8    Q10,Q4,D29                  @//Q10 - HAS Y + B
+    VADDW.U8    Q11,Q5,D29                  @//Q11 - HAS Y + R
+    VADDW.U8    Q12,Q6,D29                  @//Q12 - HAS Y + G
+
+    @//COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
+    @//LOAD VALUES OF Y 8-BIT VALUES
+    VLD2.8      {D30,D31},[R0]!             @//D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
+                                            @//D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+    VLD2.8      {D28,D29},[R7]!             @//D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
+                                            @//D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+
+    PLD         [R0]
+    PLD         [R7]
+
+    VQMOVUN.S16 D14,Q7
+    VQMOVUN.S16 D15,Q9
+    VQMOVUN.S16 D16,Q8
+    VMOV.I8     D17,#0
+
+    VZIP.8      D14,D15
+    VZIP.8      D16,D17
+    VZIP.16     Q7,Q8
+
+
+    VQMOVUN.S16 D20,Q10
+    VQMOVUN.S16 D21,Q12
+    VQMOVUN.S16 D22,Q11
+    VMOV.I8     D23,#0
+
+    VZIP.8      D20,D21
+    VZIP.8      D22,D23
+    VZIP.16     Q10,Q11
+
+    VZIP.32     Q7,Q10
+    VZIP.32     Q8,Q11
+
+    VST1.32     D14,[R8]!
+    VST1.32     D15,[R8]!
+    VST1.32     D20,[R8]!
+    VST1.32     D21,[R8]!
+    VST1.32     D16,[R8]!
+    VST1.32     D17,[R8]!
+    VST1.32     D22,[R8]!
+    VST1.32     D23,[R8]!
+
+    SUBS        R6,R6,#1                    @// width_cnt -= 1
+    BNE         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP
+
+LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
+    @VMOV.I8 Q1,#128
+    VUZP.8      D2,D3
+
+
+    @//NEED TO SUBTRACT (U-128) AND (V-128)
+    @//(D2-D1),(D3-D1)
+    VSUBL.U8    Q2,D2,D1                    @//(U-128)
+    VSUBL.U8    Q3,D3,D1                    @//(V-128)
+
+
+    @//MULTIPLY Q2,Q3 WITH THE COEFFICIENTS
+    VMULL.S16   Q4,D4,D0[3]                 @//(U-128)*C4 FOR B
+    VMULL.S16   Q5,D5,D0[3]                 @//(U-128)*C4 FOR B
+
+    VMULL.S16   Q10,D6,D0[0]                @//(V-128)*C1 FOR R
+    VMULL.S16   Q11,D7,D0[0]                @//(V-128)*C1 FOR R
+
+    VMULL.S16   Q6,D4,D0[1]                 @//(U-128)*C2 FOR G
+    VMLAL.S16   Q6,D6,D0[2]                 @//Q6 = (U-128)*C2 + (V-128)*C3
+    VMULL.S16   Q7,D5,D0[1]                 @//(U-128)*C2 FOR G
+    VMLAL.S16   Q7,D7,D0[2]                 @//Q7 = (U-128)*C2 + (V-128)*C3
+
+    @//NARROW RIGHT SHIFT BY 13 FOR R&B
+    VQSHRN.S32  D8,Q4,#13                   @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
+    VQSHRN.S32  D9,Q5,#13                   @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
+    @//Q4 - WEIGHT FOR B
+
+    @//NARROW RIGHT SHIFT BY 13 FOR R&B
+    VQSHRN.S32  D10,Q10,#13                 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
+    VQSHRN.S32  D11,Q11,#13                 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
+    @//Q5 - WEIGHT FOR R
+
+    @//NARROW RIGHT SHIFT BY 13 FOR G
+    VQSHRN.S32  D12,Q6,#13                  @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+    VQSHRN.S32  D13,Q7,#13                  @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+    @//Q6 - WEIGHT FOR G
+
+    VADDW.U8    Q7,Q4,D30                   @//Q7 - HAS Y + B
+    VADDW.U8    Q8,Q5,D30                   @//Q8 - HAS Y + R
+    VADDW.U8    Q9,Q6,D30                   @//Q9 - HAS Y + G
+
+    VADDW.U8    Q10,Q4,D31                  @//Q10 - HAS Y + B
+    VADDW.U8    Q11,Q5,D31                  @//Q11 - HAS Y + R
+    VADDW.U8    Q12,Q6,D31                  @//Q12 - HAS Y + G
+
+    VQMOVUN.S16 D14,Q7
+    VQMOVUN.S16 D15,Q9
+    VQMOVUN.S16 D16,Q8
+    VMOV.I8     D17,#0
+
+    VZIP.8      D14,D15
+    VZIP.8      D16,D17
+    VZIP.16     Q7,Q8
+
+
+    VQMOVUN.S16 D20,Q10
+    VQMOVUN.S16 D21,Q12
+    VQMOVUN.S16 D22,Q11
+    VMOV.I8     D23,#0
+
+    VZIP.8      D20,D21
+    VZIP.8      D22,D23
+    VZIP.16     Q10,Q11
+
+    VZIP.32     Q7,Q10
+    VZIP.32     Q8,Q11
+
+    VST1.32     D14,[R2]!
+    VST1.32     D15,[R2]!
+    VST1.32     D20,[R2]!
+    VST1.32     D21,[R2]!
+    VST1.32     D16,[R2]!
+    VST1.32     D17,[R2]!
+    VST1.32     D22,[R2]!
+    VST1.32     D23,[R2]!
+
+    @//D14-D23 TOTALLY HAVE 16 RGBA VALUES (ROW 1)
+    @//NOW COMPUTE ROW 2 USING THE SAME CHROMA WEIGHTS
+    VADDW.U8    Q7,Q4,D28                   @//Q7 - HAS Y + B
+    VADDW.U8    Q8,Q5,D28                   @//Q2 - HAS Y + R
+    VADDW.U8    Q9,Q6,D28                   @//Q3 - HAS Y + G
+
+    VADDW.U8    Q10,Q4,D29                  @//Q10 - HAS Y + B
+    VADDW.U8    Q11,Q5,D29                  @//Q11 - HAS Y + R
+    VADDW.U8    Q12,Q6,D29                  @//Q12 - HAS Y + G
+
+
+    VQMOVUN.S16 D14,Q7
+    VQMOVUN.S16 D15,Q9
+    VQMOVUN.S16 D16,Q8
+    VMOV.I8     D17,#0
+
+    VZIP.8      D14,D15
+    VZIP.8      D16,D17
+    VZIP.16     Q7,Q8
+
+
+    VQMOVUN.S16 D20,Q10
+    VQMOVUN.S16 D21,Q12
+    VQMOVUN.S16 D22,Q11
+    VMOV.I8     D23,#0
+
+    VZIP.8      D20,D21
+    VZIP.8      D22,D23
+    VZIP.16     Q10,Q11
+
+    VZIP.32     Q7,Q10
+    VZIP.32     Q8,Q11
+
+    VST1.32     D14,[R8]!
+    VST1.32     D15,[R8]!
+    VST1.32     D20,[R8]!
+    VST1.32     D21,[R8]!
+    VST1.32     D16,[R8]!
+    VST1.32     D17,[R8]!
+    VST1.32     D22,[R8]!
+    VST1.32     D23,[R8]!
+
+    @// Adjust the address pointers
+    ADD         R0,R7,R10                   @// luma = luma_next + offset
+    ADD         R2,R8,R14,LSL #2            @// rgb = rgb_next + offset
+
+    ADD         R7,R0,R3                    @// luma_next = luma + width
+    ADD         R8,R2,R3,LSL #2             @// rgb_next_row = rgb + width
+
+    ADD         R1,R1,R11                   @// adjust u pointer
+    @ADD        R2,R2,R12           @// adjust v pointer
+
+    ADD         R7,R7,R10                   @// luma_next = luma + width + offset (because of register crunch)
+    ADD         R8,R8,R14,LSL #2            @// rgb_next_row = rgb + width + offset
+
+    SUBS        R5,R5,#1                    @// height_cnt -= 1
+
+    BNE         LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP
+
+    @//POP THE REGISTERS
+    LDMFD       SP!,{R4-R12,PC}
+
+
+
+
+    .section .note.GNU-stack,"",%progbits
+
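
The four constants loaded into D0 are BT.601-style coefficients scaled by 2^13: 0x3311 ≈ 1.596, 0xF379 ≈ -0.391, 0xE5F8 ≈ -0.813 and 0x4092 ≈ 2.018, applied without rescaling Y. A per-pixel C sketch of the math the vector code implements (names hypothetical; the asm additionally saturates at each narrowing step, and an arithmetic right shift is assumed for negative weights):

    #include <stdint.h>

    static uint8_t clip_u8(int32_t x)
    {
        return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
    }

    /* weight = (coeff * chroma_offset) >> 13, added to Y and clamped.
     * The asm stores bytes in the order B,G,R,0 -- the alpha byte is forced
     * to zero (VMOV.I8 D17,#0). Little-endian packing assumed below. */
    static uint32_t yuv420sp_pixel_to_rgba(uint8_t y, uint8_t u, uint8_t v)
    {
        int32_t u_off = (int32_t)u - 128;
        int32_t v_off = (int32_t)v - 128;

        int32_t r = y + (((int32_t)(int16_t)0x3311 * v_off) >> 13);
        int32_t g = y + (((int32_t)(int16_t)0xF379 * u_off
                        + (int32_t)(int16_t)0xE5F8 * v_off) >> 13);
        int32_t b = y + (((int32_t)(int16_t)0x4092 * u_off) >> 13);

        return (uint32_t)clip_u8(b)
             | ((uint32_t)clip_u8(g) << 8)
             | ((uint32_t)clip_u8(r) << 16);
    }
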
diff --git a/decoder/arm/ihevcd_function_selector.c b/decoder/arm/ihevcd_function_selector.c
new file mode 100644
index 0000000..66c7d4d
--- /dev/null
+++ b/decoder/arm/ihevcd_function_selector.c
@@ -0,0 +1,135 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_function_selector.c
+*
+* @brief
+*  Contains functions to initialize function pointers used in hevc
+*
+* @author
+*  Naveen
+*
+* @par List of Functions:
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_neonintr(codec_t *ps_codec);
+void ihevcd_init_function_ptr_noneon(codec_t *ps_codec);
+void ihevcd_init_function_ptr_a9q(codec_t *ps_codec);
+void ihevcd_init_function_ptr_av8(codec_t *ps_codec);
+void ihevcd_init_function_ptr(void *pv_codec)
+{
+    codec_t *ps_codec = (codec_t *)pv_codec;
+
+#ifndef ARMV8
+    switch(ps_codec->e_processor_arch)
+    {
+#ifndef DISABLE_NEONINTR
+        case ARCH_ARM_NEONINTR:
+            ihevcd_init_function_ptr_neonintr(ps_codec);
+            break;
+#endif
+        case ARCH_ARM_NONEON:
+            ihevcd_init_function_ptr_noneon(ps_codec);
+            break;
+        default:
+        case ARCH_ARM_A5:
+        case ARCH_ARM_A7:
+        case ARCH_ARM_A9:
+        case ARCH_ARM_A15:
+        case ARCH_ARM_A9Q:
+#ifndef DISABLE_NEON
+            ihevcd_init_function_ptr_a9q(ps_codec);
+#else
+            ihevcd_init_function_ptr_noneon(ps_codec);
+#endif
+            break;
+    }
+    switch(ps_codec->e_processor_soc)
+    {
+
+        case SOC_HISI_37X:
+#ifndef DISABLE_NEON
+            ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr               =  &ihevcd_fmt_conv_420sp_to_420sp_a9q;
+#endif
+            break;
+        case SOC_GENERIC:
+        default:
+            break;
+    }
+#else
+    switch(ps_codec->e_processor_arch)
+    {
+        case ARCH_ARM_NONEON:
+            ihevcd_init_function_ptr_noneon(ps_codec);
+            break;
+        case ARCH_ARMV8_GENERIC:
+        default:
+            ihevcd_init_function_ptr_av8(ps_codec);
+            break;
+    }
+#endif
+}
+
+void ihevcd_init_arch(void *pv_codec)
+{
+    codec_t *ps_codec = (codec_t *)pv_codec;
+#ifdef DEFAULT_ARCH
+#if DEFAULT_ARCH == D_ARCH_ARM_NONEON
+    ps_codec->e_processor_arch = ARCH_ARM_NONEON;
+#elif DEFAULT_ARCH == D_ARCH_ARMV8_GENERIC
+    ps_codec->e_processor_arch = ARCH_ARMV8_GENERIC;
+#elif DEFAULT_ARCH == D_ARCH_ARM_NEONINTR
+    ps_codec->e_processor_arch = ARCH_ARM_NEONINTR;
+#else
+    ps_codec->e_processor_arch = ARCH_ARM_A9Q;
+#endif
+#else
+    ps_codec->e_processor_arch = ARCH_ARM_A9Q;
+#endif
+}
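
Taken together, the expected wiring is: pick the architecture first, then install the matching function-pointer table. A minimal sketch of a call site (the memset and helper are assumptions for illustration; this patch only defines the two init functions):

    #include <string.h>

    /* Assumes the decoder headers included above are on the include path. */
    static void setup_codec_dispatch(codec_t *ps_codec)
    {
        memset(ps_codec, 0, sizeof(*ps_codec));
        ihevcd_init_arch(ps_codec);          /* selects e_processor_arch    */
        ihevcd_init_function_ptr(ps_codec);  /* fills s_func_selector table */
    }
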
diff --git a/decoder/arm/ihevcd_function_selector_a9q.c b/decoder/arm/ihevcd_function_selector_a9q.c
new file mode 100644
index 0000000..ea5b8c0
--- /dev/null
+++ b/decoder/arm/ihevcd_function_selector_a9q.c
@@ -0,0 +1,160 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_function_selector_a9q.c
+*
+* @brief
+*  Contains functions to initialize a9q function pointers used in hevc
+*
+* @author
+*  Naveen
+*
+* @par List of Functions:
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_a9q(codec_t *ps_codec)
+{
+    ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr                      =  &ihevc_deblk_chroma_horz_a9q;
+    ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr                      =  &ihevc_deblk_chroma_vert_a9q;
+    ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr                        =  &ihevc_deblk_luma_vert_a9q;
+    ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr                        =  &ihevc_deblk_luma_horz_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr                 =  &ihevc_inter_pred_chroma_copy_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr          =  &ihevc_inter_pred_chroma_copy_w16out_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr                 =  &ihevc_inter_pred_chroma_horz_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr          =  &ihevc_inter_pred_chroma_horz_w16out_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr                 =  &ihevc_inter_pred_chroma_vert_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr          =  &ihevc_inter_pred_chroma_vert_w16inp_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr   =  &ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr          =  &ihevc_inter_pred_chroma_vert_w16out_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr                   =  &ihevc_inter_pred_luma_horz_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr                   =  &ihevc_inter_pred_luma_vert_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr            =  &ihevc_inter_pred_luma_vert_w16out_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr            =  &ihevc_inter_pred_luma_vert_w16inp_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr                   =  &ihevc_inter_pred_luma_copy_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr            =  &ihevc_inter_pred_luma_copy_w16out_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr            =  &ihevc_inter_pred_luma_horz_w16out_a9q;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr     =  &ihevc_inter_pred_luma_vert_w16inp_w16out_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr     =  &ihevc_intra_pred_chroma_ref_substitution_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr       =  &ihevc_intra_pred_luma_ref_substitution_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr   =  &ihevc_intra_pred_luma_ref_subst_all_avlble;
+    ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr               =  &ihevc_intra_pred_ref_filtering_neonintr;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr                   =  &ihevc_intra_pred_chroma_dc_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr                 =  &ihevc_intra_pred_chroma_horz_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr                =  &ihevc_intra_pred_chroma_mode2_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr           =  &ihevc_intra_pred_chroma_mode_18_34_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr        =  &ihevc_intra_pred_chroma_mode_27_to_33_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr          =  &ihevc_intra_pred_chroma_mode_3_to_9_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr               =  &ihevc_intra_pred_chroma_planar_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr                  =  &ihevc_intra_pred_chroma_ver_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr        =  &ihevc_intra_pred_chroma_mode_11_to_17_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr        =  &ihevc_intra_pred_chroma_mode_19_to_25_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr          =  &ihevc_intra_pred_luma_mode_11_to_17_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr          =  &ihevc_intra_pred_luma_mode_19_to_25_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr                     =  &ihevc_intra_pred_luma_dc_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr                   =  &ihevc_intra_pred_luma_horz_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr                  =  &ihevc_intra_pred_luma_mode2_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr             =  &ihevc_intra_pred_luma_mode_18_34_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr          =  &ihevc_intra_pred_luma_mode_27_to_33_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr            =  &ihevc_intra_pred_luma_mode_3_to_9_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr                 =  &ihevc_intra_pred_luma_planar_a9q;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr                    =  &ihevc_intra_pred_luma_ver_a9q;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr                      =  &ihevc_itrans_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_fptr                             =  &ihevc_itrans_4x4;
+    ps_codec->s_func_selector.ihevc_itrans_8x8_fptr                             =  &ihevc_itrans_8x8;
+    ps_codec->s_func_selector.ihevc_itrans_16x16_fptr                           =  &ihevc_itrans_16x16;
+    ps_codec->s_func_selector.ihevc_itrans_32x32_fptr                           =  &ihevc_itrans_32x32;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr                =  &ihevc_itrans_recon_4x4_ttype1_a9q;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr                       =  &ihevc_itrans_recon_4x4_a9q;
+    ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr                       =  &ihevc_itrans_recon_8x8_a9q;
+    ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr                     =  &ihevc_itrans_recon_16x16_a9q;
+    ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr                     =  &ihevc_itrans_recon_32x32_a9q;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr                =  &ihevc_chroma_itrans_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr                =  &ihevc_chroma_itrans_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr              =  &ihevc_chroma_itrans_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr                       =  &ihevc_recon_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_recon_4x4_fptr                              =  &ihevc_recon_4x4;
+    ps_codec->s_func_selector.ihevc_recon_8x8_fptr                              =  &ihevc_recon_8x8;
+    ps_codec->s_func_selector.ihevc_recon_16x16_fptr                            =  &ihevc_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_32x32_fptr                            =  &ihevc_recon_32x32;
+    ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr                       =  &ihevc_chroma_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr                       =  &ihevc_chroma_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr                     =  &ihevc_chroma_recon_16x16;
+    ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr                           =  &ihevc_memcpy_mul_8_a9q;
+    ps_codec->s_func_selector.ihevc_memcpy_fptr                                 =  &ihevc_memcpy_a9q;
+    ps_codec->s_func_selector.ihevc_memset_mul_8_fptr                           =  &ihevc_memset_mul_8_a9q;
+    ps_codec->s_func_selector.ihevc_memset_fptr                                 =  &ihevc_memset_a9q;
+    ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr                     =  &ihevc_memset_16bit_mul_8_a9q;
+    ps_codec->s_func_selector.ihevc_memset_16bit_fptr                           =  &ihevc_memset_16bit_a9q;
+    ps_codec->s_func_selector.ihevc_pad_left_luma_fptr                          =  &ihevc_pad_left_luma_a9q;
+    ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr                        =  &ihevc_pad_left_chroma_a9q;
+    ps_codec->s_func_selector.ihevc_pad_right_luma_fptr                         =  &ihevc_pad_right_luma_a9q;
+    ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr                       =  &ihevc_pad_right_chroma_a9q;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr                       =  &ihevc_weighted_pred_bi_a9q;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr               =  &ihevc_weighted_pred_bi_default_a9q;
+    ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr                      =  &ihevc_weighted_pred_uni_a9q;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr                =  &ihevc_weighted_pred_chroma_bi_neonintr;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr        =  &ihevc_weighted_pred_chroma_bi_default_neonintr;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr               =  &ihevc_weighted_pred_chroma_uni_neonintr;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr                   =  &ihevc_sao_band_offset_luma_a9q;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr                 =  &ihevc_sao_band_offset_chroma_a9q;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr                 =  &ihevc_sao_edge_offset_class0_a9q;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr          =  &ihevc_sao_edge_offset_class0_chroma_a9q;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr                 =  &ihevc_sao_edge_offset_class1_a9q;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr          =  &ihevc_sao_edge_offset_class1_chroma_a9q;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr                 =  &ihevc_sao_edge_offset_class2_a9q;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr          =  &ihevc_sao_edge_offset_class2_chroma_a9q;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr                 =  &ihevc_sao_edge_offset_class3_a9q;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr          =  &ihevc_sao_edge_offset_class3_chroma_a9q;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr            =  &ihevcd_fmt_conv_420sp_to_rgba8888_a9q;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr              =  &ihevcd_fmt_conv_420sp_to_rgb565;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr               =  &ihevcd_fmt_conv_420sp_to_420sp;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr                =  &ihevcd_fmt_conv_420sp_to_420p_a9q;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr                  =  &ihevcd_itrans_recon_dc_luma_a9q;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr                =  &ihevcd_itrans_recon_dc_chroma_a9q;
+}
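The table above wires every hot path to its Cortex-A9 NEON (`_a9q`) implementation; the file that follows provides the matching plain-C (`noneon`) table. As a minimal sketch of how the two initializers are presumably selected at codec setup time -- the wrapper name, the `e_processor_arch` field, and the enum values are assumptions for illustration, not code from this change:

    /* Hypothetical top-level dispatcher; assumes the ihevcd headers used by
     * the selector files, plus an architecture field on codec_t. */
    typedef enum { ARCH_ARM_NONEON, ARCH_ARM_A9Q } arch_t;  /* assumed enum */

    void ihevcd_init_function_ptr(codec_t *ps_codec)
    {
        switch(ps_codec->e_processor_arch)   /* assumed field */
        {
            case ARCH_ARM_NONEON:            /* no NEON: plain C routines */
                ihevcd_init_function_ptr_noneon(ps_codec);
                break;
            case ARCH_ARM_A9Q:               /* Cortex-A9 class: NEON asm */
            default:
                ihevcd_init_function_ptr_a9q(ps_codec);
                break;
        }
    }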
diff --git a/decoder/arm/ihevcd_function_selector_noneon.c b/decoder/arm/ihevcd_function_selector_noneon.c
new file mode 100644
index 0000000..b5c9f6a
--- /dev/null
+++ b/decoder/arm/ihevcd_function_selector_noneon.c
@@ -0,0 +1,160 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_function_selector_noneon.c
+*
+* @brief
+*  Contains functions to initialize the non-NEON (plain C) function pointers used in the HEVC decoder
+*
+* @author
+*  Naveen
+*
+* @par List of Functions:
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_noneon(codec_t *ps_codec)
+{
+    ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr                      =  &ihevc_deblk_chroma_horz;
+    ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr                      =  &ihevc_deblk_chroma_vert;
+    ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr                        =  &ihevc_deblk_luma_vert;
+    ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr                        =  &ihevc_deblk_luma_horz;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr                 =  &ihevc_inter_pred_chroma_copy;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr          =  &ihevc_inter_pred_chroma_copy_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr                 =  &ihevc_inter_pred_chroma_horz;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr          =  &ihevc_inter_pred_chroma_horz_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr                 =  &ihevc_inter_pred_chroma_vert;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr          =  &ihevc_inter_pred_chroma_vert_w16inp;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr   =  &ihevc_inter_pred_chroma_vert_w16inp_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr          =  &ihevc_inter_pred_chroma_vert_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr                   =  &ihevc_inter_pred_luma_horz;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr                   =  &ihevc_inter_pred_luma_vert;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr            =  &ihevc_inter_pred_luma_vert_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr            =  &ihevc_inter_pred_luma_vert_w16inp;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr                   =  &ihevc_inter_pred_luma_copy;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr            =  &ihevc_inter_pred_luma_copy_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr            =  &ihevc_inter_pred_luma_horz_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr     =  &ihevc_inter_pred_luma_vert_w16inp_w16out;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr     =  &ihevc_intra_pred_chroma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr       =  &ihevc_intra_pred_luma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr   =  &ihevc_intra_pred_luma_ref_subst_all_avlble;
+    ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr               =  &ihevc_intra_pred_ref_filtering;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr                   =  &ihevc_intra_pred_chroma_dc;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr                 =  &ihevc_intra_pred_chroma_horz;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr                =  &ihevc_intra_pred_chroma_mode2;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr           =  &ihevc_intra_pred_chroma_mode_18_34;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr        =  &ihevc_intra_pred_chroma_mode_27_to_33;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr          =  &ihevc_intra_pred_chroma_mode_3_to_9;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr               =  &ihevc_intra_pred_chroma_planar;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr                  =  &ihevc_intra_pred_chroma_ver;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr        =  &ihevc_intra_pred_chroma_mode_11_to_17;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr        =  &ihevc_intra_pred_chroma_mode_19_to_25;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr          =  &ihevc_intra_pred_luma_mode_11_to_17;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr          =  &ihevc_intra_pred_luma_mode_19_to_25;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr                     =  &ihevc_intra_pred_luma_dc;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr                   =  &ihevc_intra_pred_luma_horz;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr                  =  &ihevc_intra_pred_luma_mode2;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr             =  &ihevc_intra_pred_luma_mode_18_34;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr          =  &ihevc_intra_pred_luma_mode_27_to_33;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr            =  &ihevc_intra_pred_luma_mode_3_to_9;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr                 =  &ihevc_intra_pred_luma_planar;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr                    =  &ihevc_intra_pred_luma_ver;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr                      =  &ihevc_itrans_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_fptr                             =  &ihevc_itrans_4x4;
+    ps_codec->s_func_selector.ihevc_itrans_8x8_fptr                             =  &ihevc_itrans_8x8;
+    ps_codec->s_func_selector.ihevc_itrans_16x16_fptr                           =  &ihevc_itrans_16x16;
+    ps_codec->s_func_selector.ihevc_itrans_32x32_fptr                           =  &ihevc_itrans_32x32;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr                =  &ihevc_itrans_recon_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr                       =  &ihevc_itrans_recon_4x4;
+    ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr                       =  &ihevc_itrans_recon_8x8;
+    ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr                     =  &ihevc_itrans_recon_16x16;
+    ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr                     =  &ihevc_itrans_recon_32x32;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr                =  &ihevc_chroma_itrans_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr                =  &ihevc_chroma_itrans_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr              =  &ihevc_chroma_itrans_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr                       =  &ihevc_recon_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_recon_4x4_fptr                              =  &ihevc_recon_4x4;
+    ps_codec->s_func_selector.ihevc_recon_8x8_fptr                              =  &ihevc_recon_8x8;
+    ps_codec->s_func_selector.ihevc_recon_16x16_fptr                            =  &ihevc_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_32x32_fptr                            =  &ihevc_recon_32x32;
+    ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr                       =  &ihevc_chroma_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr                       =  &ihevc_chroma_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr                     =  &ihevc_chroma_recon_16x16;
+    ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr                           =  &ihevc_memcpy_mul_8;
+    ps_codec->s_func_selector.ihevc_memcpy_fptr                                 =  &ihevc_memcpy;
+    ps_codec->s_func_selector.ihevc_memset_mul_8_fptr                           =  &ihevc_memset_mul_8;
+    ps_codec->s_func_selector.ihevc_memset_fptr                                 =  &ihevc_memset;
+    ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr                     =  &ihevc_memset_16bit_mul_8;
+    ps_codec->s_func_selector.ihevc_memset_16bit_fptr                           =  &ihevc_memset_16bit;
+    ps_codec->s_func_selector.ihevc_pad_left_luma_fptr                          =  &ihevc_pad_left_luma;
+    ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr                        =  &ihevc_pad_left_chroma;
+    ps_codec->s_func_selector.ihevc_pad_right_luma_fptr                         =  &ihevc_pad_right_luma;
+    ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr                       =  &ihevc_pad_right_chroma;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr                       =  &ihevc_weighted_pred_bi;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr               =  &ihevc_weighted_pred_bi_default;
+    ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr                      =  &ihevc_weighted_pred_uni;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr                =  &ihevc_weighted_pred_chroma_bi;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr        =  &ihevc_weighted_pred_chroma_bi_default;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr               =  &ihevc_weighted_pred_chroma_uni;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr                   =  &ihevc_sao_band_offset_luma;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr                 =  &ihevc_sao_band_offset_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr                 =  &ihevc_sao_edge_offset_class0;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr          =  &ihevc_sao_edge_offset_class0_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr                 =  &ihevc_sao_edge_offset_class1;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr          =  &ihevc_sao_edge_offset_class1_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr                 =  &ihevc_sao_edge_offset_class2;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr          =  &ihevc_sao_edge_offset_class2_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr                 =  &ihevc_sao_edge_offset_class3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr          =  &ihevc_sao_edge_offset_class3_chroma;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr            =  &ihevcd_fmt_conv_420sp_to_rgba8888;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr              =  &ihevcd_fmt_conv_420sp_to_rgb565;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr               =  &ihevcd_fmt_conv_420sp_to_420sp;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr                =  &ihevcd_fmt_conv_420sp_to_420p;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr                  =  &ihevcd_itrans_recon_dc_luma;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr                =  &ihevcd_itrans_recon_dc_chroma;
+}
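Once a table is installed, decode paths call through it, so a single call site runs either the C or the NEON routine. A sketch of such a call site follows; the pointer's argument list mirrors the prototype comment in ihevcd_itrans_recon_dc_luma.s below, the `UWORD8`/`WORD32`/`WORD16` aliases are the usual ihevc_typedefs.h names, and the surrounding helper is illustrative only:

    /* Illustrative call through the selector (not actual decoder code). */
    static void recon_dc_block(codec_t *ps_codec,
                               UWORD8 *pu1_pred, UWORD8 *pu1_dst,
                               WORD32 pred_strd, WORD32 dst_strd,
                               WORD32 log2_trans_size, WORD16 i2_dc_coeff)
    {
        ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr(
            pu1_pred, pu1_dst, pred_strd, dst_strd,
            log2_trans_size, i2_dc_coeff);
    }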
diff --git a/decoder/arm/ihevcd_itrans_recon_dc_chroma.s b/decoder/arm/ihevcd_itrans_recon_dc_chroma.s
new file mode 100644
index 0000000..6732ce0
--- /dev/null
+++ b/decoder/arm/ihevcd_itrans_recon_dc_chroma.s
@@ -0,0 +1,193 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@/*******************************************************************************
+@* @file
+@*  ihevcd_itrans_recon_dc_chroma.s
+@*
+@* @brief
+@*  contains function definitions for inverse transform and reconstruction for the dc-only case
+@*
+@* @author
+@*  ittiam
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************/
+
+.text
+
+
+.globl ihevcd_itrans_recon_dc_chroma_a9q
+
+.type ihevcd_itrans_recon_dc_chroma_a9q, %function
+
+ihevcd_itrans_recon_dc_chroma_a9q:
+
+@void ihevcd_itrans_recon_dc_chroma(uword8 *pu1_pred,
+@                            uword8 *pu1_dst,
+@                            word32 pred_strd,
+@                            word32 dst_strd,
+@                            word32 log2_trans_size,
+@                            word16 i2_coeff_value)
+
+@r0:pu1_pred
+@r1:pu1_dest
+@r2:pred_strd
+@r3:dst_strd
+
+
+
+    push        {r0-r11,lr}
+    ldr         r4,[sp,#0x34]               @ loads log2_trans_size
+    ldr         r5,[sp,#0x38]               @ loads i2_coeff_value
+
+    mov         r10,#1
+    lsl         r4,r10,r4                   @ trans_size = (1 << log2_trans_size)
+    mov         r6,#64                      @ 1 << (shift1 - 1)
+    mov         r7,#2048                    @ 1 << (shift2 - 1)
+
+    add         r8,r6,r5,lsl #6
+    ssat        r8,#16,r8,asr #7
+    add         r5,r7,r8,lsl #6
+    ssat        r6,#16,r5,asr #12
+    mov         r9,r4
+    mov         r8,r4
+
+    @ r6 has the dc_value
+    @ r4 has the trans_size value
+    @ r8 has the row value
+    @ r9 has the col value
+    vdup.s16    q0,r6
+    cmp         r4,#4
+    beq         row_loop_4chroma
+
+
+row_loop_chroma:
+    mov         r9,r4
+
+
+col_loop_chroma:
+
+    mov         r7,r0
+    vld2.8      {d2,d3},[r7],r2
+    vld2.8      {d4,d5},[r7],r2
+    vld2.8      {d6,d7},[r7],r2
+    vld2.8      {d8,d9},[r7],r2
+
+    vld2.8      {d10,d11},[r7],r2
+    vld2.8      {d12,d13},[r7],r2
+    vld2.8      {d14,d15},[r7],r2
+    vld2.8      {d16,d17},[r7]
+
+    add         r0,r0,#16
+
+
+    vaddw.u8    q15,q0,d2
+    vaddw.u8    q14,q0,d4
+    vaddw.u8    q13,q0,d6
+    vaddw.u8    q12,q0,d8
+    vaddw.u8    q11,q0,d10
+    vaddw.u8    q10,q0,d12
+    vaddw.u8    q9,q0,d14
+
+
+    mov         r11,r1
+    vqmovun.s16 d2,q15
+    vqmovun.s16 d4,q14
+    vqmovun.s16 d6,q13
+    vqmovun.s16 d8,q12
+
+    vaddw.u8    q15,q0,d16
+
+    vqmovun.s16 d10,q11
+    vqmovun.s16 d12,q10
+    vqmovun.s16 d14,q9
+    vqmovun.s16 d16,q15
+
+    vst2.8      {d2,d3},[r11],r3
+    vst2.8      {d4,d5},[r11],r3
+    vst2.8      {d6,d7},[r11],r3
+    vst2.8      {d8,d9},[r11],r3
+
+    vst2.8      {d10,d11},[r11],r3
+    vst2.8      {d12,d13},[r11],r3
+    vst2.8      {d14,d15},[r11],r3
+    vst2.8      {d16,d17},[r11]
+
+    add         r1,r1,#16
+
+    subs        r9,r9,#8
+    bgt         col_loop_chroma
+
+    subs        r8,r8,#8
+
+    add         r0,r0,r2,lsl #3
+    add         r1,r1,r3,lsl #3
+    sub         r0,r0,r4,lsl #1
+    sub         r1,r1,r4,lsl #1
+    bgt         row_loop_chroma
+    b           end_loops_chroma
+
+
+row_loop_4chroma:
+    mov         r9,r10
+
+
+col_loop_4chroma:
+
+
+    vld2.8      {d2,d3},[r0],r2
+    vld2.8      {d4,d5},[r0],r2
+    vld2.8      {d6,d7},[r0],r2
+    vld2.8      {d8,d9},[r0]
+
+
+
+
+    vaddw.u8    q15,q0,d2
+    vaddw.u8    q14,q0,d4
+    vaddw.u8    q13,q0,d6
+    vaddw.u8    q12,q0,d8
+
+
+
+    vqmovun.s16 d2,q15
+    vqmovun.s16 d4,q14
+    vqmovun.s16 d6,q13
+    vqmovun.s16 d8,q12
+
+
+    vzip.8      d2,d3
+    vzip.8      d4,d5
+    vzip.8      d6,d7
+    vzip.8      d8,d9
+
+    vst1.u32    {d2},[r1],r3
+    vst1.u32    {d4},[r1],r3
+    vst1.u32    {d6},[r1],r3
+    vst1.u32    {d8},[r1]
+
+end_loops_chroma:
+    pop         {r0-r11,pc}
+
+
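For reference, a C sketch of what the routine above computes, under this reading of the assembly: the dc value is derived with the same two-stage rounding and 16-bit saturation (shift1 = 7, shift2 = 12), and is then added to one component of the interleaved chroma block while the other component is copied through unchanged (the vld2/vst2 pairing only modifies the de-interleaved first component). The helper names are illustrative:

    #include <stdint.h>

    static int16_t sat16(int32_t x)          /* like ssat #16 */
    {
        return (int16_t)(x > 32767 ? 32767 : (x < -32768 ? -32768 : x));
    }

    static uint8_t clip_u8(int32_t x)        /* like vqmovun.s16 */
    {
        return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
    }

    void itrans_recon_dc_chroma_ref(uint8_t *pu1_pred, uint8_t *pu1_dst,
                                    int32_t pred_strd, int32_t dst_strd,
                                    int32_t log2_trans_size,
                                    int16_t i2_coeff_value)
    {
        int32_t trans_size = 1 << log2_trans_size;
        /* Two-stage rounding: shift1 = 7 (round 64), shift2 = 12 (round 2048). */
        int32_t dc = sat16((i2_coeff_value * 64 + 64) >> 7);
        dc         = sat16((dc * 64 + 2048) >> 12);

        for(int32_t row = 0; row < trans_size; row++)
        {
            for(int32_t col = 0; col < trans_size; col++)
            {
                /* dc is added to one interleaved chroma component only;
                 * the other component is stored back unchanged. */
                pu1_dst[row * dst_strd + 2 * col] =
                    clip_u8(pu1_pred[row * pred_strd + 2 * col] + dc);
                pu1_dst[row * dst_strd + 2 * col + 1] =
                    pu1_pred[row * pred_strd + 2 * col + 1];
            }
        }
    }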
diff --git a/decoder/arm/ihevcd_itrans_recon_dc_luma.s b/decoder/arm/ihevcd_itrans_recon_dc_luma.s
new file mode 100644
index 0000000..8aee84c
--- /dev/null
+++ b/decoder/arm/ihevcd_itrans_recon_dc_luma.s
@@ -0,0 +1,193 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@/*******************************************************************************
+@* @file
+@*  ihevcd_itrans_recon_dc_luma.s
+@*
+@* @brief
+@*  contains function definitions for inverse transform and reconstruction for the dc-only case
+@*
+@* @author
+@*  ittiam
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************/
+
+.text
+
+
+
+.globl ihevcd_itrans_recon_dc_luma_a9q
+
+.type ihevcd_itrans_recon_dc_luma_a9q, %function
+
+ihevcd_itrans_recon_dc_luma_a9q:
+
+@void ihevcd_itrans_recon_dc_luma(uword8 *pu1_pred,
+@                            uword8 *pu1_dst,
+@                            word32 pred_strd,
+@                            word32 dst_strd,
+@                            word32 log2_trans_size,
+@                            word16 i2_coeff_value)
+
+@r0:pu1_pred
+@r1:pu1_dest
+@r2:pred_strd
+@r3:dst_strd
+
+
+
+    push        {r0-r11,lr}
+    ldr         r4,[sp,#0x34]               @ loads log2_trans_size
+    ldr         r5,[sp,#0x38]               @ loads i2_coeff_value
+
+    mov         r10,#1
+    lsl         r4,r10,r4                   @ trans_size = (1 << log2_trans_size)
+    mov         r6,#64                      @ 1 << (shift1 - 1)
+    mov         r7,#2048                    @ 1 << (shift2 - 1)
+
+    add         r8,r6,r5,lsl #6
+    ssat        r8,#16,r8,asr #7
+    add         r5,r7,r8,lsl #6
+    ssat        r6,#16,r5,asr #12
+    mov         r9,r4
+    mov         r8,r4
+
+    @ r6 has the dc_value
+    @ r4 has the trans_size value
+    @ r8 has the row value
+    @ r9 has the col value
+    vdup.s16    q0,r6
+    cmp         r4,#4
+    beq         row_loop_4
+
+
+row_loop:
+    mov         r9,r4
+
+
+col_loop:
+
+    mov         r7,r0
+    vld1.8      d2,[r7],r2
+    vld1.8      d3,[r7],r2
+    vld1.8      d4,[r7],r2
+    vld1.8      d5,[r7],r2
+
+    vld1.8      d6,[r7],r2
+    vld1.8      d7,[r7],r2
+    vld1.8      d8,[r7],r2
+    vld1.8      d9,[r7]
+
+    add         r0,r0,#8
+
+
+    vaddw.u8    q15,q0,d2
+    vaddw.u8    q14,q0,d3
+    vaddw.u8    q13,q0,d4
+    vaddw.u8    q12,q0,d5
+    vaddw.u8    q11,q0,d6
+    vaddw.u8    q10,q0,d7
+    vaddw.u8    q9,q0,d8
+    vaddw.u8    q8,q0,d9
+
+    mov         r11,r1
+    vqmovun.s16 d2,q15
+    vqmovun.s16 d3,q14
+    vqmovun.s16 d4,q13
+    vqmovun.s16 d5,q12
+    vqmovun.s16 d6,q11
+    vqmovun.s16 d7,q10
+    vqmovun.s16 d8,q9
+    vqmovun.s16 d9,q8
+
+
+    vst1.u32    {d2},[r11],r3
+    vst1.u32    {d3},[r11],r3
+    vst1.u32    {d4},[r11],r3
+    vst1.u32    {d5},[r11],r3
+    vst1.u32    {d6},[r11],r3
+    vst1.u32    {d7},[r11],r3
+    vst1.u32    {d8},[r11],r3
+    vst1.u32    {d9},[r11]
+
+    add         r1,r1,#8
+
+    subs        r9,r9,#8
+    bgt         col_loop
+
+    subs        r8,r8,#8
+
+    add         r0,r0,r2,lsl #3
+    add         r1,r1,r3,lsl #3
+    sub         r0,r0,r4
+    sub         r1,r1,r4
+    bgt         row_loop
+    b           end_loops
+
+
+row_loop_4:
+    mov         r9,r10
+
+
+col_loop_4:
+
+
+    vld1.8      d2,[r0],r2
+    vld1.8      d3,[r0],r2
+    vld1.8      d4,[r0],r2
+    vld1.8      d5,[r0]
+
+
+
+
+    vaddw.u8    q15,q0,d2
+    vaddw.u8    q14,q0,d3
+    vaddw.u8    q13,q0,d4
+    vaddw.u8    q12,q0,d5
+
+
+
+    vqmovun.s16 d2,q15
+    vqmovun.s16 d3,q14
+    vqmovun.s16 d4,q13
+    vqmovun.s16 d5,q12
+
+
+
+    vst1.u32    {d2[0]},[r1],r3
+    vst1.u32    {d3[0]},[r1],r3
+    vst1.u32    {d4[0]},[r1],r3
+    vst1.u32    {d5[0]},[r1]
+
+end_loops:
+    pop         {r0-r11,pc}
+
+
+
+
+
+
+
+
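The luma routine is the same idea without chroma interleaving: every sample of the block gets the dc value. A sketch reusing the sat16/clip_u8 helpers from the chroma sketch above:

    void itrans_recon_dc_luma_ref(uint8_t *pu1_pred, uint8_t *pu1_dst,
                                  int32_t pred_strd, int32_t dst_strd,
                                  int32_t log2_trans_size,
                                  int16_t i2_coeff_value)
    {
        int32_t trans_size = 1 << log2_trans_size;
        int32_t dc = sat16((i2_coeff_value * 64 + 64) >> 7);
        dc         = sat16((dc * 64 + 2048) >> 12);

        for(int32_t row = 0; row < trans_size; row++)
            for(int32_t col = 0; col < trans_size; col++)
                pu1_dst[row * dst_strd + col] =
                    clip_u8(pu1_pred[row * pred_strd + col] + dc);
    }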
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_420p.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_420p.s
new file mode 100644
index 0000000..4cc6085
--- /dev/null
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_420p.s
@@ -0,0 +1,209 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//*  ihevcd_fmt_conv_420sp_to_420p.s
+//*
+//* //brief
+//*  contains function definitions for format conversions
+//*
+//* //author
+//*  ittiam
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************/
+
+.text
+
+.include "ihevc_neon_macros.s"
+
+
+
+
+///*****************************************************************************
+//*                                                                            *
+//*  Function Name    : ihevcd_fmt_conv_420sp_to_420p()                        *
+//*                                                                            *
+//*  Description      : This function converts the image from YUV420SP color   *
+//*                     space (UV interleaved) to YUV420P (planar) color space.*
+//*                                                                            *
+//*  Arguments        : x0           pu1_src_y                                 *
+//*                     x1           pu1_src_uv                                *
+//*                     x2           pu1_dest_y                                *
+//*                     x3           pu1_dest_u                                *
+//*                     x4           pu1_dest_v                                *
+//*                     x5           u2_width                                  *
+//*                     x6           u2_height                                 *
+//*                     x7           u2_stridey                                *
+//*                     [sp, #80]    u2_strideuv                               *
+//*                     [sp, #88]    u2_dest_stridey                           *
+//*                     [sp, #96]    u2_dest_strideuv                          *
+//*                     [sp, #104]   is_u_first                                *
+//*                     [sp, #112]   disable_luma_copy                         *
+//*                                                                            *
+//*  Values Returned  : None                                                   *
+//*                                                                            *
+//*  Register Usage   : x0 - x14                                               *
+//*                                                                            *
+//*  Stack Usage      : 40 Bytes                                               *
+//*                                                                            *
+//*  Interruptibility : Interruptible                                          *
+//*                                                                            *
+//*  Known Limitations                                                         *
+//*       Assumptions: Image Width:     Assumed to be multiple of 2 and       *
+//*                     Image Height:    Assumed to be even.                   *
+//*                                                                            *
+//*  Revision History :                                                        *
+//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
+//*         16 05 2012   Naveen SR     draft                                     *
+//*                                                                            *
+//*****************************************************************************/
+
+.globl ihevcd_fmt_conv_420sp_to_420p_av8
+
+.type ihevcd_fmt_conv_420sp_to_420p_av8, %function
+
+ihevcd_fmt_conv_420sp_to_420p_av8:
+    // STMFD sp!,{x4-x12, x14}
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    mov         x15, x4
+    mov         x8, x5                      ////Load u2_width
+    mov         x9, x6                      ////Load u2_height
+
+    LDR         w5, [sp,#88]                ////Load u2_dest_stridey
+    sxtw        x5,w5
+//    LDR        x6,[sp,#80]                @//Load u2_strideuv
+
+    SUB         x10,x7,x8                   //// Src Y increment
+    SUB         x11,x5,x8                   //// Dst Y increment
+
+    LDR         w5, [sp,#112]               ////Load disable_luma_copy flag
+    sxtw        x5,w5
+    CMP         x5,#0                       ////skip luma if disable_luma_copy is non-zero
+    BNE         uv_copy_start
+
+    ///* Copy Y */
+
+    MOV         x4,x9                       //// Copying height
+y_row_loop:
+    MOV         x6,x8                       //// Copying width
+
+y_col_loop:
+
+    SUB         x6,x6,#16
+    ld1         {v0.8b, v1.8b},[x0],#16
+    st1         {v0.8b, v1.8b},[x2],#16
+    CMP         x6,#16
+    BGE         y_col_loop
+    CMP         x6,#0
+    BEQ         y_col_loop_end
+    ////If width is not a multiple of 16, go back by a few bytes so that 16 bytes can be read.
+    ////Ex: if width is 162, the loop above processes 160 pixels, and
+    ////both source and destination are stepped back to the 146th pixel so that the
+    ////last 16 bytes can be read and written using LD1 and ST1
+    sub         x20,x6,#16
+    neg         x6, x20
+    SUB         x0,x0,x6
+    SUB         x2,x2,x6
+    ld1         {v0.8b, v1.8b}, [x0],#16
+    st1         {v0.8b, v1.8b}, [x2],#16
+
+y_col_loop_end:
+    ADD         x0, x0, x10
+    ADD         x2, x2, x11
+    SUBS        x4, x4, #1
+    BGT         y_row_loop
+
+
+    ///* Copy UV */
+uv_copy_start:
+
+    LDR         w5, [sp,#96]                ////Load u2_dest_strideuv
+    sxtw        x5,w5
+    LDR         w7, [sp,#80]                ////Load u2_strideuv
+    sxtw        x7,w7
+
+    LSR         x9, x9, #1                  //// height/2
+//    MOV     x8,x8,LSR #1            @// Width/2
+
+    SUB         x10,x7,x8                   //// Src UV increment
+    LSR         x11, x8, #1
+    SUB         x11,x5,x11                  //// Dst U and V increment
+
+    mov         x5, x15                     ////Load pu1_dest_v
+
+    LDR         w4, [sp,#104]               ////Load is_u_first_flag
+    sxtw        x4,w4
+    CMP         x4,#0                       ////Swap U and V dest if is_u_first_flag is zero
+    csel        x4, x5, x4,EQ
+    csel        x5, x3, x5,EQ
+    csel        x3, x4, x3,EQ
+
+    MOV         x4,x9                       //// Copying height
+uv_row_loop:
+    MOV         x6,x8                       //// Copying width
+
+uv_col_loop:
+
+    SUB         x6,x6,#16
+
+    prfm        PLDL1KEEP,[x1,#128]
+    ld2         {v0.8b, v1.8b},[x1],#16
+    ST1         {v0.8b},[x3],#8
+    ST1         {v1.8b},[x5],#8
+    CMP         x6,#16
+    BGE         uv_col_loop
+    CMP         x6,#0
+    BEQ         uv_col_loop_end
+    ////If width is not a multiple of 16, go back by a few bytes so that 16 bytes can be read.
+    ////Ex: if width is 162, the loop above processes 160 pixels; the source is then
+    ////stepped back (and each destination by half as much) so that the last 16 bytes
+    ////can be read with LD2 and written with ST1
+    sub         x20,x6,#16
+    neg         x6, x20
+    SUB         x1,x1,x6
+    SUB         x3,x3,x6,LSR #1
+    SUB         x5,x5,x6,LSR #1
+    ld2         {v0.8b, v1.8b}, [x1],#16
+    ST1         {v0.8b},[x3],#8
+    ST1         {v1.8b},[x5],#8
+uv_col_loop_end:
+    ADD         x1, x1, x10
+    ADD         x3, x3, x11
+    ADD         x5, x5, x11
+    SUBS        x4, x4, #1
+    BGT         uv_row_loop
+
+exit:
+    // LDMFD sp!,{x4-x12, pc}
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
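A C sketch of the conversion the av8 routine above performs, under the stated assumptions: the luma plane is copied unless disable_luma_copy is set, and the interleaved chroma plane is split into separate U and V planes, swapping the destinations when is_u_first is zero. Parameter names follow the comment block in the assembly; the function name is illustrative:

    #include <stdint.h>

    void fmt_conv_420sp_to_420p_ref(uint8_t *pu1_src_y, uint8_t *pu1_src_uv,
                                    uint8_t *pu1_dest_y, uint8_t *pu1_dest_u,
                                    uint8_t *pu1_dest_v,
                                    int32_t wd, int32_t ht,
                                    int32_t src_strd_y, int32_t src_strd_uv,
                                    int32_t dst_strd_y, int32_t dst_strd_uv,
                                    int32_t is_u_first, int32_t disable_luma_copy)
    {
        if(!disable_luma_copy)
            for(int32_t row = 0; row < ht; row++)
                for(int32_t col = 0; col < wd; col++)
                    pu1_dest_y[row * dst_strd_y + col] =
                        pu1_src_y[row * src_strd_y + col];

        /* Swap destination planes when V comes first in the interleaved source. */
        if(!is_u_first)
        {
            uint8_t *tmp = pu1_dest_u;
            pu1_dest_u = pu1_dest_v;
            pu1_dest_v = tmp;
        }

        for(int32_t row = 0; row < ht / 2; row++)
            for(int32_t col = 0; col < wd / 2; col++)
            {
                pu1_dest_u[row * dst_strd_uv + col] =
                    pu1_src_uv[row * src_strd_uv + 2 * col];
                pu1_dest_v[row * dst_strd_uv + col] =
                    pu1_src_uv[row * src_strd_uv + 2 * col + 1];
            }
    }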
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_420sp.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_420sp.s
new file mode 100644
index 0000000..ccf47a5
--- /dev/null
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_420sp.s
@@ -0,0 +1,207 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//*  ihevcd_fmt_conv_420sp_to_420sp.s
+//*
+//* //brief
+//*  contains function definitions for format conversions
+//*
+//* //author
+//*  ittiam
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************/
+    .equ DO1STROUNDING, 0
+
+    // ARM
+    //
+    // PRESERVE8
+
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+
+
+
+///*****************************************************************************
+//*                                                                            *
+//*  Function Name    : ihevcd_fmt_conv_420sp_to_420sp()                       *
+//*                                                                            *
+//*  Description      : This function converts the image from YUV420SP color   *
+//*                     space to YUV420SP color space (UV interleaved).        *
+//*                                                                            *
+//*  Arguments        : x0           pu1_y                                     *
+//*                     x1           pu1_uv                                    *
+//*                     x2           pu1_dest_y                                *
+//*                     x3           pu1_dest_uv                               *
+//*                     x4           u2_width                                  *
+//*                     x5           u2_height                                 *
+//*                     x6           u2_stridey                                *
+//*                     x7           u2_stridechroma                           *
+//*                     [sp, #80]    u2_dest_stridey                           *
+//*                     [sp, #88]    u2_dest_stridechroma                      *
+//*                                                                            *
+//*  Values Returned  : None                                                   *
+//*                                                                            *
+//*  Register Usage   : x0 - x14                                               *
+//*                                                                            *
+//*  Stack Usage      : 40 Bytes                                               *
+//*                                                                            *
+//*  Interruptibility : Interruptible                                          *
+//*                                                                            *
+//*  Known Limitations                                                         *
+//*       Assumptions: Image Width:     Assumed to be multiple of 2 and       *
+//*                     Image Height:    Assumed to be even.                   *
+//*                                                                            *
+//*  Revision History :                                                        *
+//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
+//*         16 05 2012   Naveen SR     draft                                     *
+//*                                                                            *
+//*****************************************************************************/
+
+    .global ihevcd_fmt_conv_420sp_to_420sp_av8
+.type ihevcd_fmt_conv_420sp_to_420sp_av8, %function
+ihevcd_fmt_conv_420sp_to_420sp_av8:
+
+    // STMFD sp!,{x4-x12, x14}
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    mov         x8, x4                      ////Load u2_width
+    mov         x9, x5                      ////Load u2_height
+
+    LDR         w5, [sp,#80]                ////Load u2_dest_stridey
+    sxtw        x5,w5
+
+    mov         x7, x6                      ////Load u2_stridey
+
+    SUB         x10,x7,x8                   //// Src Y increment
+    SUB         x11,x5,x8                   //// Dst Y increment
+
+    ///* Copy Y */
+
+    MOV         x4,x9                       //// Copying height
+y_row_loop:
+    MOV         x6,x8                       //// Copying width
+
+y_col_loop:
+    prfm        PLDL1KEEP,[x0, #128]
+    SUB         x6,x6,#32
+    LD1         {v0.8b},[x0],#8
+    LD1         {v1.8b},[x0],#8
+    LD1         {v2.8b},[x0],#8
+    LD1         {v3.8b},[x0],#8
+    ST1         {v0.8b},[x2],#8
+    ST1         {v1.8b},[x2],#8
+    ST1         {v2.8b},[x2],#8
+    ST1         {v3.8b},[x2],#8
+    CMP         x6,#32
+    BGE         y_col_loop
+    CMP         x6,#0
+    BEQ         y_col_loop_end
+    ////If width is not a multiple of 32, go back by a few bytes so that 32 bytes can be read.
+    ////Ex: if width is 162, the loop above processes 160 pixels, and
+    ////both source and destination are stepped back to the 130th pixel so that the
+    ////last 32 bytes can be read and written using LD1 and ST1
+    sub         x20,x6,#32
+    neg         x6, x20
+    SUB         x0,x0,x6
+    SUB         x2,x2,x6
+    LD1         {v0.8b},[x0],#8
+    LD1         {v1.8b},[x0],#8
+    LD1         {v2.8b},[x0],#8
+    LD1         {v3.8b},[x0],#8
+    ST1         {v0.8b},[x2],#8
+    ST1         {v1.8b},[x2],#8
+    ST1         {v2.8b},[x2],#8
+    ST1         {v3.8b},[x2],#8
+
+y_col_loop_end:
+    ADD         x0, x0, x10
+    ADD         x2, x2, x11
+    SUBS        x4, x4, #1
+    BGT         y_row_loop
+
+
+
+    ///* Copy UV */
+
+    LDR         w5, [sp,#88]                ////Load u2_dest_stridechroma
+    sxtw        x5,w5
+
+    LSR         x9, x9, #1                  //// height/2
+//    MOV     x8,x8,LSR #1            @// Width/2
+
+    MOV         x2,x3                       //pu1_dest_uv
+
+    SUB         x10,x7,x8                   //// Src UV increment
+    SUB         x11,x5,x8                   //// Dst UV increment
+
+    MOV         x4,x9                       //// Copying height
+uv_row_loop:
+    MOV         x6,x8                       //// Copying width
+
+uv_col_loop:
+
+    prfm        PLDL1KEEP,[x1, #128]
+    SUB         x6,x6,#16
+    LD1         {v0.8b},[x1],#8
+    LD1         {v1.8b},[x1],#8
+    ST1         {v0.8b},[x2],#8
+    ST1         {v1.8b},[x2],#8
+    CMP         x6,#16
+    BGE         uv_col_loop
+    CMP         x6,#0
+    BEQ         u_col_loop_end
+    ////If width is not a multiple of 16, go back by a few bytes so that 16 bytes can be read.
+    ////Ex: if width is 162, the loop above processes 160 pixels, and
+    ////both source and destination are stepped back to the 146th pixel so that the
+    ////last 16 bytes can be read and written using LD1 and ST1
+    sub         x20,x6,#16
+    neg         x6, x20
+    SUB         x1,x1,x6
+    SUB         x2,x2,x6
+    LD1         {v0.8b},[x1],#8
+    LD1         {v1.8b},[x1],#8
+    ST1         {v0.8b},[x2],#8
+    ST1         {v1.8b},[x2],#8
+
+u_col_loop_end:
+    ADD         x1, x1, x10
+    ADD         x2, x2, x11
+    SUBS        x4, x4, #1
+    BGT         uv_row_loop
+
+exit:
+    // LDMFD sp!,{x4-x12, pc}
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+    .section .note.GNU-stack,"",%progbits
+
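Both copy loops above use the same tail-handling trick: rather than falling back to a scalar remainder loop, when the width is not a multiple of the vector block size the pointers are stepped back so the final, overlapping block is re-read and re-written with full-width loads and stores. This is safe because the overlapped destination bytes are rewritten with identical values. A hedged C illustration of the idea, assuming wd >= blk as the assembly does; BLK stands in for the 32-byte (luma) or 16-byte (chroma) NEON block:

    #include <stdint.h>
    #include <string.h>

    static void copy_row_overlap_tail(uint8_t *dst, const uint8_t *src,
                                      int32_t wd, int32_t blk)
    {
        int32_t col = 0;
        for(; col + blk <= wd; col += blk)             /* full blocks */
            memcpy(dst + col, src + col, (size_t)blk);
        if(col < wd)                                   /* overlapping tail */
            memcpy(dst + wd - blk, src + wd - blk, (size_t)blk);
    }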
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
new file mode 100644
index 0000000..485ee66
--- /dev/null
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -0,0 +1,523 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//*  ihevcd_fmt_conv_420sp_to_rgba8888.s
+//*
+//* //brief
+//*  contains function definitions for format conversions
+//*
+//* //author
+//*  ittiam
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************/
+
+    .equ DO1STROUNDING, 0
+
+    // ARM
+    //
+    // PRESERVE8
+
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+
+
+///*****************************************************************************
+//*                                                                            *
+//*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888()                    *
+//*                                                                            *
+//*  Description      : This function converts the image from YUV420SP color   *
+//*                     space to RGBA8888 color space. The function can be     *
+//*                     invoked at the MB level.                               *
+//*                                                                            *
+//*  Arguments        : x0           pubY                                      *
+//*                     x1           pubUV                                     *
+//*                     x2           pusRGB                                    *
+//*                     x3           usWidth                                   *
+//*                     x4           usHeight                                  *
+//*                     x5           usStrideY                                 *
+//*                     x6           usStrideUV                                *
+//*                     x7           usStrideRGB                               *
+//*                                                                            *
+//*  Values Returned  : None                                                   *
+//*                                                                            *
+//*  Register Usage   : x0 - x14                                               *
+//*                                                                            *
+//*  Stack Usage      : 40 Bytes                                               *
+//*                                                                            *
+//*  Interruptibility : Interruptible                                          *
+//*                                                                            *
+//*  Known Limitations                                                         *
+//*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
+//*                     greater than or equal to 16                  *
+//*                     Image Height:    Assumed to be even.                   *
+//*                                                                            *
+//*  Revision History :                                                        *
+//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
+//*         07 06 2010   Varshita        Draft                                 *
+//*         07 06 2010   Naveen Kr T     Completed                             *
+//*         05 08 2013   Naveen K P      Modified for HEVC                     *
+//*****************************************************************************/
+    .global ihevcd_fmt_conv_420sp_to_rgba8888_av8
+.type ihevcd_fmt_conv_420sp_to_rgba8888_av8, %function
+ihevcd_fmt_conv_420sp_to_rgba8888_av8:
+
+    //// push the registers on the stack
+    // STMFD sp!,{x4-x12,x14}
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+
+    ////x0 - Y PTR
+    ////x1 - UV PTR
+    ////x2 - RGB PTR
+    ////x3 - PIC WIDTH
+    ////x4 - PIC HT
+    ////x5 - STRIDE Y
+    ////x6 - STRIDE UV
+    ////x7 - STRIDE RGB
+
+    ////ONE ROW PROCESSING AT A TIME
+
+    ////THE FOUR CONSTANTS ARE:
+    ////C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
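+    ////(Assumed interpretation: these look like Q13 fixed-point BT.601
+    //// coefficients, i.e. value/8192: C1 = 0x3311 = 13073 ~ 1.596 (V->R),
+    //// C2 = 0xF379 = -3207 ~ -0.391 (U->G), C3 = 0xE5F8 = -6664 ~ -0.813
+    //// (V->G), C4 = 0x4092 = 16530 ~ 2.018 (U->B); the sqshrn #13
+    //// narrowing below matches the Q13 scaling.)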
+
+    //PLD        [x0]
+    //PLD        [x1]
+    //PLD        [x2]
+
+
+    ///* can be loaded from a defined const type */
+    mov         x10,#0x3311
+    mov         v0.4h[0], w10               ////C1
+
+    mov         x10,#0xF379
+    mov         v0.4h[1], w10               ////C2
+
+    mov         x10,#0xE5F8
+    mov         v0.4h[2], w10               ////C3
+
+    mov         x10,#0x4092
+    mov         v0.4h[3], w10               ////C4
+
+    ////LOAD CONSTANT 128 INTO A CORTEX REGISTER
+    MOV         x10,#128
+    dup         v1.8b,w10
+
+    ////D0 HAS C1-C2-C3-C4
+    //// load other parameters from stack
+    mov         x9, x7
+    mov         x7, x6
+    mov         x6, x5
+    mov         x5, x4
+    //LDR  x4,[sp,#44]
+    //LDR  x8,[sp,#52]
+
+    //// calculate offsets, offset = stride - width
+    SUB         x10,x6,x3                   //// luma offset
+    SUB         x11,x7,x3                   //// chroma (uv) offset
+    //SUB     x12,x8,x3, LSR #1    @// v offset
+    SUB         x14,x9,x3                   //// rgb offset in pixels
+
+    //// calculate height loop count
+    LSR         x5, x5, #1                  //// height_cnt = height / 2 (two rows per pass)
+
+    //// create next row pointers for rgb and luma data
+    ADD         x7,x0,x6                    //// luma_next_row = luma + luma_stride
+    ADD         x8,x2,x9,LSL #2             //// rgb_next_row = rgb + rgb_stride
+
+LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:
+
+    ////LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
+    LD1         {v2.8b, v3.8b},[x1],#16     ////LOAD 8 VALUES OF UV
+    ////VLD1.8 {D3},[x2]!             @//LOAD 8 VALUES OF V
+
+    //// calculate width loop count
+    LSR         x6, x3, #4                  //// width_cnt = width / 16
+
+    ////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
+    ////LOAD VALUES OF Y 8-BIT VALUES
+    LD2         {v30.8b, v31.8b},[x0],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
+                                            ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+    LD2         {v28.8b, v29.8b},[x7],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
+                                            ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+
+    SUBS        x6,x6,#1
+    BEQ         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP
+
+LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
+    //VMOV.I8 Q1,#128
+    UZP1        v27.8b, v2.8b, v3.8b
+    UZP2        v3.8b, v2.8b, v3.8b
+    mov         v2.d[0], v27.d[0]
+
+    ////NEED TO SUBTRACT (U-128) AND (V-128)
+    ////(D2-D1),(D3-D1)
+    uSUBL       v4.8h, v2.8b, v1.8b         ////(U-128)
+    uSUBL       v6.8h, v3.8b, v1.8b         ////(V-128)
+
+    ////LOAD VALUES OF U&V for next row
+    LD1         {v2.8b, v3.8b},[x1],#16     ////LOAD NEXT 8 VALUES OF UV
+    ////VLD1.8 {D3},[x2]!             @//LOAD 8 VALUES OF V
+
+    //PLD        [x0]
+    prfm        PLDL1KEEP,[x1]
+
+    ////NEED TO MULTIPLY Q2,Q3 WITH THE COEFFICIENTS
+    sMULL       v8.4s, v4.4h, v0.4h[3]      ////(U-128)*C4 FOR B
+    sMULL2      v10.4s, v4.8h, v0.4h[3]     ////(U-128)*C4 FOR B
+
+    sMULL       v20.4s, v6.4h, v0.4h[0]     ////(V-128)*C1 FOR R
+    sMULL2      v22.4s, v6.8h, v0.4h[0]     ////(V-128)*C1 FOR R
+
+    sMULL       v12.4s, v4.4h, v0.4h[1]     ////(U-128)*C2 FOR G
+    sMLAL       v12.4s, v6.4h, v0.4h[2]     ////Q6 = (U-128)*C2 + (V-128)*C3
+    sMULL2      v14.4s, v4.8h, v0.4h[1]     ////(U-128)*C2 FOR G
+    sMLAL2      v14.4s, v6.8h, v0.4h[2]     ////Q7 = (U-128)*C2 + (V-128)*C3
+
+    ////NARROW RIGHT SHIFT BY 13 FOR R&B
+    sqshrn      v8.4h, v8.4s,#13            ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+    sqshrn2     v8.8h, v10.4s,#13           ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+    ////Q4 - WEIGHT FOR B
+
+    ////NARROW RIGHT SHIFT BY 13 FOR R&B
+    sqshrn      v10.4h, v20.4s,#13          ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+    sqshrn2     v10.8h, v22.4s,#13          ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+    ////Q5 - WEIGHT FOR R
+
+    ////NARROW RIGHT SHIFT BY 13 FOR G
+    sqshrn      v12.4h, v12.4s,#13          ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+    sqshrn2     v12.8h, v14.4s,#13          ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+    ////Q6 - WEIGHT FOR G
+
+    UADDW       v14.8h,  v8.8h ,  v30.8b    ////Q7 - HAS Y + B
+    UADDW       v16.8h,  v10.8h ,  v30.8b   ////Q8 - HAS Y + R
+    UADDW       v18.8h,  v12.8h ,  v30.8b   ////Q9 - HAS Y + G
+
+    UADDW       v20.8h,  v8.8h ,  v31.8b    ////Q10 - HAS Y + B
+    UADDW       v22.8h,  v10.8h ,  v31.8b   ////Q11 - HAS Y + R
+    UADDW       v24.8h,  v12.8h ,  v31.8b   ////Q12 - HAS Y + G
+
+    sqxtun      v14.8b, v14.8h
+    sqxtun      v15.8b, v18.8h
+    sqxtun      v16.8b, v16.8h
+    movi        v17.8b, #0
+
+    sqxtun      v20.8b, v20.8h
+    sqxtun      v21.8b, v24.8h
+    sqxtun      v22.8b, v22.8h
+    movi        v23.8b, #0
+
+    ZIP1        v27.8b, v14.8b, v15.8b
+    ZIP2        v15.8b, v14.8b, v15.8b
+    mov         v14.d[0], v27.d[0]
+    ZIP1        v27.8b, v16.8b, v17.8b
+    ZIP2        v17.8b, v16.8b, v17.8b
+    mov         v16.d[0], v27.d[0]
+
+    ZIP1        v27.8b, v20.8b, v21.8b
+    ZIP2        v21.8b, v20.8b, v21.8b
+    mov         v20.d[0], v27.d[0]
+    ZIP1        v27.8b, v22.8b, v23.8b
+    ZIP2        v23.8b, v22.8b, v23.8b
+    mov         v22.d[0], v27.d[0]
+
+    mov         v14.d[1], v15.d[0]
+    mov         v20.d[1], v21.d[0]
+    mov         v16.d[1], v17.d[0]
+    mov         v22.d[1], v23.d[0]
+
+    ZIP1        v27.8h, v14.8h, v16.8h
+    ZIP2        v26.8h, v14.8h, v16.8h
+
+    ZIP1        v25.8h, v20.8h, v22.8h
+    ZIP2        v19.8h, v20.8h, v22.8h
+
+    ZIP1        v14.4s, v27.4s, v25.4s
+    ZIP2        v20.4s, v27.4s, v25.4s
+
+    ZIP1        v16.4s, v26.4s, v19.4s
+    ZIP2        v22.4s, v26.4s, v19.4s
+
+    ST1         {v14.4s},[x2],#16
+    ST1         {v20.4s},[x2],#16
+    ST1         {v16.4s},[x2],#16
+    ST1         {v22.4s},[x2],#16
+
+    ////D14-D20 - TOTALLY HAVE 16 VALUES
+    ////NOW ADD THE SAME B,R,G WEIGHTS TO THE SECOND ROW OF LUMA SAMPLES
+    UADDW       v14.8h,  v8.8h ,  v28.8b    ////Q7 - HAS Y + B
+    UADDW       v16.8h,  v10.8h ,  v28.8b   ////Q2 - HAS Y + R
+    UADDW       v18.8h,  v12.8h ,  v28.8b   ////Q3 - HAS Y + G
+
+    UADDW       v20.8h,  v8.8h ,  v29.8b    ////Q10 - HAS Y + B
+    UADDW       v22.8h,  v10.8h ,  v29.8b   ////Q11 - HAS Y + R
+    UADDW       v24.8h,  v12.8h ,  v29.8b   ////Q12 - HAS Y + G
+
+    ////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
+    ////LOAD VALUES OF Y 8-BIT VALUES
+    LD2         {v30.8b, v31.8b},[x0],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
+                                            ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+    LD2         {v28.8b, v29.8b},[x7],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
+                                            ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+
+    prfm        PLDL1KEEP,[x0]
+    prfm        PLDL1KEEP,[x7]
+
+    sqxtun      v14.8b, v14.8h
+    sqxtun      v15.8b, v18.8h
+    sqxtun      v16.8b, v16.8h
+    movi        v17.8b, #0
+
+    sqxtun      v20.8b, v20.8h
+    sqxtun      v21.8b, v24.8h
+    sqxtun      v22.8b, v22.8h
+    movi        v23.8b, #0
+
+    ZIP1        v27.8b, v14.8b, v15.8b
+    ZIP2        v15.8b, v14.8b, v15.8b
+    mov         v14.d[0], v27.d[0]
+    ZIP1        v27.8b, v16.8b, v17.8b
+    ZIP2        v17.8b, v16.8b, v17.8b
+    mov         v16.d[0], v27.d[0]
+
+    ZIP1        v27.8b, v20.8b, v21.8b
+    ZIP2        v21.8b, v20.8b, v21.8b
+    mov         v20.d[0], v27.d[0]
+    ZIP1        v27.8b, v22.8b, v23.8b
+    ZIP2        v23.8b, v22.8b, v23.8b
+    mov         v22.d[0], v27.d[0]
+
+    mov         v14.d[1], v15.d[0]
+    mov         v20.d[1], v21.d[0]
+    mov         v16.d[1], v17.d[0]
+    mov         v22.d[1], v23.d[0]
+
+    ZIP1        v27.8h, v14.8h, v16.8h
+    ZIP2        v26.8h, v14.8h, v16.8h
+
+    ZIP1        v25.8h, v20.8h, v22.8h
+    ZIP2        v19.8h, v20.8h, v22.8h
+
+    ZIP1        v14.4s, v27.4s, v25.4s
+    ZIP2        v20.4s, v27.4s, v25.4s
+
+    ZIP1        v16.4s, v26.4s, v19.4s
+    ZIP2        v22.4s, v26.4s, v19.4s
+
+    ST1         {v14.4s},[x8],#16
+    ST1         {v20.4s},[x8],#16
+    ST1         {v16.4s},[x8],#16
+    ST1         {v22.4s},[x8],#16
+
+    SUBS        x6,x6,#1                    //// width_cnt -= 1
+    BNE         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP
+
+LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
+    //VMOV.I8 Q1,#128
+    UZP1        v27.8b, v2.8b, v3.8b
+    UZP2        v3.8b, v2.8b, v3.8b
+    mov         v2.d[0], v27.d[0]
+
+
+    ////NEED TO SUBTRACT (U-128) AND (V-128)
+    ////(D2-D1),(D3-D1)
+    uSUBL       v4.8h, v2.8b, v1.8b         ////(U-128)
+    uSUBL       v6.8h, v3.8b, v1.8b         ////(V-128)
+
+
+    ////NEED TO MULTIPLY Q2,Q3 WITH THE COEFFICIENTS
+    sMULL       v8.4s, v4.4h, v0.4h[3]      ////(U-128)*C4 FOR B
+    sMULL2      v10.4s, v4.8h, v0.4h[3]     ////(U-128)*C4 FOR B
+
+    sMULL       v20.4s, v6.4h, v0.4h[0]     ////(V-128)*C1 FOR R
+    sMULL2      v22.4s, v6.8h, v0.4h[0]     ////(V-128)*C1 FOR R
+
+    sMULL       v12.4s, v4.4h, v0.4h[1]     ////(U-128)*C2 FOR G
+    sMLAL       v12.4s, v6.4h, v0.4h[2]     ////Q6 = (U-128)*C2 + (V-128)*C3
+    sMULL2      v14.4s, v4.8h, v0.4h[1]     ////(U-128)*C2 FOR G
+    sMLAL2      v14.4s, v6.8h, v0.4h[2]     ////Q7 = (U-128)*C2 + (V-128)*C3
+
+    ////NARROW RIGHT SHIFT BY 13 FOR R&B
+    sqshrn      v8.4h, v8.4s,#13            ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+    sqshrn2     v8.8h, v10.4s,#13           ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+    ////Q4 - WEIGHT FOR B
+
+    ////NARROW RIGHT SHIFT BY 13 FOR R&B
+    sqshrn      v10.4h, v20.4s,#13          ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+    sqshrn2     v10.8h, v22.4s,#13          ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+    ////Q5 - WEIGHT FOR R
+
+    ////NARROW RIGHT SHIFT BY 13 FOR G
+    sqshrn      v12.4h, v12.4s,#13          ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+    sqshrn2     v12.8h, v14.4s,#13          ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+    ////Q6 - WEIGHT FOR G
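+
+    ////In scalar terms the weights computed above are (a sketch; C1..C4
+    ////are the Q13 coefficients loaded into v0 in the setup code):
+    ////    B_wt = ((U - 128) * C4) >> 13
+    ////    R_wt = ((V - 128) * C1) >> 13
+    ////    G_wt = ((U - 128) * C2 + (V - 128) * C3) >> 13
+    ////each saturated to 16 bits, then added to Y and clipped to 8 bits below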
+
+    UADDW       v14.8h,  v8.8h ,  v30.8b    ////Q7 - HAS Y + B
+    UADDW       v16.8h,  v10.8h ,  v30.8b   ////Q8 - HAS Y + R
+    UADDW       v18.8h,  v12.8h ,  v30.8b   ////Q9 - HAS Y + G
+
+    UADDW       v20.8h,  v8.8h ,  v31.8b    ////Q10 - HAS Y + B
+    UADDW       v22.8h,  v10.8h ,  v31.8b   ////Q11 - HAS Y + R
+    UADDW       v24.8h,  v12.8h ,  v31.8b   ////Q12 - HAS Y + G
+
+    sqxtun      v14.8b, v14.8h
+    sqxtun      v15.8b, v18.8h
+    sqxtun      v16.8b, v16.8h
+    movi        v17.8b, #0
+
+    sqxtun      v20.8b, v20.8h
+    sqxtun      v21.8b, v24.8h
+    sqxtun      v22.8b, v22.8h
+    movi        v23.8b, #0
+
+    ZIP1        v27.8b, v14.8b, v15.8b
+    ZIP2        v15.8b, v14.8b, v15.8b
+    mov         v14.d[0], v27.d[0]
+    ZIP1        v27.8b, v16.8b, v17.8b
+    ZIP2        v17.8b, v16.8b, v17.8b
+    mov         v16.d[0], v27.d[0]
+
+    ZIP1        v27.8b, v20.8b, v21.8b
+    ZIP2        v21.8b, v20.8b, v21.8b
+    mov         v20.d[0], v27.d[0]
+    ZIP1        v27.8b, v22.8b, v23.8b
+    ZIP2        v23.8b, v22.8b, v23.8b
+    mov         v22.d[0], v27.d[0]
+
+    mov         v14.d[1], v15.d[0]
+    mov         v20.d[1], v21.d[0]
+    mov         v16.d[1], v17.d[0]
+    mov         v22.d[1], v23.d[0]
+
+    ZIP1        v27.8h, v14.8h, v16.8h
+    ZIP2        v26.8h, v14.8h, v16.8h
+
+    ZIP1        v25.8h, v20.8h, v22.8h
+    ZIP2        v19.8h, v20.8h, v22.8h
+
+    ZIP1        v14.4s, v27.4s, v25.4s
+    ZIP2        v20.4s, v27.4s, v25.4s
+
+    ZIP1        v16.4s, v26.4s, v19.4s
+    ZIP2        v22.4s, v26.4s, v19.4s
+
+    ST1         {v14.4s},[x2],#16
+    ST1         {v20.4s},[x2],#16
+    ST1         {v16.4s},[x2],#16
+    ST1         {v22.4s},[x2],#16
+
+    ////D14-D20 - TOTALLY HAVE 16 VALUES
+    ////INTERLEAVE THE B,G,R AND ZERO BYTES INTO 32-BIT PIXELS FOR THE 8888 OUTPUT
+    UADDW       v14.8h,  v8.8h ,  v28.8b    ////Q7 - HAS Y + B
+    UADDW       v16.8h,  v10.8h ,  v28.8b   ////Q2 - HAS Y + R
+    UADDW       v18.8h,  v12.8h ,  v28.8b   ////Q3 - HAS Y + G
+
+    UADDW       v20.8h,  v8.8h ,  v29.8b    ////Q10 - HAS Y + B
+    UADDW       v22.8h,  v10.8h ,  v29.8b   ////Q11 - HAS Y + R
+    UADDW       v24.8h,  v12.8h ,  v29.8b   ////Q12 - HAS Y + G
+
+    sqxtun      v14.8b, v14.8h
+    sqxtun      v15.8b, v18.8h
+    sqxtun      v16.8b, v16.8h
+    movi        v17.8b, #0
+
+    sqxtun      v20.8b, v20.8h
+    sqxtun      v21.8b, v24.8h
+    sqxtun      v22.8b, v22.8h
+    movi        v23.8b, #0
+
+    ZIP1        v27.8b, v14.8b, v15.8b
+    ZIP2        v15.8b, v14.8b, v15.8b
+    mov         v14.d[0], v27.d[0]
+    ZIP1        v27.8b, v16.8b, v17.8b
+    ZIP2        v17.8b, v16.8b, v17.8b
+    mov         v16.d[0], v27.d[0]
+
+    ZIP1        v27.8b, v20.8b, v21.8b
+    ZIP2        v21.8b, v20.8b, v21.8b
+    mov         v20.d[0], v27.d[0]
+    ZIP1        v27.8b, v22.8b, v23.8b
+    ZIP2        v23.8b, v22.8b, v23.8b
+    mov         v22.d[0], v27.d[0]
+
+    mov         v14.d[1], v15.d[0]
+    mov         v20.d[1], v21.d[0]
+    mov         v16.d[1], v17.d[0]
+    mov         v22.d[1], v23.d[0]
+
+    ZIP1        v27.8h, v14.8h, v16.8h
+    ZIP2        v26.8h, v14.8h, v16.8h
+
+    ZIP1        v25.8h, v20.8h, v22.8h
+    ZIP2        v19.8h, v20.8h, v22.8h
+
+    ZIP1        v14.4s, v27.4s, v25.4s
+    ZIP2        v20.4s, v27.4s, v25.4s
+
+    ZIP1        v16.4s, v26.4s, v19.4s
+    ZIP2        v22.4s, v26.4s, v19.4s
+
+    ST1         {v14.4s},[x8],#16
+    ST1         {v20.4s},[x8],#16
+    ST1         {v16.4s},[x8],#16
+    ST1         {v22.4s},[x8],#16
+
+    //// Adjust the address pointers
+    ADD         x0,x7,x10                   //// luma = luma_next + offset
+    ADD         x2,x8,x14,LSL #2            //// rgb = rgb_next + offset
+
+    ADD         x7,x0,x3                    //// luma_next = luma + width
+    ADD         x8,x2,x3,LSL #2             //// rgb_next_row = rgb + width
+
+    ADD         x1,x1,x11                   //// adjust u pointer
+    //ADD        x2,x2,x12            @// adjust v pointer
+
+    ADD         x7,x7,x10                   //// luma_next = luma + width + offset (because of register crunch)
+    ADD         x8,x8,x14,LSL #2            //// rgb_next_row = rgb + width + offset
+
+    SUBS        x5,x5,#1                    //// height_cnt -= 1
+
+    BNE         LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP
+
+    ////POP THE REGISTERS
+    // LDMFD sp!,{x4-x12,PC}
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+    .section .note.GNU-stack,"",%progbits
+
diff --git a/decoder/arm64/ihevcd_function_selector_av8.c b/decoder/arm64/ihevcd_function_selector_av8.c
new file mode 100644
index 0000000..210c730
--- /dev/null
+++ b/decoder/arm64/ihevcd_function_selector_av8.c
@@ -0,0 +1,160 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_function_selector_av8.c
+*
+* @brief
+*  Contains functions to initialize av8 function pointers used in hevc
+*
+* @author
+*  Naveen
+*
+* @par List of Functions:
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
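+/* Usage sketch (illustrative only): the decoder is expected to invoke this
+ * initializer once at startup when an ARMv8 target is detected, e.g.
+ *
+ *     if(e_arch == ARCH_ARMV8_GENERIC)
+ *         ihevcd_init_function_ptr_av8(ps_codec);
+ *
+ * where e_arch and ARCH_ARMV8_GENERIC are placeholders for whatever
+ * architecture detection the surrounding build actually uses.
+ */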
+void ihevcd_init_function_ptr_av8(codec_t *ps_codec)
+{
+    ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr                      =  &ihevc_deblk_chroma_horz_av8;
+    ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr                      =  &ihevc_deblk_chroma_vert_av8;
+    ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr                        =  &ihevc_deblk_luma_vert_av8;
+    ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr                        =  &ihevc_deblk_luma_horz_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr                 =  &ihevc_inter_pred_chroma_copy_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr          =  &ihevc_inter_pred_chroma_copy_w16out_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr                 =  &ihevc_inter_pred_chroma_horz_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr          =  &ihevc_inter_pred_chroma_horz_w16out_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr                 =  &ihevc_inter_pred_chroma_vert_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr          =  &ihevc_inter_pred_chroma_vert_w16inp_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr   =  &ihevc_inter_pred_chroma_vert_w16inp_w16out_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr          =  &ihevc_inter_pred_chroma_vert_w16out_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr                   =  &ihevc_inter_pred_luma_horz_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr                   =  &ihevc_inter_pred_luma_vert_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr            =  &ihevc_inter_pred_luma_vert_w16out_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr            =  &ihevc_inter_pred_luma_vert_w16inp_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr                   =  &ihevc_inter_pred_luma_copy_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr            =  &ihevc_inter_pred_luma_copy_w16out_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr            =  &ihevc_inter_pred_luma_horz_w16out_av8;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr     =  &ihevc_inter_pred_luma_vert_w16inp_w16out_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr     =  &ihevc_intra_pred_chroma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr       =  &ihevc_intra_pred_luma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr   =  &ihevc_intra_pred_luma_ref_subst_all_avlble;
+    ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr               =  &ihevc_intra_pred_ref_filtering_neonintr;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr                   =  &ihevc_intra_pred_chroma_dc_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr                 =  &ihevc_intra_pred_chroma_horz_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr                =  &ihevc_intra_pred_chroma_mode2_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr           =  &ihevc_intra_pred_chroma_mode_18_34_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr        =  &ihevc_intra_pred_chroma_mode_27_to_33_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr          =  &ihevc_intra_pred_chroma_mode_3_to_9_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr               =  &ihevc_intra_pred_chroma_planar_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr                  =  &ihevc_intra_pred_chroma_ver_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr        =  &ihevc_intra_pred_chroma_mode_11_to_17_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr        =  &ihevc_intra_pred_chroma_mode_19_to_25_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr          =  &ihevc_intra_pred_luma_mode_11_to_17_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr          =  &ihevc_intra_pred_luma_mode_19_to_25_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr                     =  &ihevc_intra_pred_luma_dc_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr                   =  &ihevc_intra_pred_luma_horz_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr                  =  &ihevc_intra_pred_luma_mode2_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr             =  &ihevc_intra_pred_luma_mode_18_34_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr          =  &ihevc_intra_pred_luma_mode_27_to_33_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr            =  &ihevc_intra_pred_luma_mode_3_to_9_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr                 =  &ihevc_intra_pred_luma_planar_av8;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr                    =  &ihevc_intra_pred_luma_ver_av8;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr                      =  &ihevc_itrans_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_fptr                             =  &ihevc_itrans_4x4;
+    ps_codec->s_func_selector.ihevc_itrans_8x8_fptr                             =  &ihevc_itrans_8x8;
+    ps_codec->s_func_selector.ihevc_itrans_16x16_fptr                           =  &ihevc_itrans_16x16;
+    ps_codec->s_func_selector.ihevc_itrans_32x32_fptr                           =  &ihevc_itrans_32x32;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr                =  &ihevc_itrans_recon_4x4_ttype1_av8;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr                       =  &ihevc_itrans_recon_4x4_av8;
+    ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr                       =  &ihevc_itrans_recon_8x8_av8;
+    ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr                     =  &ihevc_itrans_recon_16x16_av8;
+    ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr                     =  &ihevc_itrans_recon_32x32_av8;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr                =  &ihevc_chroma_itrans_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr                =  &ihevc_chroma_itrans_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr              =  &ihevc_chroma_itrans_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr                       =  &ihevc_recon_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_recon_4x4_fptr                              =  &ihevc_recon_4x4;
+    ps_codec->s_func_selector.ihevc_recon_8x8_fptr                              =  &ihevc_recon_8x8;
+    ps_codec->s_func_selector.ihevc_recon_16x16_fptr                            =  &ihevc_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_32x32_fptr                            =  &ihevc_recon_32x32;
+    ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr                       =  &ihevc_chroma_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr                       =  &ihevc_chroma_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr                     =  &ihevc_chroma_recon_16x16;
+    ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr                           =  &ihevc_memcpy_mul_8_av8;
+    ps_codec->s_func_selector.ihevc_memcpy_fptr                                 =  &ihevc_memcpy_av8;
+    ps_codec->s_func_selector.ihevc_memset_mul_8_fptr                           =  &ihevc_memset_mul_8_av8;
+    ps_codec->s_func_selector.ihevc_memset_fptr                                 =  &ihevc_memset_av8;
+    ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr                     =  &ihevc_memset_16bit_mul_8_av8;
+    ps_codec->s_func_selector.ihevc_memset_16bit_fptr                           =  &ihevc_memset_16bit_av8;
+    ps_codec->s_func_selector.ihevc_pad_left_luma_fptr                          =  &ihevc_pad_left_luma_av8;
+    ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr                        =  &ihevc_pad_left_chroma_av8;
+    ps_codec->s_func_selector.ihevc_pad_right_luma_fptr                         =  &ihevc_pad_right_luma_av8;
+    ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr                       =  &ihevc_pad_right_chroma_av8;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr                       =  &ihevc_weighted_pred_bi_av8;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr               =  &ihevc_weighted_pred_bi_default_av8;
+    ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr                      =  &ihevc_weighted_pred_uni_av8;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr                =  &ihevc_weighted_pred_chroma_bi_neonintr;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr        =  &ihevc_weighted_pred_chroma_bi_default_neonintr;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr               =  &ihevc_weighted_pred_chroma_uni_neonintr;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr                   =  &ihevc_sao_band_offset_luma_av8;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr                 =  &ihevc_sao_band_offset_chroma_av8;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr                 =  &ihevc_sao_edge_offset_class0_av8;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr          =  &ihevc_sao_edge_offset_class0_chroma_av8;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr                 =  &ihevc_sao_edge_offset_class1_av8;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr          =  &ihevc_sao_edge_offset_class1_chroma_av8;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr                 =  &ihevc_sao_edge_offset_class2_av8;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr          =  &ihevc_sao_edge_offset_class2_chroma_av8;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr                 =  &ihevc_sao_edge_offset_class3_av8;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr          =  &ihevc_sao_edge_offset_class3_chroma_av8;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr            =  &ihevcd_fmt_conv_420sp_to_rgba8888_av8;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr              =  &ihevcd_fmt_conv_420sp_to_rgb565;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr               =  &ihevcd_fmt_conv_420sp_to_420sp_av8;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr                =  &ihevcd_fmt_conv_420sp_to_420p_av8;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr                  =  &ihevcd_itrans_recon_dc_luma_av8;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr                =  &ihevcd_itrans_recon_dc_chroma_av8;
+}
diff --git a/decoder/arm64/ihevcd_itrans_recon_dc_chroma.s b/decoder/arm64/ihevcd_itrans_recon_dc_chroma.s
new file mode 100644
index 0000000..9d1e8a4
--- /dev/null
+++ b/decoder/arm64/ihevcd_itrans_recon_dc_chroma.s
@@ -0,0 +1,220 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//*  ihevcd_itrans_recon_dc_chroma.s
+//*
+//* //brief
+//*  contains function definitions of itrans and recon for the dc only case
+//*
+//* //author
+//*  ittiam
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************/
+
+
+.text
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevcd_itrans_recon_dc_chroma_av8
+
+.type ihevcd_itrans_recon_dc_chroma_av8, %function
+
+ihevcd_itrans_recon_dc_chroma_av8:
+
+//void ihevcd_itrans_recon_dc_chroma(uword8 *pu1_pred,
+//                            uword8 *pu1_dst,
+//                            word32 pred_strd,
+//                            word32 dst_strd,
+//                            word32 log2_trans_size,
+//                            word16 i2_coeff_value)
+
+//x0:pu1_pred
+//x1:pu1_dest
+//x2:pred_strd
+//x3:dst_strd
+
+
+
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+
+    sxth        x5, w5                      // the argument is a word16, so sign extend it into the x register
+
+    mov         x10,#1
+    lsl         x4,x10,x4                   // trans_size = (1 << log2_trans_size)
+    mov         x6,#64                      // 1 << (shift1 - 1)
+    mov         x7,#2048                    // 1 << (shift2 - 1)
+
+    add         x8,x6,x5,lsl #6
+    asr         x20, x8, #7
+    mov         x19,#32767
+    cmp         x20,x19
+    blt         lbl36
+    mov         x8,#32767
+    b           lbl36_1
+lbl36:
+    mov         x19,#-32768
+    cmp         x20,x19
+    csel        x8, x19, x20, lt
+lbl36_1:
+
+    add         x5,x7,x8,lsl #6
+    asr         x20, x5, #12
+    mov         x19,#32767
+    cmp         x20,x19
+    blt         lbl38
+    mov         x6,#32767
+    b           lbl38_1
+lbl38:
+    mov         x19,#-32768
+    cmp         x20,x19
+    csel        x6, x19, x20, lt
+lbl38_1:
+
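+    // In C terms the two saturating stages above are (a sketch; CLIP_S16
+    // denotes saturation to [-32768, 32767], matching the 64 and 2048
+    // rounding terms for shift1 = 7 and shift2 = 12):
+    //   dc = CLIP_S16(((i2_coeff_value << 6) + 64)   >> 7);
+    //   dc = CLIP_S16(((dc             << 6) + 2048) >> 12);
+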
+    mov         x9,x4
+    mov         x8,x4
+
+    // x6 has the dc_value
+    // x4 has the trans_size value
+    // x8 has the row value
+    // x9 has the col value
+    dup         v0.8h,w6
+    cmp         x4,#4
+    beq         row_loop_4chroma
+
+
+row_loop_chroma:
+    mov         x9,x4
+
+
+col_loop_chroma:
+
+    mov         x7,x0
+    ld2         {v2.8b, v3.8b},[x7],x2
+    ld2         {v4.8b, v5.8b},[x7],x2
+    ld2         {v6.8b, v7.8b},[x7],x2
+    ld2         {v8.8b, v9.8b},[x7],x2
+
+    ld2         {v10.8b, v11.8b},[x7],x2
+    ld2         {v12.8b, v13.8b},[x7],x2
+    ld2         {v14.8b, v15.8b},[x7],x2
+    ld2         {v16.8b, v17.8b},[x7]
+
+    add         x0,x0,#16
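+
+    // ld2 de-interleaves the semiplanar chroma: the even bytes (the plane
+    // addressed by pu1_pred) land in v2,v4,... and get the dc added below,
+    // while the companion plane in v3,v5,... is stored back untouched by st2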
+
+
+    uaddw       v30.8h,  v0.8h ,  v2.8b
+    uaddw       v28.8h,  v0.8h ,  v4.8b
+    uaddw       v26.8h,  v0.8h ,  v6.8b
+    uaddw       v24.8h,  v0.8h ,  v8.8b
+    uaddw       v22.8h,  v0.8h ,  v10.8b
+    uaddw       v20.8h,  v0.8h ,  v12.8b
+    uaddw       v18.8h,  v0.8h ,  v14.8b
+
+
+    mov         x11,x1
+    sqxtun      v2.8b, v30.8h
+    sqxtun      v4.8b, v28.8h
+    sqxtun      v6.8b, v26.8h
+    sqxtun      v8.8b, v24.8h
+
+    uaddw       v30.8h,  v0.8h ,  v16.8b
+
+    sqxtun      v10.8b, v22.8h
+    sqxtun      v12.8b, v20.8h
+    sqxtun      v14.8b, v18.8h
+    sqxtun      v16.8b, v30.8h
+
+    st2         {v2.8b, v3.8b},[x11],x3
+    st2         {v4.8b, v5.8b},[x11],x3
+    st2         {v6.8b, v7.8b},[x11],x3
+    st2         {v8.8b, v9.8b},[x11],x3
+
+    st2         {v10.8b, v11.8b},[x11],x3
+    st2         {v12.8b, v13.8b},[x11],x3
+    st2         {v14.8b, v15.8b},[x11],x3
+    st2         {v16.8b, v17.8b},[x11]
+
+    add         x1,x1,#16
+
+    subs        x9,x9,#8
+    bgt         col_loop_chroma
+
+    subs        x8,x8,#8
+
+    add         x0,x0,x2,lsl #3
+    add         x1,x1,x3,lsl #3
+    sub         x0,x0,x4,lsl #1
+    sub         x1,x1,x4,lsl #1
+    bgt         row_loop_chroma
+    b           end_loops_chroma
+
+
+row_loop_4chroma:
+    mov         x9,x10
+
+
+col_loop_4chroma:
+
+
+    ld2         {v2.8b, v3.8b},[x0],x2
+    ld2         {v4.8b, v5.8b},[x0],x2
+    ld2         {v6.8b, v7.8b},[x0],x2
+    ld2         {v8.8b, v9.8b},[x0]
+
+
+
+
+    uaddw       v30.8h,  v0.8h ,  v2.8b
+    uaddw       v28.8h,  v0.8h ,  v4.8b
+    uaddw       v26.8h,  v0.8h ,  v6.8b
+    uaddw       v24.8h,  v0.8h ,  v8.8b
+
+
+
+    sqxtun      v31.8b, v30.8h
+    sqxtun      v29.8b, v28.8h
+    sqxtun      v27.8b, v26.8h
+    sqxtun      v25.8b, v24.8h
+
+
+    zip1        v2.8b, v31.8b, v3.8b
+    zip1        v4.8b, v29.8b, v5.8b
+    zip1        v6.8b, v27.8b, v7.8b
+    zip1        v8.8b, v25.8b, v9.8b
+
+    st1         {v2.2s},[x1],x3
+    st1         {v4.2s},[x1],x3
+    st1         {v6.2s},[x1],x3
+    st1         {v8.2s},[x1]
+
+end_loops_chroma:
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
diff --git a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
new file mode 100644
index 0000000..279888b
--- /dev/null
+++ b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
@@ -0,0 +1,218 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//*  ihevcd_itrans_recon_dc_luma.s
+//*
+//* //brief
+//*  contains function definitions of itrans and recon for the dc only case
+//*
+//* //author
+//*  ittiam
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//*  none
+//*
+//*******************************************************************************/
+
+.text
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevcd_itrans_recon_dc_luma_av8
+
+.type ihevcd_itrans_recon_dc_luma_av8, %function
+
+ihevcd_itrans_recon_dc_luma_av8:
+
+//void ihevcd_itrans_recon_dc_luma(uword8 *pu1_pred,
+//                            uword8 *pu1_dst,
+//                            word32 pred_strd,
+//                            word32 dst_strd,
+//                            word32 log2_trans_size,
+//                            word16 i2_coeff_value)
+
+//x0:pu1_pred
+//x1:pu1_dest
+//x2:pred_strd
+//x3:dst_strd
+
+
+
+    push_v_regs
+    stp         x19, x20,[sp,#-16]!
+    sxth        x5,w5
+
+    mov         x10,#1
+    lsl         x4,x10,x4                   // trans_size = (1 << log2_trans_size)
+    mov         x6,#64                      // 1 << (shift1 - 1)
+    mov         x7,#2048                    // 1 << (shift2 - 1)
+
+    add         x8,x6,x5,lsl #6
+    asr         x20, x8, #7
+    mov         x19, #32767
+    cmp         x20,x19
+    blt         lbl37
+    mov         x8,#32767
+    b           lbl37_1
+lbl37:
+    mov         x19,#-32768
+    cmp         x20,x19
+    csel        x8, x19, x20, lt
+lbl37_1:
+
+    add         x5,x7,x8,lsl #6
+    asr         x20, x5, #12
+    mov         x19,#32767
+    cmp         x20,x19
+    blt         lbl39
+    mov         x6,#32767
+    b           lbl39_1
+lbl39:
+    mov         x19,#-32768
+    cmp         x20,x19
+    csel        x6, x19, x20, lt
+lbl39_1:
+
+    mov         x9,x4
+    mov         x8,x4
+
+    // x6 has the dc_value
+    // x4 has the trans_size value
+    // x8 has the row value
+    // x9 has the col value
+    dup         v0.8h,w6
+    cmp         x4,#4
+    beq         row_loop_4
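+
+    // Loop structure in C terms (a sketch; the 4x4 case branches to
+    // row_loop_4 above and is handled separately below):
+    //   for(rows = trans_size; rows > 0; rows -= 8)
+    //       for(cols = trans_size; cols > 0; cols -= 8)
+    //           for each 8x8 block: dst[i][j] = CLIP_U8(pred[i][j] + dc);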
+
+
+row_loop:
+    mov         x9,x4
+
+
+col_loop:
+
+    mov         x7,x0
+    ld1         {v2.8b},[x7],x2
+    ld1         {v3.8b},[x7],x2
+    ld1         {v4.8b},[x7],x2
+    ld1         {v5.8b},[x7],x2
+
+    ld1         {v6.8b},[x7],x2
+    ld1         {v7.8b},[x7],x2
+    ld1         {v8.8b},[x7],x2
+    ld1         {v9.8b},[x7]
+
+    add         x0,x0,#8
+
+
+    uaddw       v30.8h,  v0.8h ,  v2.8b
+    uaddw       v28.8h,  v0.8h ,  v3.8b
+    uaddw       v26.8h,  v0.8h ,  v4.8b
+    uaddw       v24.8h,  v0.8h ,  v5.8b
+    uaddw       v22.8h,  v0.8h ,  v6.8b
+    uaddw       v20.8h,  v0.8h ,  v7.8b
+    uaddw       v18.8h,  v0.8h ,  v8.8b
+    uaddw       v16.8h,  v0.8h ,  v9.8b
+
+    mov         x11,x1
+    sqxtun      v2.8b, v30.8h
+    sqxtun      v3.8b, v28.8h
+    sqxtun      v4.8b, v26.8h
+    sqxtun      v5.8b, v24.8h
+    sqxtun      v6.8b, v22.8h
+    sqxtun      v7.8b, v20.8h
+    sqxtun      v8.8b, v18.8h
+    sqxtun      v9.8b, v16.8h
+
+
+    st1         {v2.2s},[x11],x3
+    st1         {v3.2s},[x11],x3
+    st1         {v4.2s},[x11],x3
+    st1         {v5.2s},[x11],x3
+    st1         {v6.2s},[x11],x3
+    st1         {v7.2s},[x11],x3
+    st1         {v8.2s},[x11],x3
+    st1         {v9.2s},[x11]
+
+    add         x1,x1,#8
+
+    subs        x9,x9,#8
+    bgt         col_loop
+
+    subs        x8,x8,#8
+
+    add         x0,x0,x2,lsl #3
+    add         x1,x1,x3,lsl #3
+    sub         x0,x0,x4
+    sub         x1,x1,x4
+    bgt         row_loop
+    b           end_loops
+
+
+row_loop_4:
+    mov         x9,x10
+
+
+col_loop_4:
+
+
+    ld1         {v2.8b},[x0],x2
+    ld1         {v3.8b},[x0],x2
+    ld1         {v4.8b},[x0],x2
+    ld1         {v5.8b},[x0]
+
+
+
+
+    uaddw       v30.8h,  v0.8h ,  v2.8b
+    uaddw       v28.8h,  v0.8h ,  v3.8b
+    uaddw       v26.8h,  v0.8h ,  v4.8b
+    uaddw       v24.8h,  v0.8h ,  v5.8b
+
+
+
+    sqxtun      v2.8b, v30.8h
+    sqxtun      v3.8b, v28.8h
+    sqxtun      v4.8b, v26.8h
+    sqxtun      v5.8b, v24.8h
+
+
+
+    st1         {v2.s}[0],[x1],x3
+    st1         {v3.s}[0],[x1],x3
+    st1         {v4.s}[0],[x1],x3
+    st1         {v5.s}[0],[x1]
+
+end_loops:
+    ldp         x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
+
diff --git a/decoder/ihevcd_api.c b/decoder/ihevcd_api.c
new file mode 100644
index 0000000..c55c558
--- /dev/null
+++ b/decoder/ihevcd_api.c
@@ -0,0 +1,4753 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_api.c
+*
+* @brief
+*  Contains api functions definitions for HEVC decoder
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+* - api_check_struct_sanity()
+* - ihevcd_get_version()
+* - ihevcd_set_default_params()
+* - ihevcd_init()
+* - ihevcd_get_num_rec()
+* - ihevcd_fill_num_mem_rec()
+* - ihevcd_init_mem_rec()
+* - ihevcd_retrieve_memrec()
+* - ihevcd_set_display_frame()
+* - ihevcd_set_flush_mode()
+* - ihevcd_get_status()
+* - ihevcd_get_buf_info()
+* - ihevcd_set_params()
+* - ihevcd_reset()
+* - ihevcd_rel_display_frame()
+* - ihevcd_disable_deblk()
+* - ihevcd_get_frame_dimensions()
+* - ihevcd_set_num_cores()
+* - ihevcd_ctl()
+* - ihevcd_cxa_api_function()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_trace.h"
+
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_decode.h"
+#include "ihevcd_job_queue.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+#include "ihevcd_statistics.h"
+
+/*****************************************************************************/
+/* Function Prototypes                                                       */
+/*****************************************************************************/
+IV_API_CALL_STATUS_T ihevcd_get_version(CHAR *pc_version_string,
+                                        UWORD32 u4_version_buffer_size);
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Checks the validity of the arguments for the corresponding API call
+*
+* @par Description:
+*  For each command, the input and output structures are validated
+*  (structure sizes, handles and parameter ranges)
+*
+* @param[in] ps_handle
+*  Codec handle at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input structure
+*
+* @param[out] pv_api_op
+*  Pointer to output structure
+*
+* @returns  Status of error checking
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+static IV_API_CALL_STATUS_T api_check_struct_sanity(iv_obj_t *ps_handle,
+                                                    void *pv_api_ip,
+                                                    void *pv_api_op)
+{
+    IVD_API_COMMAND_TYPE_T e_cmd;
+    UWORD32 *pu4_api_ip;
+    UWORD32 *pu4_api_op;
+    WORD32 i, j;
+
+    if(NULL == pv_api_op)
+        return (IV_FAIL);
+
+    if(NULL == pv_api_ip)
+        return (IV_FAIL);
+
+    pu4_api_ip = (UWORD32 *)pv_api_ip;
+    pu4_api_op = (UWORD32 *)pv_api_op;
+    e_cmd = (IVD_API_COMMAND_TYPE_T)*(pu4_api_ip + 1);
+
+    *(pu4_api_op + 1) = 0;
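+    /* Every IVD input/output structure starts with u4_size followed by
+     * e_cmd (inputs) or u4_error_code (outputs), so the command and the
+     * error word can be accessed generically via pu4_api_ip + 1 and
+     * pu4_api_op + 1 before the exact structure type is known. */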
+    /* error checks on handle */
+    switch((WORD32)e_cmd)
+    {
+        case IV_CMD_GET_NUM_MEM_REC:
+        case IV_CMD_FILL_NUM_MEM_REC:
+            break;
+        case IV_CMD_INIT:
+            if(ps_handle == NULL)
+            {
+                *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+                *(pu4_api_op + 1) |= IVD_HANDLE_NULL;
+                return IV_FAIL;
+            }
+
+            if(ps_handle->u4_size != sizeof(iv_obj_t))
+            {
+                *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+                *(pu4_api_op + 1) |= IVD_HANDLE_STRUCT_SIZE_INCORRECT;
+                DEBUG("Sizes do not match. Expected: %d, Got: %d",
+                                sizeof(iv_obj_t), ps_handle->u4_size);
+                return IV_FAIL;
+            }
+            break;
+        case IVD_CMD_REL_DISPLAY_FRAME:
+        case IVD_CMD_SET_DISPLAY_FRAME:
+        case IVD_CMD_GET_DISPLAY_FRAME:
+        case IVD_CMD_VIDEO_DECODE:
+        case IV_CMD_RETRIEVE_MEMREC:
+        case IVD_CMD_VIDEO_CTL:
+            if(ps_handle == NULL)
+            {
+                *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+                *(pu4_api_op + 1) |= IVD_HANDLE_NULL;
+                return IV_FAIL;
+            }
+
+            if(ps_handle->u4_size != sizeof(iv_obj_t))
+            {
+                *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+                *(pu4_api_op + 1) |= IVD_HANDLE_STRUCT_SIZE_INCORRECT;
+                return IV_FAIL;
+            }
+
+#if 0
+            if(ps_handle->pv_fxns != ihevcd_cxa_api_function)
+            {
+                *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+                *(pu4_api_op + 1) |= IVD_INVALID_HANDLE_NULL;
+                return IV_FAIL;
+            }
+#endif
+
+            if(ps_handle->pv_codec_handle == NULL)
+            {
+                *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+                *(pu4_api_op + 1) |= IVD_INVALID_HANDLE_NULL;
+                return IV_FAIL;
+            }
+            break;
+        default:
+            *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+            *(pu4_api_op + 1) |= IVD_INVALID_API_CMD;
+            return IV_FAIL;
+    }
+
+    switch((WORD32)e_cmd)
+    {
+        case IV_CMD_GET_NUM_MEM_REC:
+        {
+            ihevcd_cxa_num_mem_rec_ip_t *ps_ip =
+                            (ihevcd_cxa_num_mem_rec_ip_t *)pv_api_ip;
+            ihevcd_cxa_num_mem_rec_op_t *ps_op =
+                            (ihevcd_cxa_num_mem_rec_op_t *)pv_api_op;
+            ps_op->s_ivd_num_mem_rec_op_t.u4_error_code = 0;
+
+            if(ps_ip->s_ivd_num_mem_rec_ip_t.u4_size
+                            != sizeof(ihevcd_cxa_num_mem_rec_ip_t))
+            {
+                ps_op->s_ivd_num_mem_rec_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_num_mem_rec_op_t.u4_error_code |=
+                                IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+            if(ps_op->s_ivd_num_mem_rec_op_t.u4_size
+                            != sizeof(ihevcd_cxa_num_mem_rec_op_t))
+            {
+                ps_op->s_ivd_num_mem_rec_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_num_mem_rec_op_t.u4_error_code |=
+                                IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+        }
+            break;
+        case IV_CMD_FILL_NUM_MEM_REC:
+        {
+            ihevcd_cxa_fill_mem_rec_ip_t *ps_ip =
+                            (ihevcd_cxa_fill_mem_rec_ip_t *)pv_api_ip;
+            ihevcd_cxa_fill_mem_rec_op_t *ps_op =
+                            (ihevcd_cxa_fill_mem_rec_op_t *)pv_api_op;
+            iv_mem_rec_t *ps_mem_rec;
+            WORD32 max_wd = ps_ip->s_ivd_fill_mem_rec_ip_t.u4_max_frm_wd;
+            WORD32 max_ht = ps_ip->s_ivd_fill_mem_rec_ip_t.u4_max_frm_ht;
+
+            max_wd = ALIGN64(max_wd);
+            max_ht = ALIGN64(max_ht);
+
+            ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code = 0;
+
+            if((ps_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+                            > sizeof(ihevcd_cxa_fill_mem_rec_ip_t))
+                            || (ps_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+                                            < sizeof(iv_fill_mem_rec_ip_t)))
+            {
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+                                IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+            if((ps_op->s_ivd_fill_mem_rec_op_t.u4_size
+                            != sizeof(ihevcd_cxa_fill_mem_rec_op_t))
+                            && (ps_op->s_ivd_fill_mem_rec_op_t.u4_size
+                                            != sizeof(iv_fill_mem_rec_op_t)))
+            {
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+                                IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+            if(max_wd < MIN_WD)
+            {
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+                                IVD_REQUESTED_WIDTH_NOT_SUPPPORTED;
+                return (IV_FAIL);
+            }
+
+            if(max_wd > MAX_WD)
+            {
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+                                IVD_REQUESTED_WIDTH_NOT_SUPPPORTED;
+                return (IV_FAIL);
+            }
+
+            if(max_ht < MIN_HT)
+            {
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+                                IVD_REQUESTED_HEIGHT_NOT_SUPPPORTED;
+                return (IV_FAIL);
+            }
+
+            if((max_ht * max_wd) > (MAX_HT * MAX_WD))
+            {
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+                                IVD_REQUESTED_HEIGHT_NOT_SUPPPORTED;
+                return (IV_FAIL);
+            }
+
+            if(NULL == ps_ip->s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location)
+            {
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+                                IVD_NUM_REC_NOT_SUFFICIENT;
+                return (IV_FAIL);
+            }
+
+            /* check memrecords sizes are correct */
+            ps_mem_rec = ps_ip->s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location;
+            for(i = 0; i < MEM_REC_CNT; i++)
+            {
+                if(ps_mem_rec[i].u4_size != sizeof(iv_mem_rec_t))
+                {
+                    ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+                                    << IVD_UNSUPPORTEDPARAM;
+                    ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+                                    IVD_MEM_REC_STRUCT_SIZE_INCORRECT;
+                    return IV_FAIL;
+                }
+            }
+        }
+            break;
+
+        case IV_CMD_INIT:
+        {
+            ihevcd_cxa_init_ip_t *ps_ip = (ihevcd_cxa_init_ip_t *)pv_api_ip;
+            ihevcd_cxa_init_op_t *ps_op = (ihevcd_cxa_init_op_t *)pv_api_op;
+            iv_mem_rec_t *ps_mem_rec;
+            WORD32 max_wd = ps_ip->s_ivd_init_ip_t.u4_frm_max_wd;
+            WORD32 max_ht = ps_ip->s_ivd_init_ip_t.u4_frm_max_ht;
+
+            max_wd = ALIGN64(max_wd);
+            max_ht = ALIGN64(max_ht);
+
+            ps_op->s_ivd_init_op_t.u4_error_code = 0;
+
+            if((ps_ip->s_ivd_init_ip_t.u4_size > sizeof(ihevcd_cxa_init_ip_t))
+                            || (ps_ip->s_ivd_init_ip_t.u4_size
+                                            < sizeof(ivd_init_ip_t)))
+            {
+                ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_init_op_t.u4_error_code |=
+                                IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                DEBUG("\n");
+                return (IV_FAIL);
+            }
+
+            if((ps_op->s_ivd_init_op_t.u4_size != sizeof(ihevcd_cxa_init_op_t))
+                            && (ps_op->s_ivd_init_op_t.u4_size
+                                            != sizeof(ivd_init_op_t)))
+            {
+                ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_init_op_t.u4_error_code |=
+                                IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                DEBUG("\n");
+                return (IV_FAIL);
+            }
+
+            if(ps_ip->s_ivd_init_ip_t.u4_num_mem_rec != MEM_REC_CNT)
+            {
+                ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_init_op_t.u4_error_code |=
+                                IVD_INIT_DEC_NOT_SUFFICIENT;
+                DEBUG("\n");
+                return (IV_FAIL);
+            }
+
+            if(max_wd < MIN_WD)
+            {
+                ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_init_op_t.u4_error_code |=
+                                IVD_INIT_DEC_WIDTH_NOT_SUPPPORTED;
+                DEBUG("\n");
+                return (IV_FAIL);
+            }
+
+            if(max_wd > MAX_WD)
+            {
+                ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_init_op_t.u4_error_code |=
+                                IVD_INIT_DEC_WIDTH_NOT_SUPPPORTED;
+                DEBUG("\n");
+                return (IV_FAIL);
+            }
+
+            if(max_ht < MIN_HT)
+            {
+                ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_init_op_t.u4_error_code |=
+                                IVD_INIT_DEC_HEIGHT_NOT_SUPPPORTED;
+                DEBUG("\n");
+                return (IV_FAIL);
+            }
+
+            if((max_ht * max_wd) > (MAX_HT * MAX_WD))
+            {
+                ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_init_op_t.u4_error_code |=
+                                IVD_INIT_DEC_HEIGHT_NOT_SUPPPORTED;
+                DEBUG("\n");
+                return (IV_FAIL);
+            }
+
+            if(NULL == ps_ip->s_ivd_init_ip_t.pv_mem_rec_location)
+            {
+                ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_init_op_t.u4_error_code |=
+                                IVD_NUM_REC_NOT_SUFFICIENT;
+                DEBUG("\n");
+                return (IV_FAIL);
+            }
+
+            if((ps_ip->s_ivd_init_ip_t.e_output_format != IV_YUV_420P)
+                            && (ps_ip->s_ivd_init_ip_t.e_output_format
+                                            != IV_YUV_422ILE)
+                            && (ps_ip->s_ivd_init_ip_t.e_output_format
+                                            != IV_RGB_565)
+                            && (ps_ip->s_ivd_init_ip_t.e_output_format
+                                            != IV_RGBA_8888)
+                            && (ps_ip->s_ivd_init_ip_t.e_output_format
+                                            != IV_YUV_420SP_UV)
+                            && (ps_ip->s_ivd_init_ip_t.e_output_format
+                                            != IV_YUV_420SP_VU))
+            {
+                ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_init_op_t.u4_error_code |=
+                                IVD_INIT_DEC_COL_FMT_NOT_SUPPORTED;
+                DEBUG("\n");
+                return (IV_FAIL);
+            }
+
+            /* verify number of mem records */
+            if(ps_ip->s_ivd_init_ip_t.u4_num_mem_rec < MEM_REC_CNT)
+            {
+                ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_init_op_t.u4_error_code |=
+                                IVD_INIT_DEC_MEM_REC_NOT_SUFFICIENT;
+                DEBUG("\n");
+                return IV_FAIL;
+            }
+
+            ps_mem_rec = ps_ip->s_ivd_init_ip_t.pv_mem_rec_location;
+            /* check memrecords sizes are correct */
+            for(i = 0; i < (WORD32)ps_ip->s_ivd_init_ip_t.u4_num_mem_rec; i++)
+            {
+                if(ps_mem_rec[i].u4_size != sizeof(iv_mem_rec_t))
+                {
+                    ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                    << IVD_UNSUPPORTEDPARAM;
+                    ps_op->s_ivd_init_op_t.u4_error_code |=
+                                    IVD_MEM_REC_STRUCT_SIZE_INCORRECT;
+                    DEBUG("i: %d\n", i);
+                    return IV_FAIL;
+                }
+                /* check memrecords pointers are not NULL */
+
+                if(ps_mem_rec[i].pv_base == NULL)
+                {
+
+                    ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                    << IVD_UNSUPPORTEDPARAM;
+                    ps_op->s_ivd_init_op_t.u4_error_code |=
+                                    IVD_INIT_DEC_MEM_REC_BASE_NULL;
+                    DEBUG("i: %d\n", i);
+                    return IV_FAIL;
+
+                }
+
+            }
+
+            /* verify memtabs for overlapping regions */
+            {
+                void *start[MEM_REC_CNT];
+                void *end[MEM_REC_CNT];
+
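+                /* Two ranges [start_i, end_i] and [start_j, end_j] overlap
+                 * iff start_i <= end_j && start_j <= end_i; the three
+                 * checks below split this into start-inside, end-inside
+                 * and full-containment cases. */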
+                start[0] = (ps_mem_rec[0].pv_base);
+                end[0] = (UWORD8 *)(ps_mem_rec[0].pv_base)
+                                + ps_mem_rec[0].u4_mem_size - 1;
+                for(i = 1; i < MEM_REC_CNT; i++)
+                {
+                    /* This array is populated to check memtab overlap */
+                    start[i] = (ps_mem_rec[i].pv_base);
+                    end[i] = (UWORD8 *)(ps_mem_rec[i].pv_base)
+                                    + ps_mem_rec[i].u4_mem_size - 1;
+
+                    for(j = 0; j < i; j++)
+                    {
+                        if((start[i] >= start[j]) && (start[i] <= end[j]))
+                        {
+                            ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                            << IVD_UNSUPPORTEDPARAM;
+                            ps_op->s_ivd_init_op_t.u4_error_code |=
+                                            IVD_INIT_DEC_MEM_REC_OVERLAP_ERR;
+                            DEBUG("i: %d, j: %d\n", i, j);
+                            return IV_FAIL;
+                        }
+
+                        if((end[i] >= start[j]) && (end[i] <= end[j]))
+                        {
+                            ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                            << IVD_UNSUPPORTEDPARAM;
+                            ps_op->s_ivd_init_op_t.u4_error_code |=
+                                            IVD_INIT_DEC_MEM_REC_OVERLAP_ERR;
+                            DEBUG("i: %d, j: %d\n", i, j);
+                            return IV_FAIL;
+                        }
+
+                        if((start[i] < start[j]) && (end[i] > end[j]))
+                        {
+                            ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                            << IVD_UNSUPPORTEDPARAM;
+                            ps_op->s_ivd_init_op_t.u4_error_code |=
+                                            IVD_INIT_DEC_MEM_REC_OVERLAP_ERR;
+                            DEBUG("i: %d, j: %d\n", i, j);
+                            return IV_FAIL;
+                        }
+                    }
+
+                }
+            }
+
+            {
+                iv_mem_rec_t mem_rec_ittiam_api[MEM_REC_CNT];
+                ihevcd_cxa_fill_mem_rec_ip_t s_fill_mem_rec_ip;
+                ihevcd_cxa_fill_mem_rec_op_t s_fill_mem_rec_op;
+                IV_API_CALL_STATUS_T e_status;
+
+                WORD32 i;
+                s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.e_cmd =
+                                IV_CMD_FILL_NUM_MEM_REC;
+                s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location =
+                                mem_rec_ittiam_api;
+                s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_wd =
+                                max_wd;
+                s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_ht =
+                                max_ht;
+
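+                /* Fields of ihevcd_cxa_init_ip_t beyond the base
+                 * ivd_init_ip_t are optional extensions: the caller's
+                 * u4_size says how far its structure extends, so each
+                 * extended field below is read only when u4_size covers
+                 * its offset and is defaulted otherwise. */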
+                if(ps_ip->s_ivd_init_ip_t.u4_size
+                                > offsetof(ihevcd_cxa_init_ip_t, i4_level))
+                {
+                    s_fill_mem_rec_ip.i4_level = ps_ip->i4_level;
+                }
+                else
+                {
+                    s_fill_mem_rec_ip.i4_level = IHEVC_LEVEL_31;
+                }
+
+                if(ps_ip->s_ivd_init_ip_t.u4_size
+                                > offsetof(ihevcd_cxa_init_ip_t,
+                                           u4_num_ref_frames))
+                {
+                    s_fill_mem_rec_ip.u4_num_ref_frames =
+                                    ps_ip->u4_num_ref_frames;
+                }
+                else
+                {
+                    s_fill_mem_rec_ip.u4_num_ref_frames = (MAX_REF_CNT + 1);
+                }
+
+                if(ps_ip->s_ivd_init_ip_t.u4_size
+                                > offsetof(ihevcd_cxa_init_ip_t,
+                                           u4_num_reorder_frames))
+                {
+                    s_fill_mem_rec_ip.u4_num_reorder_frames =
+                                    ps_ip->u4_num_reorder_frames;
+                }
+                else
+                {
+                    s_fill_mem_rec_ip.u4_num_reorder_frames = (MAX_REF_CNT + 1);
+                }
+
+                if(ps_ip->s_ivd_init_ip_t.u4_size
+                                > offsetof(ihevcd_cxa_init_ip_t,
+                                           u4_num_extra_disp_buf))
+                {
+                    s_fill_mem_rec_ip.u4_num_extra_disp_buf =
+                                    ps_ip->u4_num_extra_disp_buf;
+                }
+                else
+                {
+                    s_fill_mem_rec_ip.u4_num_extra_disp_buf = 0;
+                }
+
+                if(ps_ip->s_ivd_init_ip_t.u4_size
+                                > offsetof(ihevcd_cxa_init_ip_t,
+                                           u4_share_disp_buf))
+                {
+#ifndef LOGO_EN
+                    s_fill_mem_rec_ip.u4_share_disp_buf =
+                                    ps_ip->u4_share_disp_buf;
+#else
+                    s_fill_mem_rec_ip.u4_share_disp_buf = 0;
+#endif
+                }
+                else
+                {
+                    s_fill_mem_rec_ip.u4_share_disp_buf = 0;
+                }
+
+                s_fill_mem_rec_ip.e_output_format =
+                                ps_ip->s_ivd_init_ip_t.e_output_format;
+
+                if((s_fill_mem_rec_ip.e_output_format != IV_YUV_420P)
+                                && (s_fill_mem_rec_ip.e_output_format
+                                                != IV_YUV_420SP_UV)
+                                && (s_fill_mem_rec_ip.e_output_format
+                                                != IV_YUV_420SP_VU))
+                {
+                    s_fill_mem_rec_ip.u4_share_disp_buf = 0;
+                }
+
+                s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_size =
+                                sizeof(ihevcd_cxa_fill_mem_rec_ip_t);
+                s_fill_mem_rec_op.s_ivd_fill_mem_rec_op_t.u4_size =
+                                sizeof(ihevcd_cxa_fill_mem_rec_op_t);
+
+                for(i = 0; i < MEM_REC_CNT; i++)
+                    mem_rec_ittiam_api[i].u4_size = sizeof(iv_mem_rec_t);
+
+                e_status = ihevcd_cxa_api_function(NULL,
+                                                   (void *)&s_fill_mem_rec_ip,
+                                                   (void *)&s_fill_mem_rec_op);
+                if(IV_FAIL == e_status)
+                {
+                    ps_op->s_ivd_init_op_t.u4_error_code =
+                                    s_fill_mem_rec_op.s_ivd_fill_mem_rec_op_t.u4_error_code;
+                    DEBUG("Fail\n");
+                    return (IV_FAIL);
+                }
+
+                for(i = 0; i < MEM_REC_CNT; i++)
+                {
+#ifdef ARMRVDS
+                    if((UWORD32)(ps_mem_rec[i].pv_base) & (mem_rec_ittiam_api[i].u4_mem_alignment - 1))
+                    {
+                        ps_op->s_ivd_init_op_t.u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_init_op_t.u4_error_code |= IVD_INIT_DEC_MEM_REC_ALIGNMENT_ERR;
+                        DEBUG("Fail\n");
+                        return IV_FAIL;
+                    }
+#endif
+
+                    if(ps_mem_rec[i].u4_mem_size
+                                    < mem_rec_ittiam_api[i].u4_mem_size)
+                    {
+                        ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_init_op_t.u4_error_code |=
+                                        IVD_INIT_DEC_MEM_REC_INSUFFICIENT_SIZE;
+                        DEBUG("i: %d \n", i);
+                        return IV_FAIL;
+                    }
+                    if(ps_mem_rec[i].u4_mem_alignment
+                                    != mem_rec_ittiam_api[i].u4_mem_alignment)
+                    {
+                        ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_init_op_t.u4_error_code |=
+                                        IVD_INIT_DEC_MEM_REC_ALIGNMENT_ERR;
+                        DEBUG("i: %d \n", i);
+                        return IV_FAIL;
+                    }
+                    if(ps_mem_rec[i].e_mem_type
+                                    != mem_rec_ittiam_api[i].e_mem_type)
+                    {
+                        UWORD32 check = IV_SUCCESS;
+                        UWORD32 diff = mem_rec_ittiam_api[i].e_mem_type
+                                        - ps_mem_rec[i].e_mem_type;
+
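+                        /* The checks below assume the IV_MEM_TYPE_T enum
+                         * ordering in iv.h, where scratch/persistent and
+                         * cacheable/non-cacheable variants are numbered
+                         * consecutively; 'diff' is the type the codec
+                         * requested minus the type the application supplied.
+                         */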
+                        if((ps_mem_rec[i].e_mem_type
+                                        <= IV_EXTERNAL_CACHEABLE_SCRATCH_MEM)
+                                        && (mem_rec_ittiam_api[i].e_mem_type
+                                                        >= IV_INTERNAL_NONCACHEABLE_PERSISTENT_MEM))
+                        {
+                            check = IV_FAIL;
+                        }
+                        if(3 != (mem_rec_ittiam_api[i].e_mem_type % 4))
+                        {
+                            /*
+                             * It is not IV_EXTERNAL_NONCACHEABLE_PERSISTENT_MEM or IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM
+                             */
+                            if((diff < 1) || (diff > 3))
+                            {
+                                /* A difference between 1 and 3 (inclusive) is
+                                 * acceptable for all types other than the two
+                                 * persistent types filtered out by the mod-4
+                                 * condition above
+                                 */
+                                check = IV_FAIL;
+                            }
+                        }
+                        else
+                        {
+                            if(diff == 1)
+                            {
+                                /*
+                                 * This particular case is when codec asked for External Persistent, but got
+                                 * Internal Scratch.
+                                 */
+                                check = IV_FAIL;
+                            }
+                            if((diff != 2) && (diff != 3))
+                            {
+                                check = IV_FAIL;
+                            }
+                        }
+                        if(check == IV_FAIL)
+                        {
+                            ps_op->s_ivd_init_op_t.u4_error_code |= 1
+                                            << IVD_UNSUPPORTEDPARAM;
+                            ps_op->s_ivd_init_op_t.u4_error_code |=
+                                            IVD_INIT_DEC_MEM_REC_INCORRECT_TYPE;
+                            DEBUG("i: %d \n", i);
+                            return IV_FAIL;
+                        }
+                    }
+                }
+            }
+
+        }
+            break;
+
+        case IVD_CMD_GET_DISPLAY_FRAME:
+        {
+            ihevcd_cxa_get_display_frame_ip_t *ps_ip =
+                            (ihevcd_cxa_get_display_frame_ip_t *)pv_api_ip;
+            ihevcd_cxa_get_display_frame_op_t *ps_op =
+                            (ihevcd_cxa_get_display_frame_op_t *)pv_api_op;
+
+            ps_op->s_ivd_get_display_frame_op_t.u4_error_code = 0;
+
+            if((ps_ip->s_ivd_get_display_frame_ip_t.u4_size
+                            != sizeof(ihevcd_cxa_get_display_frame_ip_t))
+                            && (ps_ip->s_ivd_get_display_frame_ip_t.u4_size
+                                            != sizeof(ivd_get_display_frame_ip_t)))
+            {
+                ps_op->s_ivd_get_display_frame_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_get_display_frame_op_t.u4_error_code |=
+                                IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+            if((ps_op->s_ivd_get_display_frame_op_t.u4_size
+                            != sizeof(ihevcd_cxa_get_display_frame_op_t))
+                            && (ps_op->s_ivd_get_display_frame_op_t.u4_size
+                                            != sizeof(ivd_get_display_frame_op_t)))
+            {
+                ps_op->s_ivd_get_display_frame_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_get_display_frame_op_t.u4_error_code |=
+                                IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+        }
+            break;
+
+        case IVD_CMD_REL_DISPLAY_FRAME:
+        {
+            ihevcd_cxa_rel_display_frame_ip_t *ps_ip =
+                            (ihevcd_cxa_rel_display_frame_ip_t *)pv_api_ip;
+            ihevcd_cxa_rel_display_frame_op_t *ps_op =
+                            (ihevcd_cxa_rel_display_frame_op_t *)pv_api_op;
+
+            ps_op->s_ivd_rel_display_frame_op_t.u4_error_code = 0;
+
+            if((ps_ip->s_ivd_rel_display_frame_ip_t.u4_size
+                            != sizeof(ihevcd_cxa_rel_display_frame_ip_t))
+                            && (ps_ip->s_ivd_rel_display_frame_ip_t.u4_size
+                                            != sizeof(ivd_rel_display_frame_ip_t)))
+            {
+                ps_op->s_ivd_rel_display_frame_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_rel_display_frame_op_t.u4_error_code |=
+                                IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+            if((ps_op->s_ivd_rel_display_frame_op_t.u4_size
+                            != sizeof(ihevcd_cxa_rel_display_frame_op_t))
+                            && (ps_op->s_ivd_rel_display_frame_op_t.u4_size
+                                            != sizeof(ivd_rel_display_frame_op_t)))
+            {
+                ps_op->s_ivd_rel_display_frame_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_rel_display_frame_op_t.u4_error_code |=
+                                IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+        }
+            break;
+
+        case IVD_CMD_SET_DISPLAY_FRAME:
+        {
+            ihevcd_cxa_set_display_frame_ip_t *ps_ip =
+                            (ihevcd_cxa_set_display_frame_ip_t *)pv_api_ip;
+            ihevcd_cxa_set_display_frame_op_t *ps_op =
+                            (ihevcd_cxa_set_display_frame_op_t *)pv_api_op;
+            UWORD32 j;
+
+            ps_op->s_ivd_set_display_frame_op_t.u4_error_code = 0;
+
+            if((ps_ip->s_ivd_set_display_frame_ip_t.u4_size
+                            != sizeof(ihevcd_cxa_set_display_frame_ip_t))
+                            && (ps_ip->s_ivd_set_display_frame_ip_t.u4_size
+                                            != sizeof(ivd_set_display_frame_ip_t)))
+            {
+                ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+                                IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+            if((ps_op->s_ivd_set_display_frame_op_t.u4_size
+                            != sizeof(ihevcd_cxa_set_display_frame_op_t))
+                            && (ps_op->s_ivd_set_display_frame_op_t.u4_size
+                                            != sizeof(ivd_set_display_frame_op_t)))
+            {
+                ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+                                IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+            if(ps_ip->s_ivd_set_display_frame_ip_t.num_disp_bufs == 0)
+            {
+                ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+                                IVD_DISP_FRM_ZERO_OP_BUFS;
+                return IV_FAIL;
+            }
+
+            for(j = 0; j < ps_ip->s_ivd_set_display_frame_ip_t.num_disp_bufs;
+                            j++)
+            {
+                if(ps_ip->s_ivd_set_display_frame_ip_t.s_disp_buffer[j].u4_num_bufs
+                                == 0)
+                {
+                    ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+                                    << IVD_UNSUPPORTEDPARAM;
+                    ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+                                    IVD_DISP_FRM_ZERO_OP_BUFS;
+                    return IV_FAIL;
+                }
+
+                for(i = 0;
+                                i
+                                                < (WORD32)ps_ip->s_ivd_set_display_frame_ip_t.s_disp_buffer[j].u4_num_bufs;
+                                i++)
+                {
+                    if(ps_ip->s_ivd_set_display_frame_ip_t.s_disp_buffer[j].pu1_bufs[i]
+                                    == NULL)
+                    {
+                        ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+                                        IVD_DISP_FRM_OP_BUF_NULL;
+                        return IV_FAIL;
+                    }
+
+                    if(ps_ip->s_ivd_set_display_frame_ip_t.s_disp_buffer[j].u4_min_out_buf_size[i]
+                                    == 0)
+                    {
+                        ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+                                        IVD_DISP_FRM_ZERO_OP_BUF_SIZE;
+                        return IV_FAIL;
+                    }
+                }
+            }
+        }
+            break;
+
+        case IVD_CMD_VIDEO_DECODE:
+        {
+            ihevcd_cxa_video_decode_ip_t *ps_ip =
+                            (ihevcd_cxa_video_decode_ip_t *)pv_api_ip;
+            ihevcd_cxa_video_decode_op_t *ps_op =
+                            (ihevcd_cxa_video_decode_op_t *)pv_api_op;
+
+            DEBUG("The input bytes is: %d",
+                            ps_ip->s_ivd_video_decode_ip_t.u4_num_Bytes);
+            ps_op->s_ivd_video_decode_op_t.u4_error_code = 0;
+
+            if(ps_ip->s_ivd_video_decode_ip_t.u4_size
+                            != sizeof(ihevcd_cxa_video_decode_ip_t)
+                            && ps_ip->s_ivd_video_decode_ip_t.u4_size
+                                            != offsetof(ivd_video_decode_ip_t,
+                                                        s_out_buffer))
+            {
+                ps_op->s_ivd_video_decode_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_video_decode_op_t.u4_error_code |=
+                                IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+            if(ps_op->s_ivd_video_decode_op_t.u4_size
+                            != sizeof(ihevcd_cxa_video_decode_op_t)
+                            && ps_op->s_ivd_video_decode_op_t.u4_size
+                                            != offsetof(ivd_video_decode_op_t,
+                                                        u4_output_present))
+            {
+                ps_op->s_ivd_video_decode_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_video_decode_op_t.u4_error_code |=
+                                IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+        }
+            break;
+
+        case IV_CMD_RETRIEVE_MEMREC:
+        {
+            ihevcd_cxa_retrieve_mem_rec_ip_t *ps_ip =
+                            (ihevcd_cxa_retrieve_mem_rec_ip_t *)pv_api_ip;
+            ihevcd_cxa_retrieve_mem_rec_op_t *ps_op =
+                            (ihevcd_cxa_retrieve_mem_rec_op_t *)pv_api_op;
+            iv_mem_rec_t *ps_mem_rec;
+
+            ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code = 0;
+
+            if(ps_ip->s_ivd_retrieve_mem_rec_ip_t.u4_size
+                            != sizeof(ihevcd_cxa_retrieve_mem_rec_ip_t))
+            {
+                ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |=
+                                IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+            if(ps_op->s_ivd_retrieve_mem_rec_op_t.u4_size
+                            != sizeof(ihevcd_cxa_retrieve_mem_rec_op_t))
+            {
+                ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |= 1
+                                << IVD_UNSUPPORTEDPARAM;
+                ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |=
+                                IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                return (IV_FAIL);
+            }
+
+            ps_mem_rec = ps_ip->s_ivd_retrieve_mem_rec_ip_t.pv_mem_rec_location;
+            /* check memrecords sizes are correct */
+            for(i = 0; i < MEM_REC_CNT; i++)
+            {
+                if(ps_mem_rec[i].u4_size != sizeof(iv_mem_rec_t))
+                {
+                    ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |= 1
+                                    << IVD_UNSUPPORTEDPARAM;
+                    ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |=
+                                    IVD_MEM_REC_STRUCT_SIZE_INCORRECT;
+                    return IV_FAIL;
+                }
+            }
+        }
+            break;
+
+        case IVD_CMD_VIDEO_CTL:
+        {
+            UWORD32 *pu4_ptr_cmd;
+            UWORD32 sub_command;
+
+            pu4_ptr_cmd = (UWORD32 *)pv_api_ip;
+            pu4_ptr_cmd += 2;
+            sub_command = *pu4_ptr_cmd;
+
+            switch(sub_command)
+            {
+                case IVD_CMD_CTL_SETPARAMS:
+                {
+                    ihevcd_cxa_ctl_set_config_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_set_config_op_t *ps_op;
+                    ps_ip = (ihevcd_cxa_ctl_set_config_ip_t *)pv_api_ip;
+                    ps_op = (ihevcd_cxa_ctl_set_config_op_t *)pv_api_op;
+
+                    if(ps_ip->s_ivd_ctl_set_config_ip_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_set_config_ip_t))
+                    {
+                        ps_op->s_ivd_ctl_set_config_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_set_config_op_t.u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                }
+                    /* no break: fall-through to IVD_CMD_CTL_SETDEFAULT is intentional */
+                case IVD_CMD_CTL_SETDEFAULT:
+                {
+                    ihevcd_cxa_ctl_set_config_op_t *ps_op;
+                    ps_op = (ihevcd_cxa_ctl_set_config_op_t *)pv_api_op;
+                    if(ps_op->s_ivd_ctl_set_config_op_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_set_config_op_t))
+                    {
+                        ps_op->s_ivd_ctl_set_config_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_set_config_op_t.u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                }
+                    break;
+
+                case IVD_CMD_CTL_GETPARAMS:
+                {
+                    ihevcd_cxa_ctl_getstatus_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_getstatus_op_t *ps_op;
+
+                    ps_ip = (ihevcd_cxa_ctl_getstatus_ip_t *)pv_api_ip;
+                    ps_op = (ihevcd_cxa_ctl_getstatus_op_t *)pv_api_op;
+                    if(ps_ip->s_ivd_ctl_getstatus_ip_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_getstatus_ip_t))
+                    {
+                        ps_op->s_ivd_ctl_getstatus_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_getstatus_op_t.u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                    if(ps_op->s_ivd_ctl_getstatus_op_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_getstatus_op_t))
+                    {
+                        ps_op->s_ivd_ctl_getstatus_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_getstatus_op_t.u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                }
+                    break;
+
+                case IVD_CMD_CTL_GETBUFINFO:
+                {
+                    ihevcd_cxa_ctl_getbufinfo_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_getbufinfo_op_t *ps_op;
+                    ps_ip = (ihevcd_cxa_ctl_getbufinfo_ip_t *)pv_api_ip;
+                    ps_op = (ihevcd_cxa_ctl_getbufinfo_op_t *)pv_api_op;
+
+                    if(ps_ip->s_ivd_ctl_getbufinfo_ip_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_getbufinfo_ip_t))
+                    {
+                        ps_op->s_ivd_ctl_getbufinfo_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_getbufinfo_op_t.u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                    if(ps_op->s_ivd_ctl_getbufinfo_op_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_getbufinfo_op_t))
+                    {
+                        ps_op->s_ivd_ctl_getbufinfo_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_getbufinfo_op_t.u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                }
+                    break;
+
+                case IVD_CMD_CTL_GETVERSION:
+                {
+                    ihevcd_cxa_ctl_getversioninfo_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_getversioninfo_op_t *ps_op;
+                    ps_ip = (ihevcd_cxa_ctl_getversioninfo_ip_t *)pv_api_ip;
+                    ps_op = (ihevcd_cxa_ctl_getversioninfo_op_t *)pv_api_op;
+                    if(ps_ip->s_ivd_ctl_getversioninfo_ip_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_getversioninfo_ip_t))
+                    {
+                        ps_op->s_ivd_ctl_getversioninfo_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_getversioninfo_op_t.u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                    if(ps_op->s_ivd_ctl_getversioninfo_op_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_getversioninfo_op_t))
+                    {
+                        ps_op->s_ivd_ctl_getversioninfo_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_getversioninfo_op_t.u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                }
+                    break;
+
+                case IVD_CMD_CTL_FLUSH:
+                {
+                    ihevcd_cxa_ctl_flush_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_flush_op_t *ps_op;
+                    ps_ip = (ihevcd_cxa_ctl_flush_ip_t *)pv_api_ip;
+                    ps_op = (ihevcd_cxa_ctl_flush_op_t *)pv_api_op;
+                    if(ps_ip->s_ivd_ctl_flush_ip_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_flush_ip_t))
+                    {
+                        ps_op->s_ivd_ctl_flush_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_flush_op_t.u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                    if(ps_op->s_ivd_ctl_flush_op_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_flush_op_t))
+                    {
+                        ps_op->s_ivd_ctl_flush_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_flush_op_t.u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                }
+                    break;
+
+                case IVD_CMD_CTL_RESET:
+                {
+                    ihevcd_cxa_ctl_reset_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_reset_op_t *ps_op;
+                    ps_ip = (ihevcd_cxa_ctl_reset_ip_t *)pv_api_ip;
+                    ps_op = (ihevcd_cxa_ctl_reset_op_t *)pv_api_op;
+                    if(ps_ip->s_ivd_ctl_reset_ip_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_reset_ip_t))
+                    {
+                        ps_op->s_ivd_ctl_reset_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_reset_op_t.u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                    if(ps_op->s_ivd_ctl_reset_op_t.u4_size
+                                    != sizeof(ihevcd_cxa_ctl_reset_op_t))
+                    {
+                        ps_op->s_ivd_ctl_reset_op_t.u4_error_code |= 1
+                                        << IVD_UNSUPPORTEDPARAM;
+                        ps_op->s_ivd_ctl_reset_op_t.u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                }
+                    break;
+                case IHEVCD_CXA_CMD_CTL_DEGRADE:
+                {
+                    ihevcd_cxa_ctl_degrade_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_degrade_op_t *ps_op;
+
+                    ps_ip = (ihevcd_cxa_ctl_degrade_ip_t *)pv_api_ip;
+                    ps_op = (ihevcd_cxa_ctl_degrade_op_t *)pv_api_op;
+
+                    if(ps_ip->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_degrade_ip_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+
+                    if(ps_op->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_degrade_op_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+
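+                    /* The range checks below allow up to 4 degrade pics and a
+                     * degrade type in 0..15, which reads like a 4-bit mask of
+                     * degrade tools; that interpretation of the bounds is an
+                     * assumption, not documented behaviour.
+                     */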
+                    if((ps_ip->i4_degrade_pics < 0) ||
+                       (ps_ip->i4_degrade_pics > 4) ||
+                       (ps_ip->i4_nondegrade_interval < 0) ||
+                       (ps_ip->i4_degrade_type < 0) ||
+                       (ps_ip->i4_degrade_type > 15))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        return IV_FAIL;
+                    }
+
+                    break;
+                }
+
+                case IHEVCD_CXA_CMD_CTL_GET_BUFFER_DIMENSIONS:
+                {
+                    ihevcd_cxa_ctl_get_frame_dimensions_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_get_frame_dimensions_op_t *ps_op;
+
+                    ps_ip =
+                                    (ihevcd_cxa_ctl_get_frame_dimensions_ip_t *)pv_api_ip;
+                    ps_op =
+                                    (ihevcd_cxa_ctl_get_frame_dimensions_op_t *)pv_api_op;
+
+                    if(ps_ip->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_get_frame_dimensions_ip_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+
+                    if(ps_op->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_get_frame_dimensions_op_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+
+                    break;
+                }
+
+                case IHEVCD_CXA_CMD_CTL_GET_VUI_PARAMS:
+                {
+                    ihevcd_cxa_ctl_get_vui_params_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_get_vui_params_op_t *ps_op;
+
+                    ps_ip =
+                                    (ihevcd_cxa_ctl_get_vui_params_ip_t *)pv_api_ip;
+                    ps_op =
+                                    (ihevcd_cxa_ctl_get_vui_params_op_t *)pv_api_op;
+
+                    if(ps_ip->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_get_vui_params_ip_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+
+                    if(ps_op->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_get_vui_params_op_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+
+                    break;
+                }
+                case IHEVCD_CXA_CMD_CTL_SET_NUM_CORES:
+                {
+                    ihevcd_cxa_ctl_set_num_cores_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_set_num_cores_op_t *ps_op;
+
+                    ps_ip = (ihevcd_cxa_ctl_set_num_cores_ip_t *)pv_api_ip;
+                    ps_op = (ihevcd_cxa_ctl_set_num_cores_op_t *)pv_api_op;
+
+                    if(ps_ip->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_set_num_cores_ip_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+
+                    if(ps_op->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_set_num_cores_op_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+
+#ifdef MULTICORE
+                    if((ps_ip->u4_num_cores < 1) || (ps_ip->u4_num_cores > MAX_NUM_CORES))
+#else
+                    if(ps_ip->u4_num_cores != 1)
+#endif
+                        {
+                            ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                            return IV_FAIL;
+                        }
+                    break;
+                }
+                case IHEVCD_CXA_CMD_CTL_SET_PROCESSOR:
+                {
+                    ihevcd_cxa_ctl_set_processor_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_set_processor_op_t *ps_op;
+
+                    ps_ip = (ihevcd_cxa_ctl_set_processor_ip_t *)pv_api_ip;
+                    ps_op = (ihevcd_cxa_ctl_set_processor_op_t *)pv_api_op;
+
+                    if(ps_ip->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_set_processor_ip_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+
+                    if(ps_op->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_set_processor_op_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+
+                    break;
+                }
+#ifdef GPU_BUILD
+                case IHEVCD_CXA_CMD_CTL_GPU_ENABLE_DISABLE:
+                {
+                    ihevcd_cxa_ctl_gpu_enable_diable_ip_t *ps_ip;
+                    ihevcd_cxa_ctl_gpu_enable_diable_op_t *ps_op;
+
+                    ps_ip = (ihevcd_cxa_ctl_gpu_enable_diable_ip_t *)pv_api_ip;
+                    ps_op = (ihevcd_cxa_ctl_gpu_enable_diable_op_t *)pv_api_op;
+
+                    if(ps_ip->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_gpu_enable_diable_ip_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_IP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+
+                    if(ps_op->u4_size
+                                    != sizeof(ihevcd_cxa_ctl_gpu_enable_diable_op_t))
+                    {
+                        ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                        ps_op->u4_error_code |=
+                                        IVD_OP_API_STRUCT_SIZE_INCORRECT;
+                        return IV_FAIL;
+                    }
+                    break;
+                }
+#endif
+                default:
+                    *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+                    *(pu4_api_op + 1) |= IVD_UNSUPPORTED_API_CMD;
+                    return IV_FAIL;
+            }
+        }
+            break;
+        default:
+            *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+            *(pu4_api_op + 1) |= IVD_UNSUPPORTED_API_CMD;
+            return IV_FAIL;
+    }
+
+    return IV_SUCCESS;
+}
+
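+/* A minimal startup sketch (illustrative only), assuming the standard iv/ivd
+ * three-step bring-up that the argument checks above validate:
+ * IV_CMD_GET_NUM_MEM_REC, then IV_CMD_FILL_NUM_MEM_REC, then IV_CMD_INIT.
+ * Structure and command names are real API identifiers; record allocation and
+ * error handling are elided.
+ *
+ *     iv_num_mem_rec_ip_t s_num_ip = {0};
+ *     iv_num_mem_rec_op_t s_num_op = {0};
+ *     s_num_ip.u4_size = sizeof(s_num_ip);
+ *     s_num_op.u4_size = sizeof(s_num_op);
+ *     s_num_ip.e_cmd = IV_CMD_GET_NUM_MEM_REC;
+ *     ihevcd_cxa_api_function(NULL, (void *)&s_num_ip, (void *)&s_num_op);
+ *     // allocate s_num_op.u4_num_mem_rec iv_mem_rec_t records, then issue
+ *     // IV_CMD_FILL_NUM_MEM_REC and IV_CMD_INIT with those records
+ */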
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Sets default dynamic parameters
+*
+* @par Description:
+*  Sets default dynamic parameters. Called from ihevcd_init() so that the
+* codec continues to work even if set_params is never called
+*
+* @param[in] ps_codec
+*  Codec context pointer
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_set_default_params(codec_t *ps_codec)
+{
+
+    WORD32 ret = IV_SUCCESS;
+
+    ps_codec->e_pic_skip_mode = IVD_SKIP_NONE;
+    ps_codec->i4_strd = 0;
+    ps_codec->i4_disp_strd = 0;
+    ps_codec->i4_header_mode = 0;
+    ps_codec->e_pic_out_order = IVD_DISPLAY_FRAME_OUT;
+    return ret;
+}
+
+void ihevcd_update_function_ptr(codec_t *ps_codec)
+{
+
+    /* Init inter pred function array */
+    ps_codec->apf_inter_pred[0] = NULL;
+    ps_codec->apf_inter_pred[1] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr;
+    ps_codec->apf_inter_pred[2] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr;
+    ps_codec->apf_inter_pred[3] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr;
+    ps_codec->apf_inter_pred[4] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr;
+    ps_codec->apf_inter_pred[5] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr;
+    ps_codec->apf_inter_pred[6] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr;
+    ps_codec->apf_inter_pred[7] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr;
+    ps_codec->apf_inter_pred[8] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr;
+    ps_codec->apf_inter_pred[9] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr;
+    ps_codec->apf_inter_pred[10] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr;
+    ps_codec->apf_inter_pred[11] = NULL;
+    ps_codec->apf_inter_pred[12] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr;
+    ps_codec->apf_inter_pred[13] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr;
+    ps_codec->apf_inter_pred[14] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr;
+    ps_codec->apf_inter_pred[15] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr;
+    ps_codec->apf_inter_pred[16] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr;
+    ps_codec->apf_inter_pred[17] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr;
+    ps_codec->apf_inter_pred[18] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr;
+    ps_codec->apf_inter_pred[19] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr;
+    ps_codec->apf_inter_pred[20] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr;
+    ps_codec->apf_inter_pred[21] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr;
+
+    /* Init intra pred function array */
+    ps_codec->apf_intra_pred_luma[0] = (pf_intra_pred)NULL;
+    ps_codec->apf_intra_pred_luma[1] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr;
+    ps_codec->apf_intra_pred_luma[2] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr;
+    ps_codec->apf_intra_pred_luma[3] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr;
+    ps_codec->apf_intra_pred_luma[4] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr;
+    ps_codec->apf_intra_pred_luma[5] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr;
+    ps_codec->apf_intra_pred_luma[6] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr;
+    ps_codec->apf_intra_pred_luma[7] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr;
+    ps_codec->apf_intra_pred_luma[8] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr;
+    ps_codec->apf_intra_pred_luma[9] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr;
+    ps_codec->apf_intra_pred_luma[10] =  (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr;
+
+    ps_codec->apf_intra_pred_chroma[0] = (pf_intra_pred)NULL;
+    ps_codec->apf_intra_pred_chroma[1] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr;
+    ps_codec->apf_intra_pred_chroma[2] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr;
+    ps_codec->apf_intra_pred_chroma[3] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr;
+    ps_codec->apf_intra_pred_chroma[4] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr;
+    ps_codec->apf_intra_pred_chroma[5] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr;
+    ps_codec->apf_intra_pred_chroma[6] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr;
+    ps_codec->apf_intra_pred_chroma[7] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr;
+    ps_codec->apf_intra_pred_chroma[8] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr;
+    ps_codec->apf_intra_pred_chroma[9] =  (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr;
+    ps_codec->apf_intra_pred_chroma[10] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr;
+
+    /* Init itrans_recon function array */
+    ps_codec->apf_itrans_recon[0] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr;
+    ps_codec->apf_itrans_recon[1] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr;
+    ps_codec->apf_itrans_recon[2] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr;
+    ps_codec->apf_itrans_recon[3] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr;
+    ps_codec->apf_itrans_recon[4] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr;
+    ps_codec->apf_itrans_recon[5] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr;
+    ps_codec->apf_itrans_recon[6] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr;
+    ps_codec->apf_itrans_recon[7] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr;
+
+    /* Init recon function array */
+    ps_codec->apf_recon[0] = (pf_recon)ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr;
+    ps_codec->apf_recon[1] = (pf_recon)ps_codec->s_func_selector.ihevc_recon_4x4_fptr;
+    ps_codec->apf_recon[2] = (pf_recon)ps_codec->s_func_selector.ihevc_recon_8x8_fptr;
+    ps_codec->apf_recon[3] = (pf_recon)ps_codec->s_func_selector.ihevc_recon_16x16_fptr;
+    ps_codec->apf_recon[4] = (pf_recon)ps_codec->s_func_selector.ihevc_recon_32x32_fptr;
+    ps_codec->apf_recon[5] = (pf_recon)ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr;
+    ps_codec->apf_recon[6] = (pf_recon)ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr;
+    ps_codec->apf_recon[7] = (pf_recon)ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr;
+
+    /* Init itrans_recon_dc function array */
+    ps_codec->apf_itrans_recon_dc[0] = (pf_itrans_recon_dc)ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr;
+    ps_codec->apf_itrans_recon_dc[1] = (pf_itrans_recon_dc)ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr;
+
+    /* Init sao function array */
+    ps_codec->apf_sao_luma[0] = (pf_sao_luma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr;
+    ps_codec->apf_sao_luma[1] = (pf_sao_luma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr;
+    ps_codec->apf_sao_luma[2] = (pf_sao_luma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr;
+    ps_codec->apf_sao_luma[3] = (pf_sao_luma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr;
+
+    ps_codec->apf_sao_chroma[0] = (pf_sao_chroma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr;
+    ps_codec->apf_sao_chroma[1] = (pf_sao_chroma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr;
+    ps_codec->apf_sao_chroma[2] = (pf_sao_chroma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr;
+    ps_codec->apf_sao_chroma[3] = (pf_sao_chroma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr;
+}
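+
+/* Illustrative dispatch sketch (not library code): the tables above are
+ * consumed by indexed calls. apf_inter_pred[0] and apf_inter_pred[11] are
+ * intentionally NULL, so callers must select a valid index; the mapping from
+ * prediction parameters to 'idx' is assumed here and lives in the actual
+ * prediction code.
+ *
+ *     pf_inter_pred pf_pred = ps_codec->apf_inter_pred[idx];
+ *     if(NULL != pf_pred)
+ *         ... invoke pf_pred with the block's source/destination arguments ...
+ */
+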
+/**
+*******************************************************************************
+*
+* @brief
+*  Initializes the context. This is called from init_mem_rec and during
+* reset
+*
+* @par Description:
+*  Initializes the context
+*
+* @param[in] ps_codec
+*  Codec context pointer
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_init(codec_t *ps_codec)
+{
+    WORD32 status = IV_SUCCESS;
+    WORD32 i;
+
+
+    ps_codec->i4_num_disp_bufs = 1;
+    ps_codec->i4_flush_mode = 0;
+
+    ps_codec->i4_ht = ps_codec->i4_disp_ht = ps_codec->i4_max_ht;
+    ps_codec->i4_wd = ps_codec->i4_disp_wd = ps_codec->i4_max_wd;
+    ps_codec->i4_strd = 0;
+    ps_codec->i4_disp_strd = 0;
+    ps_codec->i4_num_cores = 1;
+
+    ps_codec->u4_pic_cnt = 0;
+    ps_codec->u4_disp_cnt = 0;
+
+    ps_codec->i4_header_mode = 0;
+    ps_codec->i4_header_in_slice_mode = 0;
+    ps_codec->i4_sps_done = 0;
+    ps_codec->i4_pps_done = 0;
+    ps_codec->i4_init_done   = 1;
+    ps_codec->i4_first_pic_done = 0;
+    ps_codec->s_parse.i4_first_pic_init = 0;
+    ps_codec->i4_error_code = 0;
+    ps_codec->i4_reset_flag = 0;
+
+    ps_codec->i4_prev_poc_msb = 0;
+    ps_codec->i4_prev_poc_lsb = -1;
+    ps_codec->i4_max_prev_poc_lsb = -1;
+    ps_codec->s_parse.i4_abs_pic_order_cnt = -1;
+
+    /* Set ref chroma format by default to 420SP UV interleaved */
+    ps_codec->e_ref_chroma_fmt = IV_YUV_420SP_UV;
+
+#ifdef GPU_BUILD
+#ifndef FRAME_STAGGER_ONLY
+    /* Flag to switch MC between GPU and CPU. The GPU-disabled path is
+     * not tested. TODO: move this flag to the dynamic parameters.
+     * GPU is disabled by default; the app has to enable it through a
+     * control call.
+     */
+    ps_codec->u4_gpu_enabled = 0;
+#else
+    ps_codec->u4_gpu_enabled = 0;
+#endif
+
+    ps_codec->u4_parsing_view = 0;
+
+#endif
+    /* If the codec is in shared mode and the required format is 420SP VU
+     * interleaved, then change the reference buffers' chroma format
+     */
+    if(IV_YUV_420SP_VU == ps_codec->e_chroma_fmt)
+    {
+        ps_codec->e_ref_chroma_fmt = IV_YUV_420SP_VU;
+    }
+
+
+
+    ps_codec->i4_disable_deblk_pic = 0;
+
+    ps_codec->i4_degrade_pic_cnt    = 0;
+    ps_codec->i4_degrade_pics       = 0;
+    ps_codec->i4_degrade_type       = 0;
+    ps_codec->i4_disable_sao_pic    = 0;
+    ps_codec->i4_fullpel_inter_pred = 0;
+    ps_codec->u4_enable_fmt_conv_ahead = 0;
+
+    {
+        sps_t *ps_sps = ps_codec->ps_sps_base;
+        pps_t *ps_pps = ps_codec->ps_pps_base;
+
+        for(i = 0; i < MAX_SPS_CNT; i++)
+        {
+            ps_sps->i1_sps_valid = 0;
+            ps_sps++;
+        }
+
+        for(i = 0; i < MAX_PPS_CNT; i++)
+        {
+            ps_pps->i1_pps_valid = 0;
+            ps_pps++;
+        }
+    }
+
+    ihevcd_set_default_params(ps_codec);
+    ps_codec->pv_proc_jobq = ihevcd_jobq_init(ps_codec->pv_proc_jobq_buf, ps_codec->i4_proc_jobq_buf_size);
+    RETURN_IF((ps_codec->pv_proc_jobq == NULL), IV_FAIL);
+
+    /* Update the jobq context to all the threads */
+    ps_codec->s_parse.pv_proc_jobq = ps_codec->pv_proc_jobq;
+    for(i = 0; i < MAX_PROCESS_THREADS; i++)
+    {
+        ps_codec->as_process[i].pv_proc_jobq = ps_codec->pv_proc_jobq;
+        ps_codec->as_process[i].i4_id = i;
+        ps_codec->as_process[i].ps_codec = ps_codec;
+
+        /* Set the following to zero assuming it is a single core solution
+         * When threads are launched these will be set appropriately
+         */
+        ps_codec->as_process[i].i4_check_parse_status = 0;
+        ps_codec->as_process[i].i4_check_proc_status = 0;
+    }
+    /* Initialize MV Bank buffer manager */
+    ihevc_buf_mgr_init((buf_mgr_t *)ps_codec->pv_mv_buf_mgr);
+
+    /* Initialize Picture buffer manager */
+    ihevc_buf_mgr_init((buf_mgr_t *)ps_codec->pv_pic_buf_mgr);
+
+    ps_codec->ps_pic_buf = (pic_buf_t *)ps_codec->pv_pic_buf_base;
+
+    memset(ps_codec->ps_pic_buf, 0, BUF_MGR_MAX_CNT  * sizeof(pic_buf_t));
+
+
+
+    /* Initialize display buffer manager */
+    ihevc_disp_mgr_init((disp_mgr_t *)ps_codec->pv_disp_buf_mgr);
+
+    /* Initialize dpb manager */
+    ihevc_dpb_mgr_init((dpb_mgr_t *)ps_codec->pv_dpb_mgr);
+
+    ps_codec->e_processor_soc = SOC_GENERIC;
+    /* The following can be overridden using the soc parameter as a hack */
+    ps_codec->u4_nctb = 0x7FFFFFFF;
+    ihevcd_init_arch(ps_codec);
+
+    ihevcd_init_function_ptr(ps_codec);
+
+    ihevcd_update_function_ptr(ps_codec);
+
+    return status;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Gets number of memory records required by the codec
+*
+* @par Description:
+*  Gets the codec's mem record requirements and adds the concealment
+* module's requirements
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_num_rec(void *pv_api_ip, void *pv_api_op)
+{
+
+    iv_num_mem_rec_op_t *ps_mem_q_op;
+
+    UNUSED(pv_api_ip);
+    ps_mem_q_op = (iv_num_mem_rec_op_t *)pv_api_op;
+    ps_mem_q_op->u4_num_mem_rec = MEM_REC_CNT;
+    DEBUG("Get num mem records without concealment %d\n",
+                    ps_mem_q_op->u4_num_mem_rec);
+#ifdef APPLY_CONCEALMENT
+    {
+        IV_API_CALL_STATUS_T status;
+        icncl_num_mem_rec_ip_t cncl_mem_ip;
+        icncl_num_mem_rec_op_t cncl_mem_op;
+
+        cncl_mem_ip.s_ivd_num_rec_ip_t.e_cmd = IV_CMD_GET_NUM_MEM_REC;
+        cncl_mem_ip.s_ivd_num_rec_ip_t.u4_size = sizeof(icncl_num_mem_rec_ip_t);
+
+        status = icncl_api_function(NULL, (void *)&cncl_mem_ip, (void *)&cncl_mem_op);
+
+        if(status == IV_SUCCESS)
+        {
+            /* Add the concealment library's memory requirements */
+            ps_mem_q_op->u4_num_mem_rec += cncl_mem_op.s_ivd_num_mem_rec_op_t.u4_num_mem_rec;
+            DEBUG("Get num mem records %d\n", ps_mem_q_op->u4_num_mem_rec);
+            return status; /* Nothing else to do, return */
+        }
+        else
+        {
+            /*
+             * Something went wrong with the concealment library call.
+             */
+            DEBUG("ERROR: Get num mem records %d\n", ps_mem_q_op->u4_num_mem_rec);
+            return status;
+        }
+
+    }
+#endif //APPLY_CONCEALMENT
+
+
+    return IV_SUCCESS;
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Fills memory requirements of the codec
+*
+* @par Description:
+*  Gets the codec's mem record requirements and adds the concealment
+* module's requirements
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_fill_num_mem_rec(void *pv_api_ip, void *pv_api_op)
+{
+
+    ihevcd_cxa_fill_mem_rec_ip_t *ps_mem_q_ip;
+    ihevcd_cxa_fill_mem_rec_op_t *ps_mem_q_op;
+    WORD32 level;
+    WORD32 num_reorder_frames;
+    WORD32 num_ref_frames;
+    WORD32 num_extra_disp_bufs;
+    WORD32 max_dpb_size;
+
+    iv_mem_rec_t *ps_mem_rec;
+    iv_mem_rec_t *ps_mem_rec_base;
+    WORD32 no_of_mem_rec_filled;
+    WORD32 chroma_format, share_disp_buf;
+    WORD32 max_ctb_cnt;
+    WORD32 max_wd_luma, max_wd_chroma;
+    WORD32 max_ht_luma, max_ht_chroma;
+    WORD32 max_tile_cols, max_tile_rows;
+    WORD32 max_ctb_rows, max_ctb_cols;
+    WORD32 max_num_cu_cols;
+    WORD32 i;
+    WORD32 max_num_4x4_cols;
+    IV_API_CALL_STATUS_T status = IV_SUCCESS;
+    no_of_mem_rec_filled = 0;
+
+    //TODO: Remove as and when the following are used
+    UNUSED(num_extra_disp_bufs);
+    UNUSED(no_of_mem_rec_filled);
+    UNUSED(max_wd_chroma);
+    UNUSED(max_ht_chroma);
+
+    ps_mem_q_ip = (ihevcd_cxa_fill_mem_rec_ip_t *)pv_api_ip;
+    ps_mem_q_op = (ihevcd_cxa_fill_mem_rec_op_t *)pv_api_op;
+
+    if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+                    > offsetof(ihevcd_cxa_fill_mem_rec_ip_t, i4_level))
+    {
+        level = ps_mem_q_ip->i4_level;
+        /* The spec expresses level as general_level_idc = level * 30, while
+         * the API passes level * 10 to stay consistent with H264. Multiply
+         * by 3 here to convert from the API scale to the spec scale.
+         */
+        level *= 3;
+    }
+    else
+    {
+        level = MAX_LEVEL;
+    }
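+
+    /* Worked example: API level 51 (level 5.1 on the 10x API scale) becomes
+     * 51 * 3 = 153, which equals 5.1 * 30, the scale the spec uses for
+     * general_level_idc.
+     */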
+
+    if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+                    > offsetof(ihevcd_cxa_fill_mem_rec_ip_t,
+                               u4_num_reorder_frames))
+    {
+        num_reorder_frames = ps_mem_q_ip->u4_num_reorder_frames;
+    }
+    else
+    {
+        num_reorder_frames = MAX_REF_CNT;
+    }
+
+    if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+                    > offsetof(ihevcd_cxa_fill_mem_rec_ip_t, u4_num_ref_frames))
+    {
+        num_ref_frames = ps_mem_q_ip->u4_num_ref_frames;
+    }
+    else
+    {
+        num_ref_frames = MAX_REF_CNT;
+    }
+
+    if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+                    > offsetof(ihevcd_cxa_fill_mem_rec_ip_t,
+                               u4_num_extra_disp_buf))
+    {
+        num_extra_disp_bufs = ps_mem_q_ip->u4_num_extra_disp_buf;
+    }
+    else
+    {
+        num_extra_disp_bufs = 0;
+    }
+
+    if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+                    > offsetof(ihevcd_cxa_fill_mem_rec_ip_t, u4_share_disp_buf))
+    {
+#ifndef LOGO_EN
+        share_disp_buf = ps_mem_q_ip->u4_share_disp_buf;
+#else
+        share_disp_buf = 0;
+#endif
+    }
+    else
+    {
+        share_disp_buf = 0;
+    }
+
+    if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+                    > offsetof(ihevcd_cxa_fill_mem_rec_ip_t, e_output_format))
+    {
+        chroma_format = ps_mem_q_ip->e_output_format;
+    }
+    else
+    {
+        chroma_format = -1;
+    }
+
+    /* Shared disp buffer mode is supported only for 420P/420SP formats */
+    if((chroma_format != IV_YUV_420P) &&
+       (chroma_format != IV_YUV_420SP_UV) &&
+       (chroma_format != IV_YUV_420SP_VU))
+    {
+        share_disp_buf = 0;
+    }
+
+    {
+
+        max_ht_luma = ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_max_frm_ht;
+        max_wd_luma = ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_max_frm_wd;
+
+        max_ht_luma = ALIGN64(max_ht_luma);
+        max_wd_luma = ALIGN64(max_wd_luma);
+
+
+
+        max_tile_cols = (max_wd_luma + MIN_TILE_WD - 1) / MIN_TILE_WD;
+        max_tile_rows = (max_ht_luma + MIN_TILE_HT - 1) / MIN_TILE_HT;
+        max_ctb_rows  = max_ht_luma / MIN_CTB_SIZE;
+        max_ctb_cols  = max_wd_luma / MIN_CTB_SIZE;
+        max_ctb_cnt   = max_ctb_rows * max_ctb_cols;
+        max_num_cu_cols = max_wd_luma / MIN_CU_SIZE;
+        max_num_4x4_cols = max_wd_luma / 4;
+    }
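+
+    /* Worked example (illustrative; assumes the usual constant values
+     * MIN_CTB_SIZE = 16, MIN_CU_SIZE = 8, MIN_TILE_WD = 256, MIN_TILE_HT = 64):
+     * a 1920x1080 stream aligns to 1920x1088, giving max_tile_cols = 8,
+     * max_tile_rows = 17, max_ctb_cols = 120, max_ctb_rows = 68,
+     * max_ctb_cnt = 8160, max_num_cu_cols = 240 and max_num_4x4_cols = 480.
+     */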
+    /*
+     * If the level is lower than 31 but the requested resolution needs a
+     * higher level, the level should be raised. The H264-era check below is
+     * disabled and kept only for reference.
+     */
+    /*    if (num_mbs > MAX_NUM_MBS_3_0 && level < MAX_LEVEL)
+     {
+     level           = MAX_LEVEL;
+     }
+     */
+    if((level < MIN_LEVEL) || (level > MAX_LEVEL))
+    {
+        ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+                        IHEVCD_LEVEL_UNSUPPORTED;
+        level = MAX_LEVEL;
+    }
+    if(num_ref_frames > MAX_REF_CNT)
+    {
+        ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+                        IHEVCD_NUM_REF_UNSUPPORTED;
+        num_ref_frames = MAX_REF_CNT;
+    }
+
+    if(num_reorder_frames > MAX_REF_CNT)
+    {
+        ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+                        IHEVCD_NUM_REORDER_UNSUPPORTED;
+        num_reorder_frames = MAX_REF_CNT;
+    }
+
+    max_dpb_size = ihevcd_get_dpb_size(level, max_wd_luma * max_ht_luma);
+    ps_mem_rec_base = ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location;
+
+    /* Set all memory records as persistent with an alignment of 128
+     * by default
+     */
+    ps_mem_rec = ps_mem_rec_base;
+    for(i = 0; i < MEM_REC_CNT; i++)
+    {
+        ps_mem_rec->u4_mem_alignment = 128;
+        ps_mem_rec->e_mem_type = IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM;
+        ps_mem_rec++;
+    }
+
+    /* Request memory for HEVCD object */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_IV_OBJ];
+    ps_mem_rec->u4_mem_size = sizeof(iv_obj_t);
+
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_IV_OBJ,
+                    ps_mem_rec->u4_mem_size);
+
+    /* Request memory for HEVC Codec context */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_CODEC];
+    ps_mem_rec->u4_mem_size = sizeof(codec_t);
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_CODEC,
+                    ps_mem_rec->u4_mem_size);
+
+    /* Request memory for buffer which holds bitstream after emulation prevention */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_BITSBUF];
+    ps_mem_rec->u4_mem_size = MAX((max_wd_luma * max_ht_luma), MIN_BITSBUF_SIZE);
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_BITSBUF,
+                    ps_mem_rec->u4_mem_size);
+
+    /* Request memory for buffer which holds TU structures and coeff data for
+     * a set of CTBs in the current picture */
+    /* TODO: Currently the buffer is allocated at frame level. Reduce this to
+     * allocate for a set of CTBs and add appropriate synchronization logic to
+     * ensure that this data is not overwritten before it is consumed
+     */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_TU_DATA];
+    ps_mem_rec->u4_mem_size = ihevcd_get_tu_data_size(max_wd_luma * max_ht_luma);
+#ifdef GPU_BUILD
+    /* For ping-pong view */
+    ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+    ps_mem_rec->u4_mem_size = ps_mem_rec->u4_mem_size * 2;
+#endif
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_TU_DATA,
+                    ps_mem_rec->u4_mem_size);
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_MVBANK];
+
+    ps_mem_rec->u4_mem_size = sizeof(buf_mgr_t);
+
+    /* Size for holding mv_buf_t for each MV Bank */
+    /* Note: this allocation is done for BUF_MGR_MAX_CNT instead of
+     * max_dpb_size or MAX_DPB_SIZE for the following reasons:
+     * max_dpb_size is based on max_wd and max_ht, and for higher max_wd and
+     * max_ht it will be smaller than MAX_DPB_SIZE, but during actual
+     * initialization the number of buffers allocated can be larger.
+     *
+     * One extra MV Bank is needed to hold the current picture's MV bank.
+     * Since this is only a structure allocation and not an actual buffer
+     * allocation, it is done for BUF_MGR_MAX_CNT entries
+     */
+    ps_mem_rec->u4_mem_size += BUF_MGR_MAX_CNT * sizeof(mv_buf_t);
+#ifdef GPU_BUILD
+    /* Request one extra since release is delayed by one frame.*/
+    ps_mem_rec->u4_mem_size += sizeof(mv_buf_t);
+#endif
+
+    {
+        /* Allocate for pu_map, pu_t and pic_pu_idx for each MV bank */
+        /* Note: The number of luma samples is not max_wd * max_ht here; it is
+         * set to the maximum number of luma samples allowed at the given
+         * level. This ensures that any stream with width and height smaller
+         * than max_wd and max_ht is supported. The number of buffers required
+         * can be greater for lower widths and heights at a given level, and
+         * this increased number of buffers might need more memory than a
+         * max_wd x max_ht buffer would have required.
+         * Also note that one extra buffer is allocated to store the current
+         * picture's MV bank.
+         * For asynchronous parsing and processing, the number of buffers
+         * should be increased here based on how the parsing and processing
+         * threads are synchronized
+         */
+        WORD32 lvl_idx = ihevcd_get_lvl_idx(level);
+        WORD32 max_luma_samples = gai4_ihevc_max_luma_pic_size[lvl_idx];
+#ifdef GPU_BUILD
+        ps_mem_rec->u4_mem_size += (max_dpb_size + 2) *
+                        ihevcd_get_pic_mv_bank_size(max_luma_samples);
+#else
+        ps_mem_rec->u4_mem_size += (max_dpb_size + 1) *
+                        ihevcd_get_pic_mv_bank_size(max_luma_samples);
+#endif
+        DEBUG("\nMemory record Id %d = %d \n", MEM_REC_MVBANK,
+                        ps_mem_rec->u4_mem_size);
+    }
+    // TODO GPU : Have to create ping-pong view for VPS, SPS, PPS.
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_VPS];
+    ps_mem_rec->u4_mem_size = MAX_VPS_CNT * sizeof(vps_t);
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_VPS,
+                    ps_mem_rec->u4_mem_size);
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_SPS];
+    ps_mem_rec->u4_mem_size = MAX_SPS_CNT * sizeof(sps_t);
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SPS,
+                    ps_mem_rec->u4_mem_size);
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PPS];
+    ps_mem_rec->u4_mem_size = MAX_PPS_CNT * sizeof(pps_t);
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PPS,
+                    ps_mem_rec->u4_mem_size);
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_SLICE_HDR];
+    ps_mem_rec->u4_mem_size = MAX_SLICE_HDR_CNT * sizeof(slice_header_t);
+#ifdef GPU_BUILD
+    /* OpenCL ping pong buffer */
+    ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+    ps_mem_rec->u4_mem_size *= 2;
+#endif
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SLICE_HDR,
+                    ps_mem_rec->u4_mem_size);
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_TILE];
+    {
+        WORD32 tile_size;
+
+        tile_size  = max_tile_cols * max_tile_rows;
+        tile_size  *= sizeof(tile_t);
+
+
+        ps_mem_rec->u4_mem_size = MAX_PPS_CNT * tile_size;
+    }
+
+
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_TILE,
+                    ps_mem_rec->u4_mem_size);
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_ENTRY_OFST];
+    {
+        WORD32 num_entry_points;
+
+        /* One entry point per tile */
+        num_entry_points  = max_tile_cols * max_tile_rows;
+
+        /* One entry point per row of CTBs */
+        /*********************************************************************/
+        /* Only tiles or entropy sync is enabled at a time in main           */
+        /* profile, but since memory required does not increase too much,    */
+        /* this allocation is done to handle both cases                      */
+        /*********************************************************************/
+        num_entry_points  += max_ctb_rows;
+
+
+        ps_mem_rec->u4_mem_size = sizeof(WORD32) * num_entry_points;
+    }
+
+
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_ENTRY_OFST,
+                    ps_mem_rec->u4_mem_size);
+
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_SCALING_MAT];
+    {
+        WORD32 scaling_mat_size;
+
+        SCALING_MAT_SIZE(scaling_mat_size)
+        ps_mem_rec->u4_mem_size = (MAX_SPS_CNT + MAX_PPS_CNT) * scaling_mat_size * sizeof(WORD16);
+    }
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SCALING_MAT,
+                    ps_mem_rec->u4_mem_size);
+
+    /* Holds one row of skip flags at 8x8 level used during parsing */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_SKIP_FLAG];
+
+    /* 1 bit per 8x8 */
+    ps_mem_rec->u4_mem_size = max_num_cu_cols / 8;
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PARSE_SKIP_FLAG,
+                  ps_mem_rec->u4_mem_size);
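+    /* Worked example (assuming max_num_cu_cols = max_wd_luma / 8): a
+     * 1920-pixel-wide picture has 240 8x8 columns, so at 1 bit per 8x8
+     * the row bitmap is 240 / 8 = 30 bytes */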
+
+    /* Holds one row of ct_depth at 8x8 level used during parsing */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_CT_DEPTH];
+
+    /* 2 bits per 8x8 */
+    ps_mem_rec->u4_mem_size = max_num_cu_cols / 4;
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PARSE_CT_DEPTH,
+                  ps_mem_rec->u4_mem_size);
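+    /* At 2 bits per 8x8, the same illustrative 240-column row needs
+     * 240 / 4 = 60 bytes */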
+
+    /* Holds left and top neighbor luma intra prediction modes used during parsing */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_INTRA_PRED_MODE];
+
+    /* 8 bits per 4x4 */
+    /* 16 bytes each for top and left 64 pixels and 16 bytes for default mode */
+    ps_mem_rec->u4_mem_size = 3 * 16 * sizeof(UWORD8);
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PARSE_INTRA_PRED_MODE,
+                  ps_mem_rec->u4_mem_size);
+
+    /* Holds one intra flag at 8x8 level for the entire picture */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_INTRA_FLAG];
+
+    /* 1 bit per 8x8 */
+    ps_mem_rec->u4_mem_size = (max_wd_luma / MIN_CU_SIZE) * (max_ht_luma / MIN_CU_SIZE) / 8;
+#ifdef GPU_BUILD
+    ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+    ps_mem_rec->u4_mem_size = ps_mem_rec->u4_mem_size * 2;
+#endif
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_INTRA_FLAG,
+                  ps_mem_rec->u4_mem_size);
+
+    /* Holds one transquant bypass flag at 8x8 level for entire picture */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_TRANSQUANT_BYPASS_FLAG];
+
+    /* 1 bit per 8x8 */
+    /* Extra row and column are allocated for easy processing of top and left blocks while loop filtering */
+    ps_mem_rec->u4_mem_size = ((max_wd_luma + 64) / MIN_CU_SIZE) * ((max_ht_luma + 64) / MIN_CU_SIZE) / 8;
+#ifdef GPU_BUILD
+    ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+    ps_mem_rec->u4_mem_size = ps_mem_rec->u4_mem_size * 2;
+#endif
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_TRANSQUANT_BYPASS_FLAG,
+                  ps_mem_rec->u4_mem_size);
+
+    /* Request memory to hold thread handles for each processing thread */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_THREAD_HANDLE];
+    ps_mem_rec->u4_mem_size = MAX_PROCESS_THREADS * ithread_get_handle_size();
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_THREAD_HANDLE,
+                    ps_mem_rec->u4_mem_size);
+
+
+    {
+        WORD32 job_queue_size;
+        WORD32 num_jobs;
+        ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_JOBQ];
+
+
+        /* One job per row of CTBs */
+        num_jobs  = max_ctb_rows;
+
+        /* One job per tile in each row of CTBs, so num_jobs has to be
+         * scaled by the max number of tile columns */
+        num_jobs  *= max_tile_cols;
+
+        /* One format convert/frame copy job per row of CTBs for non-shared mode*/
+        num_jobs  += max_ctb_rows;
+
+#ifdef GPU_BUILD
+        num_jobs *= 2;
+#endif
+
+        job_queue_size = ihevcd_jobq_ctxt_size();
+        job_queue_size += num_jobs * sizeof(proc_job_t);
+        ps_mem_rec->u4_mem_size = job_queue_size;
+        DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PROC_JOBQ,
+                        ps_mem_rec->u4_mem_size);
+    }
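+    /* Illustration (assuming 64x64 CTBs and a 256-pixel minimum tile width):
+     * a 1920x1088 stream gives max_ctb_rows = 17 and max_tile_cols = 8, so
+     * num_jobs = 17 * 8 + 17 = 153 before the GPU doubling */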
+
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_MAP];
+    ps_mem_rec->u4_mem_size = max_ctb_cnt;
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PARSE_MAP,
+                    ps_mem_rec->u4_mem_size);
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_MAP];
+    ps_mem_rec->u4_mem_size = max_ctb_cnt;
+#ifdef GPU_BUILD
+    /* OpenCL PING PONG buffer */
+    ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+    ps_mem_rec->u4_mem_size *= 2;
+#endif
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PROC_MAP,
+                    ps_mem_rec->u4_mem_size);
+
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_DISP_MGR];
+
+    /* size for holding display manager context */
+    ps_mem_rec->u4_mem_size = sizeof(buf_mgr_t);
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_DISP_MGR,
+                    ps_mem_rec->u4_mem_size);
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_DPB_MGR];
+
+    /* size for holding dpb manager context */
+    ps_mem_rec->u4_mem_size = sizeof(dpb_mgr_t);
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_DPB_MGR,
+                    ps_mem_rec->u4_mem_size);
+
+    /* Holds top and left neighbors' pu idx into the picture level pu array */
+    /* Only one top row is enough but left has to be replicated for each process context */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PIC_PU_IDX_NEIGHBOR];
+
+    ps_mem_rec->u4_mem_size = (max_num_4x4_cols  /* top */ + MAX_PROCESS_THREADS * (MAX_CTB_SIZE / 4)/* left */ + 1/* top right */) * sizeof(WORD32);
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PIC_PU_IDX_NEIGHBOR,
+                    ps_mem_rec->u4_mem_size);
+
+
+
+    /* To hold scratch buffers needed for each process context */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_SCRATCH];
+    {
+        WORD32 size = 0;
+        WORD32 inter_pred_tmp_buf_size;
+        WORD32 ntaps_luma;
+        WORD32 pu_map_size;
+        WORD32 sao_size = 0;
+        ntaps_luma = 8;
+
+        /* Max inter pred size (number of bytes) */
+        inter_pred_tmp_buf_size = sizeof(WORD16) * (MAX_CTB_SIZE + ntaps_luma) * MAX_CTB_SIZE;
+        inter_pred_tmp_buf_size = ALIGN64(inter_pred_tmp_buf_size);
+
+
+        /* To hold pu_index w.r.t. frame level pu_t array for a CTB at 4x4 level*/
+        /* 16 x 16 4x4 blocks in a CTB of size 64 x 64, with two extra
+         * rows and columns needed for holding neighbors (hence 18 x 18)
+         */
+        pu_map_size = sizeof(WORD32) * (18 * 18);
+
+        pu_map_size = ALIGN64(pu_map_size);
+        size += pu_map_size;
+
+        /* To hold inter pred temporary buffers */
+        size += 2 * inter_pred_tmp_buf_size;
+
+
+        /* Allocate for each process context */
+        size *= MAX_PROCESS_THREADS;
+
+
+#ifdef GPU_SAO_PING_PONG
+        /* To hold SAO left buffer for luma */
+        sao_size += sizeof(UWORD8) * (MAX(max_ht_luma, max_wd_luma)) * 2;
+
+        /* To hold SAO left buffer for chroma */
+        sao_size += sizeof(UWORD8) * (MAX(max_ht_luma, max_wd_luma)) * 2;
+
+        /* To hold SAO top buffer for luma */
+        sao_size += sizeof(UWORD8) * max_wd_luma * 2;
+
+        /* To hold SAO top buffer for chroma */
+        sao_size += sizeof(UWORD8) * max_wd_luma * 2;
+
+        /* To hold SAO top left luma pixel value for the last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+
+        /* To hold SAO top left chroma pixel value for the last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2 * 2;
+
+        /* To hold SAO top left luma pixel for the current ctb - column array */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+
+        /* To hold SAO top left chroma pixel for the current ctb - column array */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2 * 2;
+
+        /* To hold SAO top right luma pixel value for the last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_cols * 2;
+
+        /* To hold SAO top right chroma pixel value for the last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_cols * 2 * 2;
+
+        /* To hold SAO bottom left pixels for luma */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+
+        /* To hold SAO bottom left pixels for chroma */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2 * 2;
+#else
+        /* To hold SAO left buffer for luma */
+        sao_size += sizeof(UWORD8) * (MAX(max_ht_luma, max_wd_luma));
+
+        /* To hold SAO left buffer for chroma */
+        sao_size += sizeof(UWORD8) * (MAX(max_ht_luma, max_wd_luma));
+
+        /* To hold SAO top buffer for luma */
+        sao_size += sizeof(UWORD8) * max_wd_luma;
+
+        /* To hold SAO top buffer for chroma */
+        sao_size += sizeof(UWORD8) * max_wd_luma;
+
+        /* To hold SAO top left luma pixel value for the last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_rows;
+
+        /* To hold SAO top left chroma pixel value for the last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+
+        /* To hold SAO top left luma pixel for the current ctb - column array */
+        sao_size += sizeof(UWORD8) * max_ctb_rows;
+
+        /* To hold SAO top left chroma pixel for the current ctb - column array */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+
+        /* To hold SAO top right luma pixel value for the last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_cols;
+
+        /* To hold SAO top right chroma pixel value for the last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_cols * 2;
+
+        /* To hold SAO bottom left pixels for luma */
+        sao_size += sizeof(UWORD8) * max_ctb_rows;
+
+        /* To hold SAO bottom left pixels for chroma */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+#endif
+        sao_size = ALIGN64(sao_size);
+        size += sao_size;
+        ps_mem_rec->u4_mem_size = size;
+    }
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PROC_SCRATCH,
+                    ps_mem_rec->u4_mem_size);
+
+    /* To hold scratch buffers needed for each SAO context */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_SAO_SCRATCH];
+    {
+        WORD32 size = 0;
+
+        size = 4 * MAX_CTB_SIZE * MAX_CTB_SIZE;
+
+        /* 2 temporary buffers*/
+        size *= 2;
+
+        size *= MAX_PROCESS_THREADS;
+
+        ps_mem_rec->u4_mem_size = size;
+    }
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SAO_SCRATCH,
+                    ps_mem_rec->u4_mem_size);
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_BS_QP];
+    {
+        WORD32 size = 0;
+        WORD32 vert_bs_size, horz_bs_size;
+        WORD32 qp_const_flag_size;
+        WORD32 qp_size, num_8x8;
+
+        /* Max Number of vertical edges */
+        vert_bs_size = max_wd_luma / 8 + MAX_CTB_SIZE / 8;
+
+        /* Multiply by the number of MIN_TU_SIZE (4-pixel) rows - an extra MAX_CTB_SIZE worth of rows handles the last rows separately (shifted CTB processing) */
+        vert_bs_size *= (max_ht_luma + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+        /* Number of bytes */
+        vert_bs_size /= 8;
+
+        /* Two bits per edge */
+        vert_bs_size *= 2;
+
+        /* Max Number of horizontal edges */
+        horz_bs_size = max_ht_luma / 8 + MAX_CTB_SIZE / 8;
+
+        /* Multiply by the number of MIN_TU_SIZE (4-pixel) columns - an extra MAX_CTB_SIZE worth of columns handles the last columns separately (shifted CTB processing) */
+        horz_bs_size *= (max_wd_luma + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+        /* Number of bytes */
+        horz_bs_size /= 8;
+
+        /* Two bits per edge */
+        horz_bs_size *= 2;
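+
+        /* Worked example (assuming MAX_CTB_SIZE = 64, MIN_TU_SIZE = 4, and a
+         * 1920x1088 max size): vert_bs_size = (240 + 8) * ((1088 + 64) / 4)
+         * / 8 * 2 = 17856 bytes; horz_bs_size works out symmetrically */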
+
+        /* Max CTBs in a row */
+        qp_const_flag_size = max_wd_luma / MIN_CTB_SIZE + 1 /* The last ctb row deblk is done in last ctb + 1 row.*/;
+
+        /* Max CTBs in a column */
+        qp_const_flag_size *= max_ht_luma / MIN_CTB_SIZE;
+
+        /* Number of bytes */
+        qp_const_flag_size = (qp_const_flag_size + 7) >> 3;
+
+        /* QP changes at CU level - So store at 8x8 level */
+        num_8x8 = (max_ht_luma * max_wd_luma) / (MIN_CU_SIZE * MIN_CU_SIZE);
+        qp_size = num_8x8;
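+
+        /* QP is stored as one byte per 8x8 block (assuming MIN_CU_SIZE = 8):
+         * a 1920x1088 picture needs (1920 * 1088) / 64 = 32640 bytes */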
+
+        /* To hold vertical boundary strength */
+        size += vert_bs_size;
+
+        /* To hold horizontal boundary strength */
+        size += horz_bs_size;
+
+        /* To hold QP */
+        size += qp_size;
+
+        /* To hold QP const in CTB flags */
+        size += qp_const_flag_size;
+
+        ps_mem_rec->u4_mem_size = size;
+#ifdef GPU_BUILD
+        ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+        ps_mem_rec->u4_mem_size = ps_mem_rec->u4_mem_size * 2;
+#endif
+    }
+
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_BS_QP,
+                    ps_mem_rec->u4_mem_size);
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_TILE_IDX];
+    {
+        WORD32 size = 0;
+        /* Max CTBs in a row */
+        size  = max_wd_luma / MIN_CTB_SIZE + 2 /* Top row and bottom row extra. This ensures accessing left,top in first row
+                                                  and right in last row will not result in invalid access*/;
+        /* Max CTBs in a column */
+        size *= max_ht_luma / MIN_CTB_SIZE;
+
+        size *= sizeof(UWORD16);
+        ps_mem_rec->u4_mem_size = size;
+#ifdef GPU_BUILD
+        ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+        ps_mem_rec->u4_mem_size = ps_mem_rec->u4_mem_size * 2;
+#endif
+    }
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_TILE_IDX,
+                    ps_mem_rec->u4_mem_size);
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_SAO];
+    {
+        UWORD32 size;
+
+        /* 4 bytes per color component per CTB */
+        size = 3 * 4;
+
+        /* MAX number of CTBs in a row */
+        size *= max_wd_luma / MIN_CTB_SIZE;
+
+        /* MAX number of CTBs in a column */
+        size *= max_ht_luma / MIN_CTB_SIZE;
+#ifdef GPU_BUILD
+        size = ALIGN128(size);
+        ps_mem_rec->u4_mem_size = size * 2;
+#else
+        ps_mem_rec->u4_mem_size = size;
+#endif
+    }
+
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SAO,
+                    ps_mem_rec->u4_mem_size);
+
+#ifdef GPU_BUILD
+    /* Memory record for GPU context */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_GPU];
+    ps_mem_rec->u4_mem_size = ihevcd_gpu_get_ctxt_size();
+
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_GPU,
+                    ps_mem_rec->u4_mem_size);
+#endif
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_REF_PIC];
+
+    /* size for holding buffer manager context */
+    ps_mem_rec->u4_mem_size = sizeof(buf_mgr_t);
+
+    /* Size for holding pic_buf_t for each reference picture */
+    /* Note this allocation is done for BUF_MGR_MAX_CNT instead of
+     * max_dpb_size or MAX_DPB_SIZE for the following reasons:
+     * max_dpb_size will be based on max_wd and max_ht.
+     * For higher max_wd and max_ht this number will be smaller than
+     * MAX_DPB_SIZE, but during actual initialization the number of buffers
+     * allocated can be more.
+     *
+     * Also, to handle display depth, the application can allocate more than
+     * what the codec asks for in case of non-shared mode.
+     * Since this is only a structure allocation and not actual buffer allocation,
+     * it is allocated for BUF_MGR_MAX_CNT entries
+     */
+    ps_mem_rec->u4_mem_size += BUF_MGR_MAX_CNT * sizeof(pic_buf_t);
+
+    /* In case of non-shared mode allocate for reference picture buffers */
+    if(0 == share_disp_buf)
+    {
+        UWORD32 num_reorder_frames_local = num_reorder_frames;
+#ifdef GPU_BUILD
+        // TODO GPU : Increment only if multicore.
+        num_reorder_frames_local += 1;
+#endif
+        /* Note: Number of luma samples is not max_wd * max_ht here; instead it
+         * is set to the maximum number of luma samples allowed at the given
+         * level. This is done to ensure that any stream with width and height
+         * less than max_wd and max_ht is supported. The number of buffers
+         * required can be greater for lower widths and heights at a given
+         * level, and this increased number of buffers might require more
+         * memory than what max_wd and max_ht buffers would have required.
+         * The number of buffers is doubled in order to return one frame at a
+         * time instead of sending multiple outputs during the dpb full case.
+         * Also note one extra buffer is allocated to store the current picture.
+         * In case of asynchronous parsing and processing, the number of
+         * buffers should increase here based on when parsing and processing
+         * threads are synchronized
+         */
+        ps_mem_rec->u4_mem_size +=
+                        ihevcd_get_total_pic_buf_size(max_wd_luma * max_ht_luma, level,  PAD_WD,  PAD_HT,
+                                                      num_ref_frames, num_reorder_frames_local);
+    }
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_REF_PIC,
+                    ps_mem_rec->u4_mem_size);
+
+    /* Request memory to hold mem records to be returned during retrieve call */
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_BACKUP];
+    ps_mem_rec->u4_mem_size = MEM_REC_CNT * sizeof(iv_mem_rec_t);
+    DEBUG("\nMemory record Id %d = %d \n", MEM_REC_BACKUP,
+                    ps_mem_rec->u4_mem_size);
+
+    /* Each memtab size is aligned to next multiple of 128 bytes */
+    /* This is to ensure all the memtabs start at different cache lines */
+    ps_mem_rec = ps_mem_rec_base;
+    for(i = 0; i < MEM_REC_CNT; i++)
+    {
+        ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+        ps_mem_rec++;
+    }
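+    /* e.g. the illustrative 4464-byte record above becomes ALIGN128(4464)
+     * = 4480 bytes, so when records are packed back to back each one
+     * starts on a fresh 128-byte boundary */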
+    ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_num_mem_rec_filled = MEM_REC_CNT;
+#ifdef APPLY_CONCEALMENT
+    {
+        IV_API_CALL_STATUS_T status;
+        icncl_fill_mem_rec_ip_t cncl_fill_ip;
+        icncl_fill_mem_rec_op_t cncl_fill_op;
+        UWORD8 mem_loc = MEM_REC_CNT;
+
+        cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.e_cmd = IV_CMD_FILL_NUM_MEM_REC;
+        cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location = &(ps_mem_rec_base[mem_loc]);
+        cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.u4_size = ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size;
+        cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_wd = max_wd_luma;
+        cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_ht = max_ht_luma;
+
+        status = icncl_api_function(NULL, (void *)&cncl_fill_ip, (void *)&cncl_fill_op);
+
+        if(IV_SUCCESS == status)
+        {
+            icncl_num_mem_rec_ip_t cncl_mem_ip;
+            icncl_num_mem_rec_op_t cncl_mem_op;
+
+            cncl_mem_ip.s_ivd_num_rec_ip_t.e_cmd = IV_CMD_GET_NUM_MEM_REC;
+            cncl_mem_ip.s_ivd_num_rec_ip_t.u4_size = sizeof(icncl_num_mem_rec_ip_t);
+
+            status = icncl_api_function(NULL, (void *)&cncl_mem_ip, (void *)&cncl_mem_op);
+            if(IV_SUCCESS == status)
+            {
+                ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_num_mem_rec_filled += cncl_mem_op.s_ivd_num_mem_rec_op_t.u4_num_mem_rec;
+            }
+        }
+
+        return status;
+
+    }
+#endif //APPLY_CONCEALMENT
+    DEBUG("Num mem recs in fill call : %d\n",
+                    ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_num_mem_rec_filled);
+
+
+    return (status);
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Initializes from mem records passed to the codec
+*
+* @par Description:
+*  Initializes pointers based on mem records passed
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_init_mem_rec(iv_obj_t *ps_codec_obj,
+                           void *pv_api_ip,
+                           void *pv_api_op)
+{
+
+    ihevcd_cxa_init_ip_t *dec_init_ip;
+    ihevcd_cxa_init_op_t *dec_init_op;
+    WORD32 i;
+    iv_mem_rec_t *ps_mem_rec, *ps_mem_rec_base;
+    WORD32 status = IV_SUCCESS;
+    codec_t *ps_codec;
+    WORD32 max_tile_cols, max_tile_rows;
+
+    dec_init_ip = (ihevcd_cxa_init_ip_t *)pv_api_ip;
+    dec_init_op = (ihevcd_cxa_init_op_t *)pv_api_op;
+
+    ps_mem_rec_base = dec_init_ip->s_ivd_init_ip_t.pv_mem_rec_location;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_CODEC];
+    ps_codec_obj->pv_codec_handle = ps_mem_rec->pv_base;
+
+    ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+    /* Note this memset cannot be done in the init() call, since init will be
+    called during reset as well, and calling it during reset would mean all
+    pointers need to be reinitialized */
+    memset(ps_codec, 0, sizeof(codec_t));
+
+    if(dec_init_ip->s_ivd_init_ip_t.u4_size
+                    > offsetof(ihevcd_cxa_init_ip_t, i4_level))
+    {
+        ps_codec->i4_init_level = dec_init_ip->i4_level;
+
+        ps_codec->i4_init_level *= 3;
+    }
+    else
+    {
+        ps_codec->i4_init_level = MAX_LEVEL;
+    }
+
+    if(dec_init_ip->s_ivd_init_ip_t.u4_size
+                    > offsetof(ihevcd_cxa_init_ip_t, u4_num_ref_frames))
+    {
+        ps_codec->i4_init_num_ref = dec_init_ip->u4_num_ref_frames;
+    }
+    else
+    {
+        ps_codec->i4_init_num_ref = MAX_REF_CNT;
+    }
+
+    if(dec_init_ip->s_ivd_init_ip_t.u4_size
+                    > offsetof(ihevcd_cxa_init_ip_t, u4_num_reorder_frames))
+    {
+        ps_codec->i4_init_num_reorder = dec_init_ip->u4_num_reorder_frames;
+    }
+    else
+    {
+        ps_codec->i4_init_num_reorder = MAX_REF_CNT;
+    }
+
+    if(dec_init_ip->s_ivd_init_ip_t.u4_size
+                    > offsetof(ihevcd_cxa_init_ip_t, u4_num_extra_disp_buf))
+    {
+        ps_codec->i4_init_num_extra_disp_buf =
+                        dec_init_ip->u4_num_extra_disp_buf;
+    }
+    else
+    {
+        ps_codec->i4_init_num_extra_disp_buf = 0;
+    }
+
+    if(dec_init_ip->s_ivd_init_ip_t.u4_size
+                    > offsetof(ihevcd_cxa_init_ip_t, u4_share_disp_buf))
+    {
+#ifndef LOGO_EN
+        ps_codec->i4_share_disp_buf = dec_init_ip->u4_share_disp_buf;
+#else
+        ps_codec->i4_share_disp_buf = 0;
+#endif
+    }
+    else
+    {
+        ps_codec->i4_share_disp_buf = 0;
+    }
+    /* Shared display mode is supported only for 420SP and 420P formats */
+    if((dec_init_ip->s_ivd_init_ip_t.e_output_format != IV_YUV_420P) &&
+       (dec_init_ip->s_ivd_init_ip_t.e_output_format != IV_YUV_420SP_UV) &&
+       (dec_init_ip->s_ivd_init_ip_t.e_output_format != IV_YUV_420SP_VU))
+    {
+        ps_codec->i4_share_disp_buf = 0;
+    }
+
+    if((ps_codec->i4_init_level < MIN_LEVEL)
+                    || (ps_codec->i4_init_level > MAX_LEVEL))
+    {
+        dec_init_op->s_ivd_init_op_t.u4_error_code |= IHEVCD_LEVEL_UNSUPPORTED;
+        return (IV_FAIL);
+    }
+
+    if(ps_codec->i4_init_num_ref > MAX_REF_CNT)
+    {
+        dec_init_op->s_ivd_init_op_t.u4_error_code |=
+                        IHEVCD_NUM_REF_UNSUPPORTED;
+        ps_codec->i4_init_num_ref = MAX_REF_CNT;
+    }
+
+    if(ps_codec->i4_init_num_reorder > MAX_REF_CNT)
+    {
+        dec_init_op->s_ivd_init_op_t.u4_error_code |=
+                        IHEVCD_NUM_REORDER_UNSUPPORTED;
+        ps_codec->i4_init_num_reorder = MAX_REF_CNT;
+    }
+
+    if(ps_codec->i4_init_num_extra_disp_buf > MAX_REF_CNT)
+    {
+        dec_init_op->s_ivd_init_op_t.u4_error_code |=
+                        IHEVCD_NUM_EXTRA_DISP_UNSUPPORTED;
+        ps_codec->i4_init_num_extra_disp_buf = 0;
+    }
+
+    ps_codec->e_chroma_fmt = dec_init_ip->s_ivd_init_ip_t.e_output_format;
+
+    ps_codec->i4_max_wd = dec_init_ip->s_ivd_init_ip_t.u4_frm_max_wd;
+    ps_codec->i4_max_ht = dec_init_ip->s_ivd_init_ip_t.u4_frm_max_ht;
+
+    ps_codec->i4_max_wd = ALIGN64(ps_codec->i4_max_wd);
+    ps_codec->i4_max_ht = ALIGN64(ps_codec->i4_max_ht);
+
+    max_tile_cols = (ps_codec->i4_max_wd + MIN_TILE_WD - 1) / MIN_TILE_WD;
+    max_tile_rows = (ps_codec->i4_max_ht + MIN_TILE_HT - 1) / MIN_TILE_HT;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_BACKUP];
+    ps_codec->ps_mem_rec_backup = (iv_mem_rec_t *)ps_mem_rec->pv_base;
+
+    memcpy(ps_codec->ps_mem_rec_backup, ps_mem_rec_base,
+           MEM_REC_CNT * sizeof(iv_mem_rec_t));
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_BITSBUF];
+    ps_codec->pu1_bitsbuf = (UWORD8 *)ps_mem_rec->pv_base;
+    ps_codec->u4_bitsbuf_size = ps_mem_rec->u4_mem_size;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_TU_DATA];
+#ifdef GPU_BUILD
+    ps_codec->apv_tu_data[0] = ps_mem_rec->pv_base;
+    ps_codec->apv_tu_data[1] = (void *)((UWORD8 *)ps_codec->apv_tu_data[0] + (ps_mem_rec->u4_mem_size / 2));
+#else
+    ps_codec->pv_tu_data = ps_mem_rec->pv_base;
+#endif
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_MVBANK];
+    ps_codec->pv_mv_buf_mgr = ps_mem_rec->pv_base;
+    ps_codec->pv_mv_bank_buf_base = (UWORD8 *)ps_codec->pv_mv_buf_mgr + sizeof(buf_mgr_t);
+
+    ps_codec->i4_total_mv_bank_size = ps_mem_rec->u4_mem_size - sizeof(buf_mgr_t);
+
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_VPS];
+    ps_codec->ps_vps_base = (vps_t *)ps_mem_rec->pv_base;
+    ps_codec->s_parse.ps_vps_base = ps_codec->ps_vps_base;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_SPS];
+    ps_codec->ps_sps_base = (sps_t *)ps_mem_rec->pv_base;
+    ps_codec->s_parse.ps_sps_base = ps_codec->ps_sps_base;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PPS];
+    ps_codec->ps_pps_base = (pps_t *)ps_mem_rec->pv_base;
+    ps_codec->s_parse.ps_pps_base = ps_codec->ps_pps_base;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_SLICE_HDR];
+#ifdef GPU_BUILD
+    ps_codec->aps_slice_hdr_base[0] = (slice_header_t *)ps_mem_rec->pv_base;
+    ps_codec->aps_slice_hdr_base[1] = (slice_header_t *)ps_mem_rec->pv_base + MAX_SLICE_HDR_CNT;
+#else
+    ps_codec->ps_slice_hdr_base = (slice_header_t *)ps_mem_rec->pv_base;
+    ps_codec->s_parse.ps_slice_hdr_base = ps_codec->ps_slice_hdr_base;
+#endif
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_TILE];
+    ps_codec->ps_tile = (tile_t *)ps_mem_rec->pv_base;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_ENTRY_OFST];
+    ps_codec->pi4_entry_ofst = (WORD32 *)ps_mem_rec->pv_base;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_SCALING_MAT];
+    ps_codec->pi2_scaling_mat = (WORD16 *)ps_mem_rec->pv_base;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_SKIP_FLAG];
+    ps_codec->s_parse.pu4_skip_cu_top = (UWORD32 *)ps_mem_rec->pv_base;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_CT_DEPTH];
+    ps_codec->s_parse.pu4_ct_depth_top = (UWORD32 *)ps_mem_rec->pv_base;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_INTRA_PRED_MODE];
+    ps_codec->s_parse.pu1_luma_intra_pred_mode_left =
+                    (UWORD8 *)ps_mem_rec->pv_base;
+    ps_codec->s_parse.pu1_luma_intra_pred_mode_top  =
+                    (UWORD8 *)ps_mem_rec->pv_base + 16;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_INTRA_FLAG];
+
+#ifdef GPU_BUILD
+    ps_codec->apu1_pic_intra_flag[0] = ps_mem_rec->pv_base;
+    ps_codec->apu1_pic_intra_flag[1] = ps_codec->apu1_pic_intra_flag[0] + (ps_mem_rec->u4_mem_size / 2);
+#else
+    memset(ps_mem_rec->pv_base, 0, (ps_codec->i4_max_wd / MIN_CU_SIZE) * (ps_codec->i4_max_ht / MIN_CU_SIZE) / 8);
+
+    ps_codec->pu1_pic_intra_flag = (UWORD8 *)ps_mem_rec->pv_base;
+    ps_codec->s_parse.pu1_pic_intra_flag = ps_codec->pu1_pic_intra_flag;
+#endif
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_TRANSQUANT_BYPASS_FLAG];
+
+#ifdef GPU_BUILD
+    {
+        WORD32 loop_filter_strd = (ps_codec->i4_max_wd + 63) >> 6;
+
+        /* The offset is added for easy processing of top and left blocks while loop filtering */
+        ps_codec->apu1_pic_no_loop_filter_flag[0] = (UWORD8 *)ps_mem_rec->pv_base + loop_filter_strd + 1;
+        ps_codec->apu1_pic_no_loop_filter_flag[1] = (UWORD8 *)ps_mem_rec->pv_base + (ps_mem_rec->u4_mem_size / 2) + loop_filter_strd + 1;
+    }
+#else
+    {
+        WORD32 loop_filter_size = ((ps_codec->i4_max_wd  + 64) / MIN_CU_SIZE) * ((ps_codec->i4_max_ht + 64) / MIN_CU_SIZE) / 8;
+        WORD32 loop_filter_strd = (ps_codec->i4_max_wd + 63) >> 6;
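+
+        /* At 1 bit per 8x8, one byte of flags covers 64 luma pixels
+         * horizontally, so loop_filter_strd is the row stride of the flag
+         * bitmap in bytes. The + loop_filter_strd + 1 offset below leaves
+         * one flag row above and one flag byte to the left so that top and
+         * left neighbor accesses stay inside the allocation */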
+
+        memset(ps_mem_rec->pv_base, 0, loop_filter_size);
+
+        /* The offset is added for easy processing of top and left blocks while loop filtering */
+        ps_codec->pu1_pic_no_loop_filter_flag = (UWORD8 *)ps_mem_rec->pv_base + loop_filter_strd + 1;
+        ps_codec->s_parse.pu1_pic_no_loop_filter_flag = ps_codec->pu1_pic_no_loop_filter_flag;
+        ps_codec->s_parse.s_deblk_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->pu1_pic_no_loop_filter_flag;
+        ps_codec->s_parse.s_sao_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->pu1_pic_no_loop_filter_flag;
+    }
+#endif
+
+    /* Initialize pointers in PPS structures */
+    {
+        sps_t *ps_sps = ps_codec->ps_sps_base;
+        pps_t *ps_pps = ps_codec->ps_pps_base;
+        tile_t *ps_tile =  ps_codec->ps_tile;
+        WORD16 *pi2_scaling_mat =  ps_codec->pi2_scaling_mat;
+        WORD32 scaling_mat_size;
+
+        SCALING_MAT_SIZE(scaling_mat_size);
+
+        for(i = 0; i < MAX_SPS_CNT; i++)
+        {
+            ps_sps->pi2_scaling_mat  = pi2_scaling_mat;
+            pi2_scaling_mat += scaling_mat_size;
+            ps_sps++;
+        }
+
+        for(i = 0; i < MAX_PPS_CNT; i++)
+        {
+            ps_pps->ps_tile = ps_tile;
+            ps_tile += (max_tile_cols * max_tile_rows);
+
+            ps_pps->pi2_scaling_mat  = pi2_scaling_mat;
+            pi2_scaling_mat += scaling_mat_size;
+            ps_pps++;
+        }
+
+    }
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_THREAD_HANDLE];
+    for(i = 0; i < MAX_PROCESS_THREADS; i++)
+    {
+        WORD32 handle_size = ithread_get_handle_size();
+        ps_codec->apv_process_thread_handle[i] =
+                        (UWORD8 *)ps_mem_rec->pv_base + (i * handle_size);
+    }
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_JOBQ];
+    ps_codec->pv_proc_jobq_buf = ps_mem_rec->pv_base;
+    ps_codec->i4_proc_jobq_buf_size = ps_mem_rec->u4_mem_size;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_MAP];
+    ps_codec->pu1_parse_map = (UWORD8 *)ps_mem_rec->pv_base;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_MAP];
+#ifdef GPU_BUILD
+    memset(ps_mem_rec->pv_base, 0, ps_mem_rec->u4_mem_size);
+    ps_codec->apu1_proc_map[0] = (UWORD8 *)ps_mem_rec->pv_base;
+    ps_codec->apu1_proc_map[1] = (UWORD8 *)ps_mem_rec->pv_base + (ps_mem_rec->u4_mem_size / 2);
+#else
+    ps_codec->pu1_proc_map = (UWORD8 *)ps_mem_rec->pv_base;
+#endif
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_DISP_MGR];
+    ps_codec->pv_disp_buf_mgr = ps_mem_rec->pv_base;
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_DPB_MGR];
+    ps_codec->pv_dpb_mgr = ps_mem_rec->pv_base;
+
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PIC_PU_IDX_NEIGHBOR];
+
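+    /* Layout in this record: MAX_PROCESS_THREADS per-thread left columns
+     * (MAX_CTB_SIZE / 4 entries each) followed by a single top row shared
+     * by all threads, matching the sizing done in the fill call above */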
+    for(i = 0; i < MAX_PROCESS_THREADS; i++)
+    {
+        UWORD32 *pu4_buf = (UWORD32 *)ps_mem_rec->pv_base;
+        ps_codec->as_process[i].pu4_pic_pu_idx_left = pu4_buf + i * (MAX_CTB_SIZE / 4);
+        memset(ps_codec->as_process[i].pu4_pic_pu_idx_left, 0, sizeof(UWORD32) * MAX_CTB_SIZE / 4);
+        ps_codec->as_process[i].pu4_pic_pu_idx_top = pu4_buf + MAX_PROCESS_THREADS * (MAX_CTB_SIZE / 4);
+    }
+    memset(ps_codec->as_process[0].pu4_pic_pu_idx_top, 0, sizeof(UWORD32) * (ps_codec->i4_max_wd / 4 + 1));
+
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_SCRATCH];
+    {
+        UWORD8 *pu1_buf = (UWORD8 *)ps_mem_rec->pv_base;
+        WORD32 pic_pu_idx_map_size;
+
+        WORD32 inter_pred_tmp_buf_size, ntaps_luma;
+
+        /* Max inter pred size */
+        ntaps_luma = 8;
+        inter_pred_tmp_buf_size = sizeof(WORD16) * (MAX_CTB_SIZE + ntaps_luma) * MAX_CTB_SIZE;
+
+        inter_pred_tmp_buf_size = ALIGN64(inter_pred_tmp_buf_size);
+
+        /* To hold pu_index w.r.t. frame level pu_t array for a CTB */
+        pic_pu_idx_map_size = sizeof(WORD32) * (18 * 18);
+        pic_pu_idx_map_size = ALIGN64(pic_pu_idx_map_size);
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].pi2_inter_pred_tmp_buf1 = (WORD16 *)pu1_buf;
+            pu1_buf += inter_pred_tmp_buf_size;
+
+            ps_codec->as_process[i].pi2_inter_pred_tmp_buf2 = (WORD16 *)pu1_buf;
+            pu1_buf += inter_pred_tmp_buf_size;
+
+            /* Inverse transform intermediate and inverse scan output buffers reuse inter pred scratch buffers */
+            ps_codec->as_process[i].pi2_itrans_intrmd_buf =
+                            ps_codec->as_process[i].pi2_inter_pred_tmp_buf2;
+            ps_codec->as_process[i].pi2_invscan_out =
+                            ps_codec->as_process[i].pi2_inter_pred_tmp_buf1;
+
+            ps_codec->as_process[i].pu4_pic_pu_idx_map = (UWORD32 *)pu1_buf;
+            ps_codec->as_process[i].s_bs_ctxt.pu4_pic_pu_idx_map =
+                            (UWORD32 *)pu1_buf;
+            pu1_buf += pic_pu_idx_map_size;
+
+            //   ps_codec->as_process[i].pi2_inter_pred_tmp_buf3 = (WORD16 *)pu1_buf;
+            //   pu1_buf += inter_pred_tmp_buf_size;
+
+            ps_codec->as_process[i].i4_inter_pred_tmp_buf_strd = MAX_CTB_SIZE;
+
+        }
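+
+        /* The SAO neighbor-save arrays that follow are shared: each loop
+         * below points the parse context and every process context at the
+         * same region before pu1_buf advances past it */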
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_left_luma = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_left_luma = (UWORD8 *)pu1_buf;
+        pu1_buf += MAX(ps_codec->i4_max_ht, ps_codec->i4_max_wd);
+
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_left_chroma = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_left_chroma = (UWORD8 *)pu1_buf;
+        pu1_buf += MAX(ps_codec->i4_max_ht, ps_codec->i4_max_wd);
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_luma = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_luma = (UWORD8 *)pu1_buf;
+        pu1_buf += ps_codec->i4_max_wd;
+
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_chroma = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_chroma = (UWORD8 *)pu1_buf;
+        pu1_buf += ps_codec->i4_max_wd;
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_luma_top_left_ctb = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_luma_top_left_ctb = (UWORD8 *)pu1_buf;
+        pu1_buf += ps_codec->i4_max_ht / MIN_CTB_SIZE;
+
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_chroma_top_left_ctb = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_chroma_top_left_ctb = (UWORD8 *)pu1_buf;
+        pu1_buf += (ps_codec->i4_max_ht / MIN_CTB_SIZE) * 2;
+
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_luma_curr_ctb = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_luma_curr_ctb = (UWORD8 *)pu1_buf;
+        pu1_buf += ps_codec->i4_max_ht / MIN_CTB_SIZE;
+
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_chroma_curr_ctb = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_chroma_curr_ctb = (UWORD8 *)pu1_buf;
+
+        pu1_buf += (ps_codec->i4_max_ht / MIN_CTB_SIZE) * 2;
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_luma_top_right = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_luma_top_right = (UWORD8 *)pu1_buf;
+
+        pu1_buf += ps_codec->i4_max_wd / MIN_CTB_SIZE;
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_chroma_top_right = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_chroma_top_right = (UWORD8 *)pu1_buf;
+
+        pu1_buf += (ps_codec->i4_max_wd / MIN_CTB_SIZE) * 2;
+
+        /*Per CTB, Store 1 value for luma , 2 values for chroma*/
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_luma_bot_left = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_luma_bot_left = (UWORD8 *)pu1_buf;
+
+        pu1_buf += (ps_codec->i4_max_ht / MIN_CTB_SIZE);
+
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_chroma_bot_left = (UWORD8 *)pu1_buf;
+        }
+        ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_chroma_bot_left = (UWORD8 *)pu1_buf;
+
+        pu1_buf += (ps_codec->i4_max_ht / MIN_CTB_SIZE) * 2;
+    }
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_SAO_SCRATCH];
+    {
+        UWORD8 *pu1_buf = (UWORD8 *)ps_mem_rec->pv_base;
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_sao_ctxt.pu1_tmp_buf_luma = (UWORD8 *)pu1_buf;
+            pu1_buf += 4 * MAX_CTB_SIZE * MAX_CTB_SIZE * sizeof(UWORD8);
+
+            ps_codec->as_process[i].s_sao_ctxt.pu1_tmp_buf_chroma = (UWORD8 *)pu1_buf;
+            pu1_buf += 4 * MAX_CTB_SIZE * MAX_CTB_SIZE * sizeof(UWORD8);
+        }
+    }
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_BS_QP];
+    {
+        UWORD8 *pu1_buf = (UWORD8 *)ps_mem_rec->pv_base;
+        WORD32 vert_bs_size, horz_bs_size;
+        WORD32 qp_const_flag_size;
+        WORD32 qp_size;
+        WORD32 num_8x8;
+
+        /* Max Number of vertical edges */
+        vert_bs_size = ps_codec->i4_max_wd / 8 + MAX_CTB_SIZE / 8;
+
+        /* Max Number of horizontal edges - extra MAX_CTB_SIZE / 8 to handle the last 4 rows separately(shifted CTB processing) */
+        vert_bs_size *= (ps_codec->i4_max_ht + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+        /* Number of bytes */
+        vert_bs_size /= 8;
+
+        /* Two bits per edge */
+        vert_bs_size *= 2;
+
+        /* Max Number of horizontal edges */
+        horz_bs_size = ps_codec->i4_max_ht / 8 + MAX_CTB_SIZE / 8;
+
+        /* Max Number of vertical edges - extra MAX_CTB_SIZE / 8 to handle the last 4 columns separately(shifted CTB processing) */
+        horz_bs_size *= (ps_codec->i4_max_wd + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+        /* Number of bytes */
+        horz_bs_size /= 8;
+
+        /* Two bits per edge */
+        horz_bs_size *= 2;
+
+        /* Max CTBs in a row */
+        qp_const_flag_size = ps_codec->i4_max_wd / MIN_CTB_SIZE + 1 /* The last ctb row deblk is done in last ctb + 1 row.*/;
+
+        /* Max CTBs in a column */
+        qp_const_flag_size *= ps_codec->i4_max_ht / MIN_CTB_SIZE;
+
+        /* Number of bytes (rounded up to match the fill_mem_rec sizing) */
+        qp_const_flag_size = (qp_const_flag_size + 7) >> 3;
+
+        /* QP changes at CU level - So store at 8x8 level */
+        num_8x8 = (ps_codec->i4_max_ht * ps_codec->i4_max_wd) / (MIN_CU_SIZE * MIN_CU_SIZE);
+        qp_size = num_8x8;
+#ifndef GPU_BUILD
+        memset(pu1_buf, 0, vert_bs_size + horz_bs_size + qp_size + qp_const_flag_size);
+
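+        /* All process threads share one set of BS / QP buffers: pu1_buf is
+         * advanced while deriving each thread's pointers and rewound at the
+         * bottom of the loop so every iteration starts from the same base */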
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)pu1_buf;
+            ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)pu1_buf;
+            ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)pu1_buf;
+            pu1_buf += vert_bs_size;
+
+            ps_codec->as_process[i].s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)pu1_buf;
+            ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)pu1_buf;
+            ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)pu1_buf;
+            pu1_buf += horz_bs_size;
+
+            ps_codec->as_process[i].s_bs_ctxt.pu1_pic_qp = (UWORD8 *)pu1_buf;
+            ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp = (UWORD8 *)pu1_buf;
+            ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp = (UWORD8 *)pu1_buf;
+            pu1_buf += qp_size;
+
+            ps_codec->as_process[i].s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)pu1_buf;
+            ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)pu1_buf;
+            ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)pu1_buf;
+            pu1_buf += qp_const_flag_size;
+
+            pu1_buf -= (vert_bs_size + horz_bs_size + qp_size + qp_const_flag_size);
+        }
+#endif
+#ifdef GPU_BUILD
+        ps_codec->apu4_pic_vert_bs[0] = (UWORD32 *)pu1_buf;
+        pu1_buf += vert_bs_size;
+
+        ps_codec->apu4_pic_horz_bs[0] = (UWORD32 *)pu1_buf;
+        pu1_buf += horz_bs_size;
+
+        ps_codec->apu1_pic_qp[0] = (UWORD8 *)pu1_buf;
+        pu1_buf += qp_size;
+
+        ps_codec->apu1_pic_qp_const_in_ctb[0] = (UWORD8 *)pu1_buf;
+        pu1_buf += qp_const_flag_size;
+
+        ps_codec->apu4_pic_vert_bs[1] = (UWORD32 *)pu1_buf;
+        pu1_buf += vert_bs_size;
+
+        ps_codec->apu4_pic_horz_bs[1] = (UWORD32 *)pu1_buf;
+        pu1_buf += horz_bs_size;
+
+        ps_codec->apu1_pic_qp[1] = (UWORD8 *)pu1_buf;
+        pu1_buf += qp_size;
+
+        ps_codec->apu1_pic_qp_const_in_ctb[1] = (UWORD8 *)pu1_buf;
+        pu1_buf += qp_const_flag_size;
+#else
+        ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)pu1_buf;
+        pu1_buf += vert_bs_size;
+
+        ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)pu1_buf;
+        pu1_buf += horz_bs_size;
+
+        ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp = (UWORD8 *)pu1_buf;
+        pu1_buf += qp_size;
+
+        ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)pu1_buf;
+        pu1_buf += qp_const_flag_size;
+#endif
+
+    }
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_TILE_IDX];
+    {
+#ifdef GPU_BUILD
+        UWORD8 *pu1_buf = (UWORD8 *)ps_mem_rec->pv_base;
+        for(i = 0; i < 2; i++)
+        {
+            ps_codec->as_process[i].pu1_tile_idx = (UWORD16 *)pu1_buf;
+        }
+
+        pu1_buf += ps_mem_rec->u4_mem_size / 2;
+
+        for(i = 2; i < 4; i++)
+        {
+            ps_codec->as_process[i].pu1_tile_idx = (UWORD16 *)pu1_buf;
+        }
+#else
+        UWORD8 *pu1_buf = (UWORD8 *)ps_mem_rec->pv_base;
+
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].pu1_tile_idx = (UWORD16 *)pu1_buf + ps_codec->i4_max_wd / MIN_CTB_SIZE /* Offset 1 row */;
+        }
+#endif
+    }
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_SAO];
+#ifdef GPU_BUILD
+    memset(ps_mem_rec->pv_base, 0, ps_mem_rec->u4_mem_size);
+    ps_codec->aps_pic_sao[0] = (sao_t *)ps_mem_rec->pv_base;
+    ps_codec->aps_pic_sao[1] = (sao_t *)((UWORD8 *)ps_mem_rec->pv_base + ps_mem_rec->u4_mem_size / 2);
+#else
+    ps_codec->s_parse.ps_pic_sao = (sao_t *)ps_mem_rec->pv_base;
+    ps_codec->s_parse.s_sao_ctxt.ps_pic_sao = (sao_t *)ps_mem_rec->pv_base;
+    for(i = 0; i < MAX_PROCESS_THREADS; i++)
+    {
+        ps_codec->as_process[i].s_sao_ctxt.ps_pic_sao = ps_codec->s_parse.ps_pic_sao;
+    }
+#endif
+#ifdef GPU_BUILD
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_GPU];
+    ps_codec->s_gpu_ctxt.pv_gpu_priv = ps_mem_rec->pv_base;
+
+
+    /* Initialize OpenCL device */
+    /* Call GPU init before codec init so that reference frame buffers can be allocated */
+    status = ihevcd_gpu_mc_init(ps_codec);
+    RETURN_IF((status == IV_FAIL), IV_FAIL);
+#endif
+
+    ps_mem_rec = &ps_mem_rec_base[MEM_REC_REF_PIC];
+#if defined(GPU_BUILD) && !defined(FRAME_STAGGER_ONLY)
+    ps_codec->pv_pic_buf_mgr = ihevcd_gpu_alloc_ref_buf(ps_codec,
+                                                        ps_mem_rec->u4_mem_alignment,
+                                                        ps_mem_rec->u4_mem_size);
+    RETURN_IF((ps_codec->pv_pic_buf_mgr == NULL), IV_FAIL);
+#else
+    ps_codec->pv_pic_buf_mgr = ps_mem_rec->pv_base;
+#endif
+    ps_codec->pv_pic_buf_base = (UWORD8 *)ps_codec->pv_pic_buf_mgr + sizeof(buf_mgr_t);
+    ps_codec->i4_total_pic_buf_size = ps_mem_rec->u4_mem_size - sizeof(buf_mgr_t);
+
+
+
+
+
+#ifdef APPLY_CONCEALMENT
+    {
+
+        UWORD32 mem_loc;
+
+        icncl_init_ip_t cncl_init_ip;
+        icncl_init_op_t cncl_init_op;
+        iv_mem_rec_t *ps_mem_rec;
+        DecStruct *ps_codec;
+
+        ps_mem_rec = dec_init_ip->s_ivd_init_ip_t.pv_mem_rec_location;
+        mem_loc = MEM_REC_CNT;
+
+        ps_codec->ps_conceal = (iv_obj_t *)ps_mem_rec[mem_loc].pv_base;
+        ps_codec->i4_first_frame_done = 0;
+
+        cncl_init_ip.u4_size = sizeof(icncl_init_ip_t);
+        cncl_init_ip.pv_mem_rec_location = &(ps_mem_rec[mem_loc]);
+        cncl_init_ip.e_cmd = IV_CMD_INIT;
+
+        status = icncl_api_function(ps_codec->ps_conceal, (void *)&cncl_init_ip, (void *)&cncl_init_op);
+
+    }
+#endif //APPLY_CONCEALMENT
+
+    status = ihevcd_init(ps_codec);
+
+    TRACE_INIT(NULL);
+    STATS_INIT();
+    return status;
+}
+/**
+*******************************************************************************
+*
+* @brief
+*  Retrieves mem records passed to the codec
+*
+* @par Description:
+*  Retrieves memrecs passed earlier
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_retrieve_memrec(iv_obj_t *ps_codec_obj,
+                              void *pv_api_ip,
+                              void *pv_api_op)
+{
+
+    iv_retrieve_mem_rec_ip_t *dec_clr_ip;
+    iv_retrieve_mem_rec_op_t *dec_clr_op;
+    codec_t *ps_codec;
+    dec_clr_ip = (iv_retrieve_mem_rec_ip_t *)pv_api_ip;
+    dec_clr_op = (iv_retrieve_mem_rec_op_t *)pv_api_op;
+    ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+#ifdef GPU_BUILD
+    ihevcd_gpu_mc_deinit(&ps_codec->s_gpu_ctxt);
+
+#endif
+
+    if(ps_codec->i4_init_done != 1)
+    {
+        dec_clr_op->u4_error_code |= 1 << IVD_FATALERROR;
+        dec_clr_op->u4_error_code |= IHEVCD_INIT_NOT_DONE;
+        return IV_FAIL;
+    }
+
+    memcpy(dec_clr_ip->pv_mem_rec_location, ps_codec->ps_mem_rec_backup,
+           MEM_REC_CNT * (sizeof(iv_mem_rec_t)));
+    dec_clr_op->u4_num_mem_rec_filled = MEM_REC_CNT;
+
+#ifdef APPLY_CONCEALMENT
+    {
+        IV_API_CALL_STATUS_T status;
+        icncl_fill_mem_rec_ip_t cncl_fill_ip;
+        icncl_fill_mem_rec_op_t cncl_fill_op;
+
+        iv_mem_rec_t *ps_mem_rec;
+
+        UWORD8 mem_loc = MEM_REC_CNT;
+        UWORD8 num_cncl_mem = 0;
+
+        ps_mem_rec = dec_clr_ip->pv_mem_rec_location;
+
+        cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.e_cmd = IV_CMD_FILL_NUM_MEM_REC;
+        cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location = &(ps_mem_rec[mem_loc]);
+        cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.u4_size = sizeof(icncl_fill_mem_rec_ip_t);
+
+        status = icncl_api_function(NULL, (void *)&cncl_fill_ip, (void *)&cncl_fill_op);
+
+        cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.e_cmd = IV_CMD_RETRIEVE_MEMREC;
+        cncl_fill_op.s_ivd_fill_mem_rec_op_t.u4_size = sizeof(icncl_fill_mem_rec_op_t);
+
+        status = icncl_api_function(ps_codec->ps_conceal, (void *)&cncl_fill_ip, (void *)&cncl_fill_op);
+
+        if(status == IV_SUCCESS)
+        {
+            /* Add the concealment library's memory requirements */
+            dec_clr_op->u4_num_mem_rec_filled += cncl_fill_op.s_ivd_fill_mem_rec_op_t.u4_num_mem_rec_filled;
+        }
+    }
+#endif //APPLY_CONCEALMENT
+    DEBUG("Retrieve num mem recs: %d\n",
+                    dec_clr_op->u4_num_mem_rec_filled);
+    STATS_PRINT();
+    ihevcd_jobq_free((jobq_t *)ps_codec->pv_proc_jobq);
+
+
+
+    return IV_SUCCESS;
+
+}
+/**
+*******************************************************************************
+*
+* @brief
+*  Passes display buffer from application to codec
+*
+* @par Description:
+*  Adds display buffer to the codec
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_set_display_frame(iv_obj_t *ps_codec_obj,
+                                void *pv_api_ip,
+                                void *pv_api_op)
+{
+    WORD32 ret = IV_SUCCESS;
+
+    ivd_set_display_frame_ip_t *ps_dec_disp_ip;
+    ivd_set_display_frame_op_t *ps_dec_disp_op;
+
+    WORD32 i;
+
+    codec_t *ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+    ps_dec_disp_ip = (ivd_set_display_frame_ip_t *)pv_api_ip;
+    ps_dec_disp_op = (ivd_set_display_frame_op_t *)pv_api_op;
+
+    ps_codec->i4_num_disp_bufs = 0;
+    if(ps_codec->i4_share_disp_buf)
+    {
+        UWORD32 num_bufs = ps_dec_disp_ip->num_disp_bufs;
+        pic_buf_t *ps_pic_buf;
+        UWORD8 *pu1_buf;
+        WORD32 buf_ret;
+        WORD32 strd;
+        strd = ps_codec->i4_strd;
+        if(0 == strd)
+            strd = ps_codec->i4_max_wd + PAD_WD;
+        num_bufs = MIN(num_bufs, BUF_MGR_MAX_CNT);
+        ps_codec->i4_num_disp_bufs = num_bufs;
+
+        ps_pic_buf = (pic_buf_t *)ps_codec->ps_pic_buf;
+        for(i = 0; i < (WORD32)num_bufs; i++)
+        {
+            pu1_buf =  ps_dec_disp_ip->s_disp_buffer[i].pu1_bufs[0];
+            ps_pic_buf->pu1_luma = pu1_buf + strd * PAD_TOP + PAD_LEFT;
+
+            pu1_buf =  ps_dec_disp_ip->s_disp_buffer[i].pu1_bufs[1];
+            ps_pic_buf->pu1_chroma = pu1_buf + strd * (PAD_TOP / 2) + PAD_LEFT;
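+
+            /* The luma origin is offset past the top and left padding; the
+             * chroma plane has half the rows, hence PAD_TOP / 2 */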
+
+            buf_ret = ihevc_buf_mgr_add((buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_pic_buf, i);
+
+            if(0 != buf_ret)
+            {
+                ps_codec->i4_error_code = IHEVCD_BUF_MGR_ERROR;
+                return IHEVCD_BUF_MGR_ERROR;
+            }
+
+            /* Mark pic buf as needed for display */
+            /* This ensures that till the buffer is explicitly passed to the codec,
+             * application owns the buffer. Decoder is allowed to use a buffer only
+             * when application sends it through fill this buffer call in OMX
+             */
+            ihevc_buf_mgr_set_status((buf_mgr_t *)ps_codec->pv_pic_buf_mgr, i, BUF_MGR_DISP);
+
+            ps_pic_buf++;
+
+        }
+    }
+
+    ps_dec_disp_op->u4_error_code = 0;
+    return ret;
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Sets the decoder in flush mode. Decoder will come out of flush only
+* after returning all the buffers or at reset
+*
+* @par Description:
+*  Sets the decoder in flush mode
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_set_flush_mode(iv_obj_t *ps_codec_obj,
+                             void *pv_api_ip,
+                             void *pv_api_op)
+{
+
+    codec_t *ps_codec;
+    ivd_ctl_flush_op_t *ps_ctl_op = (ivd_ctl_flush_op_t *)pv_api_op;
+    UNUSED(pv_api_ip);
+    ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+    /* Signal flush frame control call */
+    ps_codec->i4_flush_mode = 1;
+
+    ps_ctl_op->u4_error_code = 0;
+
+    /* Set pic count to zero, so that decoder starts buffering again */
+    /* once it comes out of flush mode */
+    ps_codec->u4_pic_cnt = 0;
+    ps_codec->u4_disp_cnt = 0;
+    return IV_SUCCESS;
+
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Gets decoder status and buffer requirements
+*
+* @par Description:
+*  Gets the decoder status
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_get_status(iv_obj_t *ps_codec_obj,
+                         void *pv_api_ip,
+                         void *pv_api_op)
+{
+
+    WORD32 i;
+    codec_t *ps_codec;
+    WORD32 wd, ht;
+    ivd_ctl_getstatus_op_t *ps_ctl_op = (ivd_ctl_getstatus_op_t *)pv_api_op;
+
+    UNUSED(pv_api_ip);
+
+    ps_ctl_op->u4_error_code = 0;
+
+    ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+    ps_ctl_op->u4_min_num_in_bufs = MIN_IN_BUFS;
+    if(ps_codec->e_chroma_fmt == IV_YUV_420P)
+        ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_420;
+    else if(ps_codec->e_chroma_fmt == IV_YUV_422ILE)
+        ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_422ILE;
+    else if(ps_codec->e_chroma_fmt == IV_RGB_565)
+        ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_RGB565;
+    else if(ps_codec->e_chroma_fmt == IV_RGBA_8888)
+        ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_RGBA8888;
+    else if((ps_codec->e_chroma_fmt == IV_YUV_420SP_UV)
+                    || (ps_codec->e_chroma_fmt == IV_YUV_420SP_VU))
+        ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_420SP;
+
+    ps_ctl_op->u4_num_disp_bufs = 1;
+
+    for(i = 0; i < (WORD32)ps_ctl_op->u4_min_num_in_bufs; i++)
+    {
+        ps_ctl_op->u4_min_in_buf_size[i] = MAX((ps_codec->i4_wd * ps_codec->i4_ht), MIN_BITSBUF_SIZE);
+    }
+
+    wd = ps_codec->i4_wd;
+    ht = ps_codec->i4_ht;
+
+    if(ps_codec->i4_sps_done)
+    {
+        if(0 == ps_codec->i4_share_disp_buf)
+        {
+            wd = ps_codec->i4_disp_wd;
+            ht = ps_codec->i4_disp_ht;
+
+        }
+        else
+        {
+            wd = ps_codec->i4_disp_strd;
+            ht = ps_codec->i4_ht + PAD_HT;
+        }
+    }
+    else
+    {
+        if(1 == ps_codec->i4_share_disp_buf)
+        {
+            wd = ALIGN32(wd + PAD_WD);
+            ht += PAD_HT;
+        }
+    }
+
+    if(ps_codec->i4_disp_strd > wd)
+        wd = ps_codec->i4_disp_strd;
+
+    if(0 == ps_codec->i4_share_disp_buf)
+        ps_ctl_op->u4_num_disp_bufs = 1;
+    else
+    {
+        WORD32 pic_size;
+        WORD32 max_dpb_size;
+
+        if(ps_codec->i4_sps_done)
+        {
+            sps_t *ps_sps = (ps_codec->s_parse.ps_sps_base + ps_codec->i4_sps_id);
+            WORD32 reorder_pic_cnt;
+            WORD32 ref_pic_cnt;
+            WORD32 level;
+
+            reorder_pic_cnt = MIN(ps_sps->ai1_sps_max_num_reorder_pics[0], ps_codec->i4_init_num_reorder);
+            pic_size = ps_sps->i2_pic_width_in_luma_samples * ps_sps->i2_pic_height_in_luma_samples;
+
+            level = ps_codec->i4_init_level;
+            max_dpb_size = ihevcd_get_dpb_size(level, pic_size);
+            ref_pic_cnt = max_dpb_size;
+            ps_ctl_op->u4_num_disp_bufs = reorder_pic_cnt;
+
+            ps_ctl_op->u4_num_disp_bufs += ref_pic_cnt + 1;
+
+        }
+        else
+        {
+            pic_size = ps_codec->i4_max_wd * ps_codec->i4_max_ht;
+            max_dpb_size = ihevcd_get_dpb_size(ps_codec->i4_init_level, pic_size);
+            ps_ctl_op->u4_num_disp_bufs = 2 * max_dpb_size;
+
+            ps_ctl_op->u4_num_disp_bufs = MIN(ps_ctl_op->u4_num_disp_bufs,
+                            (ps_codec->i4_init_num_ref + ps_codec->i4_init_num_reorder + 1));
+
+        }
+
+        ps_ctl_op->u4_num_disp_bufs = MIN(
+                        ps_ctl_op->u4_num_disp_bufs, 32);
+    }
+
+    if(ps_codec->e_chroma_fmt == IV_YUV_420P)
+    {
+        ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht);
+        ps_ctl_op->u4_min_out_buf_size[1] = (wd * ht) >> 2;
+        ps_ctl_op->u4_min_out_buf_size[2] = (wd * ht) >> 2;
+    }
+    else if(ps_codec->e_chroma_fmt == IV_YUV_422ILE)
+    {
+        ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 2;
+        ps_ctl_op->u4_min_out_buf_size[1] =
+                        ps_ctl_op->u4_min_out_buf_size[2] = 0;
+    }
+    else if(ps_codec->e_chroma_fmt == IV_RGB_565)
+    {
+        ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 2;
+        ps_ctl_op->u4_min_out_buf_size[1] =
+                        ps_ctl_op->u4_min_out_buf_size[2] = 0;
+    }
+    else if(ps_codec->e_chroma_fmt == IV_RGBA_8888)
+    {
+        ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 4;
+        ps_ctl_op->u4_min_out_buf_size[1] =
+                        ps_ctl_op->u4_min_out_buf_size[2] = 0;
+    }
+    else if((ps_codec->e_chroma_fmt == IV_YUV_420SP_UV)
+                    || (ps_codec->e_chroma_fmt == IV_YUV_420SP_VU))
+    {
+        ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht);
+        ps_ctl_op->u4_min_out_buf_size[1] = (wd * ht) >> 1;
+        ps_ctl_op->u4_min_out_buf_size[2] = 0;
+    }
+    ps_ctl_op->u4_pic_ht = ht;
+    ps_ctl_op->u4_pic_wd = wd;
+    ps_ctl_op->u4_frame_rate = 30000;
+    ps_ctl_op->u4_bit_rate = 1000000;
+    ps_ctl_op->e_content_type = IV_PROGRESSIVE;
+    ps_ctl_op->e_output_chroma_format = ps_codec->e_chroma_fmt;
+    ps_codec->i4_num_disp_bufs = ps_ctl_op->u4_num_disp_bufs;
+    return IV_SUCCESS;
+}
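+/* Illustrative sketch of querying status through the control API
+ * (application-side names are hypothetical):
+ *
+ *     ivd_ctl_getstatus_ip_t s_ip;
+ *     ivd_ctl_getstatus_op_t s_op;
+ *     s_ip.u4_size = sizeof(s_ip);
+ *     s_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ *     s_ip.e_sub_cmd = IVD_CMD_CTL_GETPARAMS;
+ *     s_op.u4_size = sizeof(s_op);
+ *     ihevcd_cxa_api_function(ps_codec_obj, &s_ip, &s_op);
+ *
+ * For IV_YUV_420SP_UV output, for example, s_op.u4_min_out_buf_size[0] is
+ * the luma plane size (wd * ht) and [1] the interleaved chroma plane size
+ * ((wd * ht) >> 1), as computed above.
+ */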
+/**
+*******************************************************************************
+*
+* @brief
+*  Gets decoder buffer requirements
+*
+* @par Description:
+*  Gets the decoder buffer requirements. If called before header decode,
+* buffer requirements are based on max_wd and max_ht; otherwise the actual
+* width and height are used
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_buf_info(iv_obj_t *ps_codec_obj,
+                           void *pv_api_ip,
+                           void *pv_api_op)
+{
+
+    codec_t *ps_codec;
+    UWORD32 i = 0;
+    WORD32 wd, ht;
+    ivd_ctl_getbufinfo_op_t *ps_ctl_op =
+                    (ivd_ctl_getbufinfo_op_t *)pv_api_op;
+
+    UNUSED(pv_api_ip);
+    ps_ctl_op->u4_error_code = 0;
+
+    ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+    ps_ctl_op->u4_min_num_in_bufs = MIN_IN_BUFS;
+    if(ps_codec->e_chroma_fmt == IV_YUV_420P)
+        ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_420;
+    else if(ps_codec->e_chroma_fmt == IV_YUV_422ILE)
+        ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_422ILE;
+    else if(ps_codec->e_chroma_fmt == IV_RGB_565)
+        ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_RGB565;
+    else if(ps_codec->e_chroma_fmt == IV_RGBA_8888)
+        ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_RGBA8888;
+    else if((ps_codec->e_chroma_fmt == IV_YUV_420SP_UV)
+                    || (ps_codec->e_chroma_fmt == IV_YUV_420SP_VU))
+        ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_420SP;
+
+    ps_ctl_op->u4_num_disp_bufs = 1;
+
+    for(i = 0; i < ps_ctl_op->u4_min_num_in_bufs; i++)
+    {
+        ps_ctl_op->u4_min_in_buf_size[i] = MAX((ps_codec->i4_wd * ps_codec->i4_ht), MIN_BITSBUF_SIZE);
+    }
+
+    wd = ps_codec->i4_max_wd;
+    ht = ps_codec->i4_max_ht;
+
+    if(ps_codec->i4_sps_done)
+    {
+        if(0 == ps_codec->i4_share_disp_buf)
+        {
+            wd = ps_codec->i4_disp_wd;
+            ht = ps_codec->i4_disp_ht;
+
+        }
+        else
+        {
+            wd = ps_codec->i4_disp_strd;
+            ht = ps_codec->i4_ht + PAD_HT;
+        }
+    }
+    else
+    {
+        if(1 == ps_codec->i4_share_disp_buf)
+        {
+            wd = ALIGN32(wd + PAD_WD);
+            ht += PAD_HT;
+        }
+    }
+
+    if(ps_codec->i4_disp_strd > wd)
+        wd = ps_codec->i4_disp_strd;
+
+    if(0 == ps_codec->i4_share_disp_buf)
+        ps_ctl_op->u4_num_disp_bufs = 1;
+    else
+    {
+        WORD32 pic_size;
+        WORD32 max_dpb_size;
+
+        if(ps_codec->i4_sps_done)
+        {
+            sps_t *ps_sps = (ps_codec->s_parse.ps_sps_base + ps_codec->i4_sps_id);
+            WORD32 reorder_pic_cnt;
+            WORD32 ref_pic_cnt;
+            WORD32 level;
+
+            reorder_pic_cnt = MIN(ps_sps->ai1_sps_max_num_reorder_pics[0], ps_codec->i4_init_num_reorder);
+            pic_size = ps_sps->i2_pic_width_in_luma_samples * ps_sps->i2_pic_height_in_luma_samples;
+
+            level = ps_codec->i4_init_level;
+            max_dpb_size = ihevcd_get_dpb_size(level, pic_size);
+            ref_pic_cnt = max_dpb_size;
+            ps_ctl_op->u4_num_disp_bufs = reorder_pic_cnt;
+
+            ps_ctl_op->u4_num_disp_bufs += ref_pic_cnt + 1;
+
+        }
+        else
+        {
+            pic_size = ps_codec->i4_max_wd * ps_codec->i4_max_ht;
+            max_dpb_size = ihevcd_get_dpb_size(ps_codec->i4_init_level, pic_size);
+            ps_ctl_op->u4_num_disp_bufs = 2 * max_dpb_size;
+
+            ps_ctl_op->u4_num_disp_bufs = MIN(ps_ctl_op->u4_num_disp_bufs,
+                            (ps_codec->i4_init_num_ref + ps_codec->i4_init_num_reorder + 1));
+
+        }
+
+        ps_ctl_op->u4_num_disp_bufs = MIN(
+                        ps_ctl_op->u4_num_disp_bufs, 32);
+
+    }
+
+    if(ps_codec->e_chroma_fmt == IV_YUV_420P)
+    {
+        ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht);
+        ps_ctl_op->u4_min_out_buf_size[1] = (wd * ht) >> 2;
+        ps_ctl_op->u4_min_out_buf_size[2] = (wd * ht) >> 2;
+    }
+    else if(ps_codec->e_chroma_fmt == IV_YUV_422ILE)
+    {
+        ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 2;
+        ps_ctl_op->u4_min_out_buf_size[1] =
+                        ps_ctl_op->u4_min_out_buf_size[2] = 0;
+    }
+    else if(ps_codec->e_chroma_fmt == IV_RGB_565)
+    {
+        ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 2;
+        ps_ctl_op->u4_min_out_buf_size[1] =
+                        ps_ctl_op->u4_min_out_buf_size[2] = 0;
+    }
+    else if(ps_codec->e_chroma_fmt == IV_RGBA_8888)
+    {
+        ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 4;
+        ps_ctl_op->u4_min_out_buf_size[1] =
+                        ps_ctl_op->u4_min_out_buf_size[2] = 0;
+    }
+    else if((ps_codec->e_chroma_fmt == IV_YUV_420SP_UV)
+                    || (ps_codec->e_chroma_fmt == IV_YUV_420SP_VU))
+    {
+        ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht);
+        ps_ctl_op->u4_min_out_buf_size[1] = (wd * ht) >> 1;
+        ps_ctl_op->u4_min_out_buf_size[2] = 0;
+    }
+    ps_codec->i4_num_disp_bufs = ps_ctl_op->u4_num_disp_bufs;
+
+    return IV_SUCCESS;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Sets dynamic parameters
+*
+* @par Description:
+*  Sets dynamic parameters. Note that frame skip and decode header mode are
+* dynamic; dynamic change in stride is not supported
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_set_params(iv_obj_t *ps_codec_obj,
+                         void *pv_api_ip,
+                         void *pv_api_op)
+{
+
+    codec_t *ps_codec;
+    WORD32 ret = IV_SUCCESS;
+    WORD32 strd;
+    ivd_ctl_set_config_ip_t *s_ctl_dynparams_ip =
+                    (ivd_ctl_set_config_ip_t *)pv_api_ip;
+    ivd_ctl_set_config_op_t *s_ctl_dynparams_op =
+                    (ivd_ctl_set_config_op_t *)pv_api_op;
+
+    ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+    s_ctl_dynparams_op->u4_error_code = 0;
+
+    ps_codec->e_pic_skip_mode = s_ctl_dynparams_ip->e_frm_skip_mode;
+
+    if(s_ctl_dynparams_ip->e_frm_skip_mode != IVD_SKIP_NONE)
+    {
+
+        if((s_ctl_dynparams_ip->e_frm_skip_mode != IVD_SKIP_P) &&
+           (s_ctl_dynparams_ip->e_frm_skip_mode != IVD_SKIP_B) &&
+           (s_ctl_dynparams_ip->e_frm_skip_mode != IVD_SKIP_PB))
+        {
+            s_ctl_dynparams_op->u4_error_code = (1 << IVD_UNSUPPORTEDPARAM);
+            ret = IV_FAIL;
+        }
+    }
+
+    strd = ps_codec->i4_disp_strd;
+    if(1 == ps_codec->i4_share_disp_buf)
+    {
+        strd = ps_codec->i4_strd;
+    }
+
+
+    if((-1 != (WORD32)s_ctl_dynparams_ip->u4_disp_wd) &&
+                    (0  != s_ctl_dynparams_ip->u4_disp_wd) &&
+                    (0  != strd) &&
+                    ((WORD32)s_ctl_dynparams_ip->u4_disp_wd < strd))
+    {
+        s_ctl_dynparams_op->u4_error_code |= (1 << IVD_UNSUPPORTEDPARAM);
+        s_ctl_dynparams_op->u4_error_code |= IHEVCD_INVALID_DISP_STRD;
+        ret = IV_FAIL;
+    }
+    else
+    {
+        if((WORD32)s_ctl_dynparams_ip->u4_disp_wd >= ps_codec->i4_wd)
+        {
+            strd = s_ctl_dynparams_ip->u4_disp_wd;
+        }
+        else if(0 == ps_codec->i4_sps_done ||
+                        0 == ps_codec->i4_pps_done)
+        {
+            strd = s_ctl_dynparams_ip->u4_disp_wd;
+        }
+        else if(s_ctl_dynparams_ip->u4_disp_wd == 0)
+        {
+            strd = ps_codec->i4_disp_strd;
+        }
+        else
+        {
+            strd = 0;
+            s_ctl_dynparams_op->u4_error_code |= (1 << IVD_UNSUPPORTEDPARAM);
+            s_ctl_dynparams_op->u4_error_code |= IHEVCD_INVALID_DISP_STRD;
+            ret = IV_FAIL;
+        }
+    }
+
+    ps_codec->i4_disp_strd = strd;
+    if(1 == ps_codec->i4_share_disp_buf)
+    {
+        ps_codec->i4_strd = strd;
+    }
+
+    if(s_ctl_dynparams_ip->e_vid_dec_mode == IVD_DECODE_FRAME)
+        ps_codec->i4_header_mode = 0;
+    else if(s_ctl_dynparams_ip->e_vid_dec_mode == IVD_DECODE_HEADER)
+        ps_codec->i4_header_mode = 1;
+    else
+    {
+
+        s_ctl_dynparams_op->u4_error_code = (1 << IVD_UNSUPPORTEDPARAM);
+        ps_codec->i4_header_mode = 1;
+        ret = IV_FAIL;
+    }
+
+
+    return ret;
+
+}
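+/* Sketch of a dynamic-parameter call that switches the decoder to
+ * header-decode mode (hypothetical application-side snippet; field values
+ * are examples only):
+ *
+ *     ivd_ctl_set_config_ip_t s_ip;
+ *     ivd_ctl_set_config_op_t s_op;
+ *     s_ip.u4_size = sizeof(s_ip);
+ *     s_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ *     s_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+ *     s_ip.u4_disp_wd = 0;                 // retain the current stride
+ *     s_ip.e_frm_skip_mode = IVD_SKIP_NONE;
+ *     s_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+ *     s_ip.e_vid_dec_mode = IVD_DECODE_HEADER;
+ *     s_op.u4_size = sizeof(s_op);
+ *     ihevcd_cxa_api_function(ps_codec_obj, &s_ip, &s_op);
+ */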
+/**
+*******************************************************************************
+*
+* @brief
+*  Resets the decoder state
+*
+* @par Description:
+*  Resets the decoder state by calling ihevcd_init()
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_reset(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
+{
+    codec_t *ps_codec;
+    ivd_ctl_reset_op_t *s_ctl_reset_op = (ivd_ctl_reset_op_t *)pv_api_op;
+    UNUSED(pv_api_ip);
+    ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+    if(ps_codec != NULL)
+    {
+        DEBUG("\nReset called \n");
+        ihevcd_init(ps_codec);
+    }
+    else
+    {
+        DEBUG("\nReset called without Initializing the decoder\n");
+        s_ctl_reset_op->u4_error_code = IHEVCD_INIT_NOT_DONE;
+    }
+
+    return IV_SUCCESS;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Releases a display buffer from the application back to the codec, to
+* signal that the codec may write to this buffer if required. Until release
+* is called, the codec cannot write to this buffer
+*
+* @par Description:
+*  Marks the buffer as display done
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_rel_display_frame(iv_obj_t *ps_codec_obj,
+                                void *pv_api_ip,
+                                void *pv_api_op)
+{
+
+    ivd_rel_display_frame_ip_t *ps_dec_rel_disp_ip;
+    ivd_rel_display_frame_op_t *ps_dec_rel_disp_op;
+
+    codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+    ps_dec_rel_disp_ip = (ivd_rel_display_frame_ip_t *)pv_api_ip;
+    ps_dec_rel_disp_op = (ivd_rel_display_frame_op_t *)pv_api_op;
+
+    UNUSED(ps_dec_rel_disp_op);
+
+    if(0 == ps_codec->i4_share_disp_buf)
+    {
+        return IV_SUCCESS;
+    }
+
+    ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_dec_rel_disp_ip->u4_disp_buf_id, BUF_MGR_DISP);
+
+    return IV_SUCCESS;
+}
+/**
+*******************************************************************************
+*
+* @brief
+*  Sets degrade params
+*
+* @par Description:
+*  Sets degrade params.
+*  Refer to ihevcd_cxa_ctl_degrade_ip_t definition for details
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_set_degrade(iv_obj_t *ps_codec_obj,
+                          void *pv_api_ip,
+                          void *pv_api_op)
+{
+    ihevcd_cxa_ctl_degrade_ip_t *ps_ip;
+    ihevcd_cxa_ctl_degrade_op_t *ps_op;
+    codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+    ps_ip = (ihevcd_cxa_ctl_degrade_ip_t *)pv_api_ip;
+    ps_op = (ihevcd_cxa_ctl_degrade_op_t *)pv_api_op;
+
+    ps_codec->i4_degrade_type = ps_ip->i4_degrade_type;
+    ps_codec->i4_nondegrade_interval = ps_ip->i4_nondegrade_interval;
+    ps_codec->i4_degrade_pics = ps_ip->i4_degrade_pics;
+
+    ps_op->u4_error_code = 0;
+    ps_codec->i4_degrade_pic_cnt = 0;
+
+    return IV_SUCCESS;
+}
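+/* Illustrative degrade-control sketch (hypothetical values; the meaning of
+ * the i4_degrade_type bits is defined alongside ihevcd_cxa_ctl_degrade_ip_t):
+ *
+ *     ihevcd_cxa_ctl_degrade_ip_t s_ip;
+ *     ihevcd_cxa_ctl_degrade_op_t s_op;
+ *     s_ip.u4_size = sizeof(s_ip);
+ *     s_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ *     s_ip.e_sub_cmd =
+ *         (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_DEGRADE;
+ *     s_ip.i4_degrade_type = 1;        // example degrade type
+ *     s_ip.i4_nondegrade_interval = 4; // decode 4 pictures normally...
+ *     s_ip.i4_degrade_pics = 2;        // ...then degrade the next 2
+ *     ihevcd_cxa_api_function(ps_codec_obj, &s_ip, &s_op);
+ */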
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Gets frame dimensions/offsets
+*
+* @par Description:
+*  Gets frame buffer characteristics such as x and y offsets, display
+* dimensions and buffer dimensions
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_get_frame_dimensions(iv_obj_t *ps_codec_obj,
+                                   void *pv_api_ip,
+                                   void *pv_api_op)
+{
+    ihevcd_cxa_ctl_get_frame_dimensions_ip_t *ps_ip;
+    ihevcd_cxa_ctl_get_frame_dimensions_op_t *ps_op;
+    codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+    WORD32 disp_wd, disp_ht, buffer_wd, buffer_ht, x_offset, y_offset;
+    ps_ip = (ihevcd_cxa_ctl_get_frame_dimensions_ip_t *)pv_api_ip;
+    ps_op = (ihevcd_cxa_ctl_get_frame_dimensions_op_t *)pv_api_op;
+    UNUSED(ps_ip);
+    if(ps_codec->i4_sps_done)
+    {
+        disp_wd = ps_codec->i4_disp_wd;
+        disp_ht = ps_codec->i4_disp_ht;
+
+        if(0 == ps_codec->i4_share_disp_buf)
+        {
+            buffer_wd = disp_wd;
+            buffer_ht = disp_ht;
+        }
+        else
+        {
+            buffer_wd = ps_codec->i4_strd;
+            buffer_ht = ps_codec->i4_ht + PAD_HT;
+        }
+    }
+    else
+    {
+
+        disp_wd = ps_codec->i4_max_wd;
+        disp_ht = ps_codec->i4_max_ht;
+
+        if(0 == ps_codec->i4_share_disp_buf)
+        {
+            buffer_wd = disp_wd;
+            buffer_ht = disp_ht;
+        }
+        else
+        {
+            buffer_wd = ALIGN16(disp_wd) + PAD_WD;
+            buffer_ht = ALIGN16(disp_ht) + PAD_HT;
+
+        }
+    }
+    if(ps_codec->i4_strd > buffer_wd)
+        buffer_wd = ps_codec->i4_strd;
+
+    if(0 == ps_codec->i4_share_disp_buf)
+    {
+        x_offset = 0;
+        y_offset = 0;
+    }
+    else
+    {
+        y_offset = PAD_TOP;
+        x_offset = PAD_LEFT;
+#if 0
+        if((NULL != ps_codec->ps_seqParams) && (1 == (ps_codec->ps_seqParams->u1_is_valid)) && (0 != ps_codec->u2_crop_offset_y))
+        {
+            y_offset += ps_codec->u2_crop_offset_y / ps_codec->i4_strd;
+            x_offset += ps_codec->u2_crop_offset_y % ps_codec->i4_strd;
+        }
+#endif
+    }
+
+    ps_op->u4_disp_wd[0] = disp_wd;
+    ps_op->u4_disp_ht[0] = disp_ht;
+    ps_op->u4_buffer_wd[0] = buffer_wd;
+    ps_op->u4_buffer_ht[0] = buffer_ht;
+    ps_op->u4_x_offset[0] = x_offset;
+    ps_op->u4_y_offset[0] = y_offset;
+
+    ps_op->u4_disp_wd[1] = ps_op->u4_disp_wd[2] = ((ps_op->u4_disp_wd[0] + 1)
+                    >> 1);
+    ps_op->u4_disp_ht[1] = ps_op->u4_disp_ht[2] = ((ps_op->u4_disp_ht[0] + 1)
+                    >> 1);
+    ps_op->u4_buffer_wd[1] = ps_op->u4_buffer_wd[2] = (ps_op->u4_buffer_wd[0]
+                    >> 1);
+    ps_op->u4_buffer_ht[1] = ps_op->u4_buffer_ht[2] = (ps_op->u4_buffer_ht[0]
+                    >> 1);
+    ps_op->u4_x_offset[1] = ps_op->u4_x_offset[2] = (ps_op->u4_x_offset[0]
+                    >> 1);
+    ps_op->u4_y_offset[1] = ps_op->u4_y_offset[2] = (ps_op->u4_y_offset[0]
+                    >> 1);
+
+    if((ps_codec->e_chroma_fmt == IV_YUV_420SP_UV)
+                    || (ps_codec->e_chroma_fmt == IV_YUV_420SP_VU))
+    {
+        ps_op->u4_disp_wd[2] = 0;
+        ps_op->u4_disp_ht[2] = 0;
+        ps_op->u4_buffer_wd[2] = 0;
+        ps_op->u4_buffer_ht[2] = 0;
+        ps_op->u4_x_offset[2] = 0;
+        ps_op->u4_y_offset[2] = 0;
+
+        ps_op->u4_disp_wd[1] <<= 1;
+        ps_op->u4_buffer_wd[1] <<= 1;
+        ps_op->u4_x_offset[1] <<= 1;
+    }
+
+    return IV_SUCCESS;
+
+}
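+/* Worked example of the derivation above (values illustrative): for a
+ * 1920x1080 stream without shared display buffers, u4_disp_wd[0] = 1920 and
+ * u4_disp_ht[0] = 1080, so the chroma planes get (1920 + 1) >> 1 = 960 and
+ * (1080 + 1) >> 1 = 540. For the semi-planar formats the third plane is
+ * zeroed and the second plane's width and x offset are doubled, describing
+ * a single 1920x540 interleaved CbCr plane.
+ */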
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Gets vui parameters
+*
+* @par Description:
+*  Gets VUI parameters
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_vui_params(iv_obj_t *ps_codec_obj,
+                             void *pv_api_ip,
+                             void *pv_api_op)
+{
+    ihevcd_cxa_ctl_get_vui_params_ip_t *ps_ip;
+    ihevcd_cxa_ctl_get_vui_params_op_t *ps_op;
+    codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+    sps_t *ps_sps;
+    vui_t *ps_vui;
+    WORD32 i;
+
+    ps_ip = (ihevcd_cxa_ctl_get_vui_params_ip_t *)pv_api_ip;
+    ps_op = (ihevcd_cxa_ctl_get_vui_params_op_t *)pv_api_op;
+
+    if(0 == ps_codec->i4_sps_done)
+    {
+        ps_op->u4_error_code = IHEVCD_VUI_PARAMS_NOT_FOUND;
+        return IV_FAIL;
+    }
+
+    ps_sps = ps_codec->s_parse.ps_sps;
+    if(0 == ps_sps->i1_sps_valid || 0 == ps_sps->i1_vui_parameters_present_flag)
+    {
+        WORD32 sps_idx = 0;
+        ps_sps = ps_codec->ps_sps_base;
+
+        while((0 == ps_sps->i1_sps_valid) || (0 == ps_sps->i1_vui_parameters_present_flag))
+        {
+            sps_idx++;
+            ps_sps++;
+
+            if(sps_idx == MAX_SPS_CNT - 1)
+            {
+                ps_op->u4_error_code = IHEVCD_VUI_PARAMS_NOT_FOUND;
+                return IV_FAIL;
+            }
+        }
+    }
+
+    ps_vui = &ps_sps->s_vui_parameters;
+    UNUSED(ps_ip);
+
+    ps_op->u1_aspect_ratio_info_present_flag         =  ps_vui->u1_aspect_ratio_info_present_flag;
+    ps_op->u1_aspect_ratio_idc                       =  ps_vui->u1_aspect_ratio_idc;
+    ps_op->u2_sar_width                              =  ps_vui->u2_sar_width;
+    ps_op->u2_sar_height                             =  ps_vui->u2_sar_height;
+    ps_op->u1_overscan_info_present_flag             =  ps_vui->u1_overscan_info_present_flag;
+    ps_op->u1_overscan_appropriate_flag              =  ps_vui->u1_overscan_appropriate_flag;
+    ps_op->u1_video_signal_type_present_flag         =  ps_vui->u1_video_signal_type_present_flag;
+    ps_op->u1_video_format                           =  ps_vui->u1_video_format;
+    ps_op->u1_video_full_range_flag                  =  ps_vui->u1_video_full_range_flag;
+    ps_op->u1_colour_description_present_flag        =  ps_vui->u1_colour_description_present_flag;
+    ps_op->u1_colour_primaries                       =  ps_vui->u1_colour_primaries;
+    ps_op->u1_transfer_characteristics               =  ps_vui->u1_transfer_characteristics;
+    ps_op->u1_matrix_coefficients                    =  ps_vui->u1_matrix_coefficients;
+    ps_op->u1_chroma_loc_info_present_flag           =  ps_vui->u1_chroma_loc_info_present_flag;
+    ps_op->u1_chroma_sample_loc_type_top_field       =  ps_vui->u1_chroma_sample_loc_type_top_field;
+    ps_op->u1_chroma_sample_loc_type_bottom_field    =  ps_vui->u1_chroma_sample_loc_type_bottom_field;
+    ps_op->u1_neutral_chroma_indication_flag         =  ps_vui->u1_neutral_chroma_indication_flag;
+    ps_op->u1_field_seq_flag                         =  ps_vui->u1_field_seq_flag;
+    ps_op->u1_frame_field_info_present_flag          =  ps_vui->u1_frame_field_info_present_flag;
+    ps_op->u1_default_display_window_flag            =  ps_vui->u1_default_display_window_flag;
+    ps_op->u4_def_disp_win_left_offset               =  ps_vui->u4_def_disp_win_left_offset;
+    ps_op->u4_def_disp_win_right_offset              =  ps_vui->u4_def_disp_win_right_offset;
+    ps_op->u4_def_disp_win_top_offset                =  ps_vui->u4_def_disp_win_top_offset;
+    ps_op->u4_def_disp_win_bottom_offset             =  ps_vui->u4_def_disp_win_bottom_offset;
+    ps_op->u1_vui_hrd_parameters_present_flag        =  ps_vui->u1_vui_hrd_parameters_present_flag;
+    ps_op->u1_vui_timing_info_present_flag           =  ps_vui->u1_vui_timing_info_present_flag;
+    ps_op->u4_vui_num_units_in_tick                  =  ps_vui->u4_vui_num_units_in_tick;
+    ps_op->u4_vui_time_scale                         =  ps_vui->u4_vui_time_scale;
+    ps_op->u1_poc_proportional_to_timing_flag        =  ps_vui->u1_poc_proportional_to_timing_flag;
+    ps_op->u1_num_ticks_poc_diff_one_minus1          =  ps_vui->u1_num_ticks_poc_diff_one_minus1;
+    ps_op->u1_bitstream_restriction_flag             =  ps_vui->u1_bitstream_restriction_flag;
+    ps_op->u1_tiles_fixed_structure_flag             =  ps_vui->u1_tiles_fixed_structure_flag;
+    ps_op->u1_motion_vectors_over_pic_boundaries_flag =  ps_vui->u1_motion_vectors_over_pic_boundaries_flag;
+    ps_op->u1_restricted_ref_pic_lists_flag          =  ps_vui->u1_restricted_ref_pic_lists_flag;
+    ps_op->u4_min_spatial_segmentation_idc           =  ps_vui->u4_min_spatial_segmentation_idc;
+    ps_op->u1_max_bytes_per_pic_denom                =  ps_vui->u1_max_bytes_per_pic_denom;
+    ps_op->u1_max_bits_per_mincu_denom               =  ps_vui->u1_max_bits_per_mincu_denom;
+    ps_op->u1_log2_max_mv_length_horizontal          =  ps_vui->u1_log2_max_mv_length_horizontal;
+    ps_op->u1_log2_max_mv_length_vertical            =  ps_vui->u1_log2_max_mv_length_vertical;
+
+
+    /* HRD parameters */
+    ps_op->u1_timing_info_present_flag                         =    ps_vui->s_vui_hrd_parameters.u1_timing_info_present_flag;
+    ps_op->u4_num_units_in_tick                                =    ps_vui->s_vui_hrd_parameters.u4_num_units_in_tick;
+    ps_op->u4_time_scale                                       =    ps_vui->s_vui_hrd_parameters.u4_time_scale;
+    ps_op->u1_nal_hrd_parameters_present_flag                  =    ps_vui->s_vui_hrd_parameters.u1_nal_hrd_parameters_present_flag;
+    ps_op->u1_vcl_hrd_parameters_present_flag                  =    ps_vui->s_vui_hrd_parameters.u1_vcl_hrd_parameters_present_flag;
+    ps_op->u1_cpbdpb_delays_present_flag                       =    ps_vui->s_vui_hrd_parameters.u1_cpbdpb_delays_present_flag;
+    ps_op->u1_sub_pic_cpb_params_present_flag                  =    ps_vui->s_vui_hrd_parameters.u1_sub_pic_cpb_params_present_flag;
+    ps_op->u1_tick_divisor_minus2                              =    ps_vui->s_vui_hrd_parameters.u1_tick_divisor_minus2;
+    ps_op->u1_du_cpb_removal_delay_increment_length_minus1     =    ps_vui->s_vui_hrd_parameters.u1_du_cpb_removal_delay_increment_length_minus1;
+    ps_op->u1_sub_pic_cpb_params_in_pic_timing_sei_flag        =    ps_vui->s_vui_hrd_parameters.u1_sub_pic_cpb_params_in_pic_timing_sei_flag;
+    ps_op->u1_dpb_output_delay_du_length_minus1                =    ps_vui->s_vui_hrd_parameters.u1_dpb_output_delay_du_length_minus1;
+    ps_op->u4_bit_rate_scale                                   =    ps_vui->s_vui_hrd_parameters.u4_bit_rate_scale;
+    ps_op->u4_cpb_size_scale                                   =    ps_vui->s_vui_hrd_parameters.u4_cpb_size_scale;
+    ps_op->u4_cpb_size_du_scale                                =    ps_vui->s_vui_hrd_parameters.u4_cpb_size_du_scale;
+    ps_op->u1_initial_cpb_removal_delay_length_minus1          =    ps_vui->s_vui_hrd_parameters.u1_initial_cpb_removal_delay_length_minus1;
+    ps_op->u1_au_cpb_removal_delay_length_minus1               =    ps_vui->s_vui_hrd_parameters.u1_au_cpb_removal_delay_length_minus1;
+    ps_op->u1_dpb_output_delay_length_minus1                   =    ps_vui->s_vui_hrd_parameters.u1_dpb_output_delay_length_minus1;
+
+    for(i = 0; i < 6; i++)
+    {
+        ps_op->au1_fixed_pic_rate_general_flag[i]                  =    ps_vui->s_vui_hrd_parameters.au1_fixed_pic_rate_general_flag[i];
+        ps_op->au1_fixed_pic_rate_within_cvs_flag[i]               =    ps_vui->s_vui_hrd_parameters.au1_fixed_pic_rate_within_cvs_flag[i];
+        ps_op->au1_elemental_duration_in_tc_minus1[i]              =    ps_vui->s_vui_hrd_parameters.au1_elemental_duration_in_tc_minus1[i];
+        ps_op->au1_low_delay_hrd_flag[i]                           =    ps_vui->s_vui_hrd_parameters.au1_low_delay_hrd_flag[i];
+        ps_op->au1_cpb_cnt_minus1[i]                               =    ps_vui->s_vui_hrd_parameters.au1_cpb_cnt_minus1[i];
+    }
+
+
+    return IV_SUCCESS;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Sets Processor type
+*
+* @par Description:
+*  Sets Processor type
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_set_processor(iv_obj_t *ps_codec_obj,
+                            void *pv_api_ip,
+                            void *pv_api_op)
+{
+    ihevcd_cxa_ctl_set_processor_ip_t *ps_ip;
+    ihevcd_cxa_ctl_set_processor_op_t *ps_op;
+    codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+    ps_ip = (ihevcd_cxa_ctl_set_processor_ip_t *)pv_api_ip;
+    ps_op = (ihevcd_cxa_ctl_set_processor_op_t *)pv_api_op;
+
+    ps_codec->e_processor_arch = (IVD_ARCH_T)ps_ip->u4_arch;
+    ps_codec->e_processor_soc = (IVD_SOC_T)ps_ip->u4_soc;
+
+    ihevcd_init_function_ptr(ps_codec);
+
+    ihevcd_update_function_ptr(ps_codec);
+
+    if(ps_codec->e_processor_soc && (ps_codec->e_processor_soc <= SOC_HISI_37X))
+    {
+        /* 8th bit indicates if format conversion is to be done ahead */
+        if(ps_codec->e_processor_soc & 0x80)
+            ps_codec->u4_enable_fmt_conv_ahead = 1;
+
+        /* Lower 7 bit indicate NCTB - if non-zero */
+        ps_codec->e_processor_soc &= 0x7F;
+
+        if(ps_codec->e_processor_soc)
+            ps_codec->u4_nctb = ps_codec->e_processor_soc;
+
+
+    }
+
+    if((ps_codec->e_processor_soc == SOC_HISI_37X) && (ps_codec->i4_num_cores == 2))
+    {
+        ps_codec->u4_nctb = 2;
+    }
+
+
+    ps_op->u4_error_code = 0;
+    return IV_SUCCESS;
+}
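+/* Worked example of the SOC byte packing described in the comments above
+ * (assuming the value passes the preceding range check): a raw u4_soc of
+ * 0x84 has bit 7 set, so u4_enable_fmt_conv_ahead = 1, and its low 7 bits
+ * are 0x84 & 0x7F = 4, so u4_nctb = 4.
+ */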
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Sets the number of cores that can be used in the codec. The codec uses
+* that many threads for decoding
+*
+* @par Description:
+*  Sets number of cores
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_set_num_cores(iv_obj_t *ps_codec_obj,
+                            void *pv_api_ip,
+                            void *pv_api_op)
+{
+    ihevcd_cxa_ctl_set_num_cores_ip_t *ps_ip;
+    ihevcd_cxa_ctl_set_num_cores_op_t *ps_op;
+    codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+    ps_ip = (ihevcd_cxa_ctl_set_num_cores_ip_t *)pv_api_ip;
+    ps_op = (ihevcd_cxa_ctl_set_num_cores_op_t *)pv_api_op;
+
+#ifdef MULTICORE
+    ps_codec->i4_num_cores = ps_ip->u4_num_cores;
+#else
+    ps_codec->i4_num_cores = 1;
+#endif
+    ps_op->u4_error_code = 0;
+    return IV_SUCCESS;
+}
+#ifdef GPU_BUILD
+/**
+*******************************************************************************
+*
+* @brief
+*  Enables or disables GPU in run-time
+*
+* @par Description:
+*
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_gpu_enable_disable(iv_obj_t *ps_codec_obj,
+                                 void *pv_api_ip,
+                                 void *pv_api_op)
+{
+    ihevcd_cxa_ctl_gpu_enable_diable_ip_t *ps_ip;
+    ihevcd_cxa_ctl_gpu_enable_diable_op_t *ps_op;
+    codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+    ps_ip = (ihevcd_cxa_ctl_gpu_enable_diable_ip_t *)pv_api_ip;
+    ps_op = (ihevcd_cxa_ctl_gpu_enable_diable_op_t *)pv_api_op;
+
+#ifndef FRAME_STAGGER_ONLY
+    ps_codec->u4_gpu_enabled = ps_ip->u4_gpu_enable_diable;
+#endif
+    ps_op->u4_error_code = 0;
+    return IV_SUCCESS;
+}
+#endif
+/**
+*******************************************************************************
+*
+* @brief
+*  Codec control call
+*
+* @par Description:
+*  Codec control call, which in turn invokes the appropriate handler based
+* on the subcommand
+*
+* @param[in] ps_codec_obj
+*  Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+*  Pointer to input argument structure
+*
+* @param[out] pv_api_op
+*  Pointer to output argument structure
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_ctl(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
+{
+    ivd_ctl_set_config_ip_t *ps_ctl_ip;
+    ivd_ctl_set_config_op_t *ps_ctl_op;
+    WORD32 ret = 0;
+    WORD32 subcommand;
+    codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+    ps_ctl_ip = (ivd_ctl_set_config_ip_t *)pv_api_ip;
+    ps_ctl_op = (ivd_ctl_set_config_op_t *)pv_api_op;
+
+    if(ps_codec->i4_init_done != 1)
+    {
+        ps_ctl_op->u4_error_code |= 1 << IVD_FATALERROR;
+        ps_ctl_op->u4_error_code |= IHEVCD_INIT_NOT_DONE;
+        return IV_FAIL;
+    }
+    subcommand = ps_ctl_ip->e_sub_cmd;
+
+    switch(subcommand)
+    {
+        case IVD_CMD_CTL_GETPARAMS:
+            ret = ihevcd_get_status(ps_codec_obj, (void *)pv_api_ip,
+                                    (void *)pv_api_op);
+            break;
+        case IVD_CMD_CTL_SETPARAMS:
+            ret = ihevcd_set_params(ps_codec_obj, (void *)pv_api_ip,
+                                    (void *)pv_api_op);
+            break;
+        case IVD_CMD_CTL_RESET:
+            ret = ihevcd_reset(ps_codec_obj, (void *)pv_api_ip,
+                               (void *)pv_api_op);
+            break;
+        case IVD_CMD_CTL_SETDEFAULT:
+        {
+            ivd_ctl_set_config_op_t *s_ctl_dynparams_op =
+                            (ivd_ctl_set_config_op_t *)pv_api_op;
+
+            ret = ihevcd_set_default_params(ps_codec);
+            if(IV_SUCCESS == ret)
+                s_ctl_dynparams_op->u4_error_code = 0;
+            break;
+        }
+        case IVD_CMD_CTL_FLUSH:
+            ret = ihevcd_set_flush_mode(ps_codec_obj, (void *)pv_api_ip,
+                                        (void *)pv_api_op);
+            break;
+        case IVD_CMD_CTL_GETBUFINFO:
+            ret = ihevcd_get_buf_info(ps_codec_obj, (void *)pv_api_ip,
+                                      (void *)pv_api_op);
+            break;
+        case IVD_CMD_CTL_GETVERSION:
+        {
+            ivd_ctl_getversioninfo_ip_t *ps_ip;
+            ivd_ctl_getversioninfo_op_t *ps_op;
+            ps_ip = (ivd_ctl_getversioninfo_ip_t *)pv_api_ip;
+            ps_op = (ivd_ctl_getversioninfo_op_t *)pv_api_op;
+
+            ps_op->u4_error_code = IV_SUCCESS;
+
+            if((WORD32)ps_ip->u4_version_buffer_size <= 0)
+            {
+                ps_op->u4_error_code = IHEVCD_CXA_VERS_BUF_INSUFFICIENT;
+                ret = IV_FAIL;
+            }
+            else
+            {
+                ret = ihevcd_get_version((CHAR *)ps_ip->pv_version_buffer,
+                                         ps_ip->u4_version_buffer_size);
+                if(ret != IV_SUCCESS)
+                {
+                    ps_op->u4_error_code = IHEVCD_CXA_VERS_BUF_INSUFFICIENT;
+                    ret = IV_FAIL;
+                }
+            }
+        }
+            break;
+        case IHEVCD_CXA_CMD_CTL_DEGRADE:
+            ret = ihevcd_set_degrade(ps_codec_obj, (void *)pv_api_ip,
+                            (void *)pv_api_op);
+            break;
+        case IHEVCD_CXA_CMD_CTL_SET_NUM_CORES:
+            ret = ihevcd_set_num_cores(ps_codec_obj, (void *)pv_api_ip,
+                                       (void *)pv_api_op);
+            break;
+        case IHEVCD_CXA_CMD_CTL_GET_BUFFER_DIMENSIONS:
+            ret = ihevcd_get_frame_dimensions(ps_codec_obj, (void *)pv_api_ip,
+                                              (void *)pv_api_op);
+            break;
+        case IHEVCD_CXA_CMD_CTL_GET_VUI_PARAMS:
+            ret = ihevcd_get_vui_params(ps_codec_obj, (void *)pv_api_ip,
+                                        (void *)pv_api_op);
+            break;
+        case IHEVCD_CXA_CMD_CTL_SET_PROCESSOR:
+            ret = ihevcd_set_processor(ps_codec_obj, (void *)pv_api_ip,
+                            (void *)pv_api_op);
+            break;
+#ifdef GPU_BUILD
+        case IHEVCD_CXA_CMD_CTL_GPU_ENABLE_DISABLE:
+            ret = ihevcd_gpu_enable_disable(ps_codec_obj, (void *)pv_api_ip,
+                            (void *)pv_api_op);
+            break;
+#endif
+        default:
+            DEBUG("\nDo nothing\n");
+            break;
+    }
+
+    return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Codec's entry point function. All calls to the codec are made through
+* this function, with different values specified in the command
+*
+* @par Description:
+*  Arguments are tested for validity, and then the appropriate function is
+* called based on the command
+*
+* @param[in] ps_handle
+*  API level handle for codec
+*
+* @param[in] pv_api_ip
+*  Input argument structure
+*
+* @param[out] pv_api_op
+*  Output argument structure
+*
+* @returns  Status of the function corresponding to command
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IV_API_CALL_STATUS_T ihevcd_cxa_api_function(iv_obj_t *ps_handle,
+                                             void *pv_api_ip,
+                                             void *pv_api_op)
+{
+    WORD32 command;
+    UWORD32 *pu4_ptr_cmd;
+    WORD32 ret = 0;
+    IV_API_CALL_STATUS_T e_status;
+    e_status = api_check_struct_sanity(ps_handle, pv_api_ip, pv_api_op);
+
+    if(e_status != IV_SUCCESS)
+    {
+        DEBUG("error code = %d\n", *((UWORD32 *)pv_api_op + 1));
+        return IV_FAIL;
+    }
+
+    pu4_ptr_cmd = (UWORD32 *)pv_api_ip;
+    pu4_ptr_cmd++;
+
+    command = *pu4_ptr_cmd;
+
+    switch(command)
+    {
+        case IV_CMD_GET_NUM_MEM_REC:
+            ret = ihevcd_get_num_rec((void *)pv_api_ip, (void *)pv_api_op);
+
+            break;
+        case IV_CMD_FILL_NUM_MEM_REC:
+
+            ret = ihevcd_fill_num_mem_rec((void *)pv_api_ip, (void *)pv_api_op);
+            break;
+        case IV_CMD_INIT:
+            ret = ihevcd_init_mem_rec(ps_handle, (void *)pv_api_ip,
+                                      (void *)pv_api_op);
+            break;
+
+        case IVD_CMD_VIDEO_DECODE:
+            ret = ihevcd_decode(ps_handle, (void *)pv_api_ip, (void *)pv_api_op);
+            break;
+
+        case IVD_CMD_GET_DISPLAY_FRAME:
+            //ret = ihevcd_get_display_frame(ps_handle,(void *)pv_api_ip,(void *)pv_api_op);
+            break;
+
+        case IVD_CMD_SET_DISPLAY_FRAME:
+            ret = ihevcd_set_display_frame(ps_handle, (void *)pv_api_ip,
+                                           (void *)pv_api_op);
+
+            break;
+
+        case IVD_CMD_REL_DISPLAY_FRAME:
+            ret = ihevcd_rel_display_frame(ps_handle, (void *)pv_api_ip,
+                                           (void *)pv_api_op);
+            break;
+
+        case IV_CMD_RETRIEVE_MEMREC:
+            ret = ihevcd_retrieve_memrec(ps_handle, (void *)pv_api_ip,
+                                         (void *)pv_api_op);
+            break;
+
+        case IVD_CMD_VIDEO_CTL:
+            ret = ihevcd_ctl(ps_handle, (void *)pv_api_ip, (void *)pv_api_op);
+            break;
+        default:
+            ret = IV_FAIL;
+            break;
+    }
+
+    return (IV_API_CALL_STATUS_T)ret;
+}
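+/* Typical sequence of commands issued through this entry point by an
+ * application (a sketch; structure setup and error handling elided):
+ *
+ *     1. IV_CMD_GET_NUM_MEM_REC   - query the number of memory records
+ *     2. IV_CMD_FILL_NUM_MEM_REC  - query size/alignment of each record
+ *     3. IV_CMD_INIT              - initialize with allocated records
+ *     4. IVD_CMD_VIDEO_CTL        - set parameters, num cores, etc.
+ *     5. IVD_CMD_VIDEO_DECODE     - one call per access unit
+ *     6. IV_CMD_RETRIEVE_MEMREC   - reclaim memory at teardown
+ */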
+
diff --git a/decoder/ihevcd_bitstream.c b/decoder/ihevcd_bitstream.c
new file mode 100644
index 0000000..be9addb
--- /dev/null
+++ b/decoder/ihevcd_bitstream.c
@@ -0,0 +1,580 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_bitstream.c
+*
+* @brief
+*  Contains functions for bitstream access
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+* - ihevcd_bits_init()
+* - ihevcd_bits_flush()
+* - ihevcd_bits_flush_to_byte_boundary()
+* - ihevcd_bits_seek()
+* - ihevcd_bits_nxt()
+* - ihevcd_bits_nxt32()
+* - ihevcd_bits_get()
+* - ihevcd_bits_num_bits_remaining()
+* - ihevcd_bits_num_bits_consumed()
+* - ihevcd_sev()
+* - ihevcd_uev()
+*
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_bitstream.h"
+
+/*****************************************************************************/
+/* Function Prototypes                                                       */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Function used for bitstream structure initialization
+*
+* @par Description:
+*  Initialize bitstream structure elements
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[in] pu1_buf
+*  Pointer to bitstream data
+*
+* @param[in] u4_numbytes
+*  Number of bytes in bitstream
+*
+* @returns  none
+*
+* @remarks
+*  Assumes pu1_buf is aligned to 4 bytes. If not aligned, all bitstream
+* accesses will be unaligned and hence costlier. Since this is codec memory
+* that holds data after emulation prevention removal, the assumption of
+* 4-byte alignment is valid
+*
+*******************************************************************************
+*/
+void ihevcd_bits_init(bitstrm_t *ps_bitstrm,
+                      UWORD8 *pu1_buf,
+                      UWORD32 u4_numbytes)
+{
+    UWORD32 u4_cur_word;
+    UWORD32 u4_nxt_word;
+    UWORD32 u4_temp;
+    UWORD32 *pu4_buf;
+
+    pu4_buf     = (UWORD32 *)pu1_buf;
+    u4_temp = *pu4_buf++;
+    u4_cur_word = ITT_BIG_ENDIAN(u4_temp);
+    u4_temp = *pu4_buf++;
+    u4_nxt_word = ITT_BIG_ENDIAN(u4_temp);
+
+    ps_bitstrm->u4_bit_ofst     = 0;
+    ps_bitstrm->pu1_buf_base    = pu1_buf;
+    ps_bitstrm->pu4_buf         = pu4_buf;
+    ps_bitstrm->u4_cur_word     = u4_cur_word;
+    ps_bitstrm->u4_nxt_word     = u4_nxt_word;
+
+    ps_bitstrm->pu1_buf_max     = pu1_buf + u4_numbytes + 8;
+
+    return;
+}
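+/* Initialization sketch (hypothetical names). pu1_buf_max is set 8 bytes
+ * past the payload to match the 8 bytes pre-read into cur_word/nxt_word,
+ * so the input buffer should have at least 8 addressable bytes of padding
+ * beyond u4_numbytes:
+ *
+ *     bitstrm_t s_bitstrm;
+ *     UWORD8 *pu1_bitstream;  // 4-byte aligned, emulation bytes removed
+ *     UWORD32 u4_num_bytes;   // payload size in bytes
+ *     ihevcd_bits_init(&s_bitstrm, pu1_bitstream, u4_num_bytes);
+ *     // e.g. read the first byte: ihevcd_bits_get(&s_bitstrm, 8);
+ */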
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Flushes given number of bits. Bits consumed increases by  this number
+*
+* @par Description:
+*  Increment bit offset by numbits. If bit offset increases  beyond 32, then
+* move nxt_word to cur_word, read next  word32 to nxt_word after endian
+* conversion
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[in] u4_numbits
+*  Number of bits to be flushed
+*
+* @returns  None
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+void ihevcd_bits_flush(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits)
+{
+
+    BITS_FLUSH(ps_bitstrm->pu4_buf,
+               ps_bitstrm->u4_bit_ofst,
+               ps_bitstrm->u4_cur_word,
+               ps_bitstrm->u4_nxt_word,
+               u4_numbits);
+
+    return;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Flushes to the next byte boundary. Bits consumed increases by this number
+*
+* @par Description:
+*  Compute the number of bits remaining in the current byte, then call
+* ihevcd_bits_flush() with this number
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @returns  None
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+void ihevcd_bits_flush_to_byte_boundary(bitstrm_t *ps_bitstrm)
+{
+    UWORD32 u4_numbits;
+    u4_numbits = (ps_bitstrm->u4_bit_ofst) & 7;
+
+    u4_numbits = 8 - u4_numbits;
+
+    BITS_FLUSH(ps_bitstrm->pu4_buf,
+               ps_bitstrm->u4_bit_ofst,
+               ps_bitstrm->u4_cur_word,
+               ps_bitstrm->u4_nxt_word,
+               u4_numbits);
+
+    return;
+}
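+/* Worked example: with u4_bit_ofst = 13, (13 & 7) = 5 bits of the current
+ * byte are already consumed, so 8 - 5 = 3 bits are flushed and the offset
+ * becomes 16. Note that when the offset is already byte aligned, (ofst & 7)
+ * is 0 and a full byte (8 bits) is flushed.
+ */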
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Seeks by given number of bits in the bitstream from current position
+*
+* @par Description:
+*  Add given number of bits to bitstream offset and update pu4_buf, cur_word and
+*  nxt_word accordingly
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[in] numbits
+*  Number of bits to seek
+*
+* @returns  None
+*
+* @remarks
+* Assumes emulation prevention has been done before and the buffer does not
+* contain any emulation prevention bytes
+*
+*******************************************************************************
+*/
+void ihevcd_bits_seek(bitstrm_t *ps_bitstrm, WORD32 numbits)
+{
+    WORD32 val;
+    ASSERT(numbits >= -32);
+    ASSERT(numbits <= 32);
+    /* Check if Seeking backwards*/
+    if(numbits < 0)
+    {
+        UWORD32 abs_numbits = -numbits;
+        if(ps_bitstrm->u4_bit_ofst >= abs_numbits)
+        {
+            /* If the current offset is greater than number of bits to seek back,
+             * then subtract abs_numbits from offset and return.
+             */
+            ps_bitstrm->u4_bit_ofst -= abs_numbits;
+            return;
+        }
+        else
+        {
+            /* If the current offset is lesser than number of bits to seek back,
+             * then subtract abs_numbits from offset and add 32 and move cur_word to nxt_word
+             * and load cur_word appropriately and decrement pu4_buf
+             */
+            ps_bitstrm->u4_bit_ofst -= abs_numbits;
+            ps_bitstrm->u4_bit_ofst += 32;
+            ps_bitstrm->pu4_buf--;
+
+            val = *(ps_bitstrm->pu4_buf - 2);
+            ps_bitstrm->u4_nxt_word = ps_bitstrm->u4_cur_word;
+            ps_bitstrm->u4_cur_word = ITT_BIG_ENDIAN(val);
+            return;
+        }
+    }
+    else
+    {
+        /* Not supported/tested currently */
+        ASSERT(0);
+        BITS_FLUSH(ps_bitstrm->pu4_buf,
+                   ps_bitstrm->u4_bit_ofst,
+                   ps_bitstrm->u4_cur_word,
+                   ps_bitstrm->u4_nxt_word,
+                   numbits);
+
+
+    }
+    return;
+}
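+/* Worked example of a backward seek across a word boundary: with
+ * u4_bit_ofst = 3, a call with numbits = -8 computes 3 - 8 + 32 = 27 as the
+ * new offset, steps pu4_buf back one word, shifts cur_word into nxt_word
+ * and reloads cur_word (endian converted) from pu4_buf - 2.
+ */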
+/**
+*******************************************************************************
+*
+* @brief
+*  Snoops for the next numbits bits from the bitstream; this neither updates
+* the bitstream offset nor consumes the bits
+*
+* @par Description:
+*  Extract the required number of bits from cur_word & nxt_word and return
+* these bits
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[in] u4_numbits
+*  Number of bits
+*
+* @returns  Next u4_numbits number of bits
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32 ihevcd_bits_nxt(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits)
+{
+    UWORD32 u4_bits_read;
+
+    BITS_NXT(u4_bits_read,
+             ps_bitstrm->pu4_buf,
+             ps_bitstrm->u4_bit_ofst,
+             ps_bitstrm->u4_cur_word,
+             ps_bitstrm->u4_nxt_word,
+             u4_numbits);
+    return u4_bits_read;
+}
+/**
+*******************************************************************************
+*
+* @brief
+*  Snoops for the next 32 bits from the bitstream; this neither updates the
+* bitstream offset nor consumes the bits
+*
+* @par Description:
+*  Extract the required number of bits from cur_word & nxt_word and return
+* these bits
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[in] u4_numbits
+*  Number of bits
+*
+* @returns  Next 32 bits
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32 ihevcd_bits_nxt32(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits)
+{
+    UWORD32 u4_bits_read;
+    UNUSED(u4_numbits);
+    BITS_NXT32(u4_bits_read,
+               ps_bitstrm->pu4_buf,
+               ps_bitstrm->u4_bit_ofst,
+               ps_bitstrm->u4_cur_word,
+               ps_bitstrm->u4_nxt_word);
+    return u4_bits_read;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Reads the next numbits bits from the bitstream; this updates the
+* bitstream offset and consumes the bits
+*
+* @par Description:
+*  Extract the required number of bits from cur_word & nxt_word and return
+* these bits
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[in] u4_numbits
+*  Number of bits
+*
+* @returns  Bits read
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32 ihevcd_bits_get(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits)
+{
+    UWORD32 u4_bits_read;
+
+    BITS_GET(u4_bits_read,
+             ps_bitstrm->pu4_buf,
+             ps_bitstrm->u4_bit_ofst,
+             ps_bitstrm->u4_cur_word,
+             ps_bitstrm->u4_nxt_word,
+             u4_numbits);
+    return u4_bits_read;
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Returns the number of bits remaining in the bitstream
+*
+* @par Description:
+*  Compute number of bits remaining based on current pointer and buffer base
+* and current offset. Since 8 bytes are  read at the start into cur_word and
+* nxt_word and are not  consumed, 8 has to be subtracted
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @returns  Total number of bits remaining
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32  ihevcd_bits_num_bits_remaining(bitstrm_t *ps_bitstrm)
+{
+    UWORD32 u4_bits_consumed;
+    UWORD32 u4_size_in_bits;
+
+    /* 8 bytes are read in cur_word and nxt_word at the start. Hence */
+    /* subtract 8 bytes */
+    u4_bits_consumed = (UWORD32)(((UWORD8 *)ps_bitstrm->pu4_buf -
+                                  (UWORD8 *)ps_bitstrm->pu1_buf_base - 8) <<
+                                 3) + ps_bitstrm->u4_bit_ofst;
+
+    /* Buffer size is in bytes; convert to bits before subtracting */
+    u4_size_in_bits = (UWORD32)(ps_bitstrm->pu1_buf_max -
+                    ps_bitstrm->pu1_buf_base) << 3;
+    return (u4_size_in_bits - u4_bits_consumed);
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Returns the number of bits consumed in the bitstream
+*
+* @par Description:
+*  Compute number of bits consumed based on current pointer  and buffer base
+* and current offset. Since 8 bytes are  read at the start into cur_word and
+* nxt_word and are not  consumed, 8 has to be subtracted
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @returns  Total number of bits consumed
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32  ihevcd_bits_num_bits_consumed(bitstrm_t *ps_bitstrm)
+{
+    UWORD32 u4_bits_consumed;
+    /* 8 bytes are read in cur_word and nxt_word at the start. Hence */
+    /* subtract 8 bytes */
+
+    u4_bits_consumed = (UWORD32)(((UWORD8 *)ps_bitstrm->pu4_buf -
+                                  (UWORD8 *)ps_bitstrm->pu1_buf_base - 8) <<
+                                 3) + ps_bitstrm->u4_bit_ofst;
+    return u4_bits_consumed;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Reads an unsigned integer 0-th order exp-Golomb-coded syntax element from
+* the bitstream. Section: 9.2
+*
+* @par Description:
+*  Extract the required number of bits from cur_word & nxt_word and return
+* these bits
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @returns  UEV decoded syntax element
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32 ihevcd_uev(bitstrm_t *ps_bitstrm)
+{
+    UWORD32 u4_bits_read;
+    UWORD32 u4_clz;
+
+
+    /***************************************************************/
+    /* Find leading zeros in next 32 bits                          */
+    /***************************************************************/
+    BITS_NXT32(u4_bits_read,
+               ps_bitstrm->pu4_buf,
+               ps_bitstrm->u4_bit_ofst,
+               ps_bitstrm->u4_cur_word,
+               ps_bitstrm->u4_nxt_word);
+
+
+    u4_clz = CLZ(u4_bits_read);
+
+    BITS_FLUSH(ps_bitstrm->pu4_buf,
+               ps_bitstrm->u4_bit_ofst,
+               ps_bitstrm->u4_cur_word,
+               ps_bitstrm->u4_nxt_word,
+               (u4_clz + 1));
+
+    u4_bits_read = 0;
+    if(u4_clz)
+    {
+        BITS_GET(u4_bits_read,
+                 ps_bitstrm->pu4_buf,
+                 ps_bitstrm->u4_bit_ofst,
+                 ps_bitstrm->u4_cur_word,
+                 ps_bitstrm->u4_nxt_word,
+                 u4_clz);
+    }
+    return ((1 << u4_clz) + u4_bits_read - 1);
+
+}
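+/* Worked example: for the bit pattern 00101..., the 32-bit snoop has 2
+ * leading zeros, so u4_clz = 2 and 3 bits (the "001" prefix) are flushed;
+ * the next 2 bits "01" read as 1, giving (1 << 2) + 1 - 1 = 4, which
+ * matches ue(v) = 4 for the codeword 00101.
+ */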
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Reads a signed integer 0-th order exp-Golomb-coded syntax element from the
+* bitstream. Function is similar to ihevcd_uev(). Section: 9.2.1
+*
+* @par Description:
+*  Extract the required number of bits from cur_word & nxt_word and return
+* these bits
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @returns  SEV decoded syntax element
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_sev(bitstrm_t *ps_bitstrm)
+{
+    UWORD32 u4_bits_read;
+    UWORD32 u4_clz;
+    UWORD32 u4_abs_val;
+
+
+    /***************************************************************/
+    /* Find leading zeros in next 32 bits                          */
+    /***************************************************************/
+    BITS_NXT32(u4_bits_read,
+               ps_bitstrm->pu4_buf,
+               ps_bitstrm->u4_bit_ofst,
+               ps_bitstrm->u4_cur_word,
+               ps_bitstrm->u4_nxt_word);
+
+
+    u4_clz = CLZ(u4_bits_read);
+
+    BITS_FLUSH(ps_bitstrm->pu4_buf,
+               ps_bitstrm->u4_bit_ofst,
+               ps_bitstrm->u4_cur_word,
+               ps_bitstrm->u4_nxt_word,
+               (u4_clz + 1));
+
+    u4_bits_read = 0;
+    if(u4_clz)
+    {
+        BITS_GET(u4_bits_read,
+                 ps_bitstrm->pu4_buf,
+                 ps_bitstrm->u4_bit_ofst,
+                 ps_bitstrm->u4_cur_word,
+                 ps_bitstrm->u4_nxt_word,
+                 u4_clz);
+    }
+    u4_abs_val = ((1 << u4_clz) + u4_bits_read) >> 1;
+    if(u4_bits_read & 0x1)
+        return (-(WORD32)u4_abs_val);
+    else
+        return (u4_abs_val);
+}
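+/* Worked example: for the bit pattern 00101..., u4_clz = 2 and the 2-bit
+ * suffix "01" reads as 1, so u4_abs_val = ((1 << 2) + 1) >> 1 = 2; the odd
+ * suffix selects the negative sign, giving -2. For 00100... the suffix is 0
+ * (even), giving +2. This matches the se(v) mapping where codeNum k decodes
+ * to (-1)^(k+1) * Ceil(k / 2).
+ */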
+
+
+
+
+
+
diff --git a/decoder/ihevcd_bitstream.h b/decoder/ihevcd_bitstream.h
new file mode 100644
index 0000000..907c934
--- /dev/null
+++ b/decoder/ihevcd_bitstream.h
@@ -0,0 +1,226 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_bitstream.h
+*
+* @brief
+*  Header for bitstream access functions
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_BITSTREAM_H_
+#define _IHEVCD_BITSTREAM_H_
+/**
+ *  @brief  defines the maximum number of bits in a bitstream word
+ */
+#define WORD_SIZE         32
+/**
+ *  @brief  Twice the WORD_SIZE
+ */
+#define DBL_WORD_SIZE     (2 * (WORD_SIZE))
+
+/**
+ *  @brief  WORD_SIZE - 1
+ */
+#define WORD_SIZE_MINUS1  (WORD_SIZE - 1)
+
+/**
+******************************************************************************
+* @brief Macro used to copy elements in bitstream structure to local variables.
+******************************************************************************
+*/
+
+#define GET_STREAM(m_ps_bitstrm, m_pu4_buf, m_u4_bit_ofst,  \
+                  m_u4_cur_word, m_u4_nxt_word)             \
+{                                                           \
+    m_pu4_buf            = m_ps_bitstrm->pu4_buf;           \
+    m_u4_bit_ofst        = m_ps_bitstrm->u4_bit_ofst;       \
+    m_u4_cur_word        = m_ps_bitstrm->u4_cur_word;       \
+    m_u4_nxt_word        = m_ps_bitstrm->u4_nxt_word;       \
+}
+
+/**
+******************************************************************************
+* @brief Macro used to copy local variables to elements in bitstream structure.
+******************************************************************************
+*/
+#define SET_STREAM(m_ps_bitstrm, m_pu4_buf, m_u4_bit_ofst,  \
+                  m_u4_cur_word, m_u4_nxt_word)             \
+{                                                           \
+    m_ps_bitstrm->pu4_buf       = m_pu4_buf;                \
+    m_ps_bitstrm->u4_bit_ofst   = m_u4_bit_ofst;            \
+    m_ps_bitstrm->u4_cur_word   = m_u4_cur_word;            \
+    m_ps_bitstrm->u4_nxt_word   = m_u4_nxt_word;            \
+}
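+/*
+ * Typical usage of the two macros above (editorial sketch): cache the
+ * bitstream state in local variables so a tight parsing loop works on
+ * registers, then write the state back once at the end.
+ *
+ *     UWORD32 *pu4_buf;
+ *     UWORD32 u4_bit_ofst, u4_cur_word, u4_nxt_word, u4_bits;
+ *     GET_STREAM(ps_bitstrm, pu4_buf, u4_bit_ofst, u4_cur_word, u4_nxt_word);
+ *     BITS_GET(u4_bits, pu4_buf, u4_bit_ofst, u4_cur_word, u4_nxt_word, 8);
+ *     SET_STREAM(ps_bitstrm, pu4_buf, u4_bit_ofst, u4_cur_word, u4_nxt_word);
+ */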
+
+
+
+/**
+******************************************************************************
+* @brief  Snoop the next m_cnt bits without updating offsets or buffer pointers.
+* Data is not consumed in this call
+******************************************************************************
+*/
+#define BITS_NXT(m_u4_bits, m_pu4_buf, m_u4_bit_ofst,       \
+                 m_u4_cur_word, m_u4_nxt_word, m_cnt)       \
+{                                                           \
+    m_u4_bits = (m_u4_cur_word << m_u4_bit_ofst)  >>        \
+                              (WORD_SIZE - m_cnt);          \
+                                                            \
+    if(m_u4_bit_ofst > (WORD_SIZE - m_cnt))                 \
+    {                                                       \
+        m_u4_bits |= SHR(m_u4_nxt_word,                     \
+                   (WORD_SIZE + WORD_SIZE - m_cnt           \
+                          - m_u4_bit_ofst));                \
+    }                                                       \
+}
+
+
+/**
+******************************************************************************
+*  @brief Snoop the next 32 bits without updating offsets or buffer pointers.
+* Data is not consumed in this call
+******************************************************************************
+*/
+#define BITS_NXT32(m_u4_bits, m_pu4_buf, m_u4_bit_ofst,             \
+                 m_u4_cur_word, m_u4_nxt_word)                      \
+{                                                                   \
+    m_u4_bits = (m_u4_cur_word << m_u4_bit_ofst);                   \
+                                                                    \
+    m_u4_bits |= SHR(m_u4_nxt_word, (WORD_SIZE - m_u4_bit_ofst));   \
+}
+
+
+/**
+******************************************************************************
+*  @brief  Flush m_cnt bits and update the buffer pointer.
+* Data is consumed
+******************************************************************************
+*/
+#define BITS_FLUSH(m_pu4_buf, m_u4_bit_ofst, m_u4_cur_word, \
+                    m_u4_nxt_word, m_cnt)                   \
+{                                                           \
+    UWORD32 temp;                                           \
+                                                            \
+    m_u4_bit_ofst += m_cnt;                                 \
+    if( m_u4_bit_ofst >=   WORD_SIZE )                      \
+    {                                                       \
+        m_u4_cur_word  = m_u4_nxt_word;                     \
+        /* Getting the next word */                         \
+        temp = *(m_pu4_buf++);                              \
+                                                            \
+        m_u4_bit_ofst -= WORD_SIZE;                         \
+        /* Convert from little endian to big endian */      \
+        m_u4_nxt_word = ITT_BIG_ENDIAN(temp);               \
+    }                                                       \
+}
+/**
+******************************************************************************
+*  @brief Get m_cnt bits and update buffer pointers and offset.
+* Data is consumed
+******************************************************************************
+*/
+#define BITS_GET(m_u4_bits, m_pu4_buf, m_u4_bit_ofst,           \
+                          m_u4_cur_word,m_u4_nxt_word, m_cnt)   \
+{                                                               \
+    m_u4_bits = (m_u4_cur_word << m_u4_bit_ofst)                \
+                             >> (WORD_SIZE - m_cnt);            \
+    m_u4_bit_ofst += m_cnt;                                     \
+    if(m_u4_bit_ofst > WORD_SIZE)                               \
+    {                                                           \
+        m_u4_bits |= SHR(m_u4_nxt_word,                         \
+                     (DBL_WORD_SIZE - m_u4_bit_ofst));          \
+    }                                                           \
+                                                                \
+    if( m_u4_bit_ofst >=   WORD_SIZE )                          \
+    {                                                           \
+        UWORD32 pu4_word_tmp;                                   \
+        m_u4_cur_word  = m_u4_nxt_word;                         \
+        /* Getting the next word */                             \
+        pu4_word_tmp = *(m_pu4_buf++);                          \
+                                                                \
+        m_u4_bit_ofst -= WORD_SIZE;                             \
+        /* Convert from little endian to big endian */          \
+        m_u4_nxt_word  = ITT_BIG_ENDIAN(pu4_word_tmp);          \
+    }                                                           \
+}
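+/*
+ * The BITS_NXT32 / BITS_FLUSH / BITS_GET trio supports the peek-then-
+ * consume pattern used by the exp-golomb readers above: peek 32 bits,
+ * count leading zeros with CLZ, flush the prefix, then BITS_GET the
+ * suffix. BITS_GET alone reads fixed-length fields; e.g. (editorial
+ * sketch)
+ *
+ *     BITS_GET(u4_bits, pu4_buf, u4_bit_ofst, u4_cur_word, u4_nxt_word, 4);
+ *
+ * returns the next 4 bits of the stream right-aligned in u4_bits.
+ */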
+
+/**
+******************************************************************************
+*  @brief Get 1 bit and update buffer pointers and offset.
+* Data is consumed
+******************************************************************************
+*/
+
+#define BIT_GET(m_u4_bits,m_pu4_buf,m_u4_bit_ofst,              \
+                          m_u4_cur_word,m_u4_nxt_word)          \
+{                                                               \
+    m_u4_bits = (m_u4_cur_word << m_u4_bit_ofst)                \
+                             >> (WORD_SIZE_MINUS1);             \
+    m_u4_bit_ofst++;                                            \
+                                                                \
+    if(m_u4_bit_ofst ==  WORD_SIZE)                             \
+    {                                                           \
+        UWORD32 pu4_word_tmp;                                   \
+        m_u4_cur_word  = m_u4_nxt_word;                         \
+        /* Getting the next word */                             \
+        pu4_word_tmp = *m_pu4_buf++;                            \
+                                                                \
+        m_u4_bit_ofst = 0;                                      \
+        /* Convert from little endian to big endian */          \
+        m_u4_nxt_word  = ITT_BIG_ENDIAN(pu4_word_tmp);          \
+    }                                                           \
+}
+
+void ihevcd_bits_init(bitstrm_t *ps_bitstrm,
+                      UWORD8 *pu1_buf,
+                      UWORD32 u4_numbytes);
+void ihevcd_bits_flush(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits);
+
+void ihevcd_bits_flush_to_byte_boundary(bitstrm_t *ps_bitstrm);
+
+UWORD32 ihevcd_bits_nxt(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits);
+
+UWORD32 ihevcd_bits_nxt32(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits);
+
+
+UWORD32 ihevcd_bits_get(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits);
+
+UWORD32  ihevcd_bits_num_bits_remaining(bitstrm_t *ps_bitstrm);
+
+
+UWORD32  ihevcd_bits_num_bits_consumed(bitstrm_t *ps_bitstrm);
+
+UWORD32 ihevcd_uev(bitstrm_t *ps_bitstrm);
+
+WORD32 ihevcd_sev(bitstrm_t *ps_bitstrm);
+
+void ihevcd_bits_seek(bitstrm_t *ps_bitstrm, WORD32 numbits);
+
+#endif /* _IHEVCD_BITSTREAM_H_ */
diff --git a/decoder/ihevcd_boundary_strength.c b/decoder/ihevcd_boundary_strength.c
new file mode 100644
index 0000000..9451e70
--- /dev/null
+++ b/decoder/ihevcd_boundary_strength.c
@@ -0,0 +1,1008 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_boundary_strength.c
+ *
+ * @brief
+ *  Contains functions for computing boundary strength
+ *
+ * @author
+ *  Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_profile.h"
+
+/*****************************************************************************/
+/* Function Prototypes                                                       */
+/*****************************************************************************/
+
+
+#define SET_NGBHR_ALL_AVAIL(avail)          avail = 0x1F;
+
+#define SET_NGBHR_BOTLEFT_NOTAVAIL(avail)   avail &= ~0x10;
+#define SET_NGBHR_LEFT_NOTAVAIL(avail)      avail &= ~0x8;
+#define SET_NGBHR_TOPLEFT_NOTAVAIL(avail)   avail &= ~0x4;
+#define SET_NGBHR_TOP_NOTAVAIL(avail)       avail &= ~0x2;
+#define SET_NGBHR_TOPRIGHT_NOTAVAIL(avail)  avail &= ~0x1;
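+/* Neighbor availability bit layout assumed by the macros above:
+ * bit 4 - bottom left, bit 3 - left, bit 2 - top left, bit 1 - top,
+ * bit 0 - top right. SET_NGBHR_ALL_AVAIL sets all five bits. */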
+
+WORD32 ihevcd_pu_boundary_strength(pu_t *ps_pu,
+                                   pu_t *ps_ngbr_pu)
+{
+    WORD32 i4_bs;
+    UWORD32 l0_ref_pic_buf_id, l1_ref_pic_buf_id;
+    UWORD32 ngbr_l0_ref_pic_buf_id, ngbr_l1_ref_pic_buf_id;
+
+    WORD16 i2_mv_x0, i2_mv_y0, i2_mv_x1, i2_mv_y1;
+    WORD16 i2_ngbr_mv_x0, i2_ngbr_mv_y0, i2_ngbr_mv_x1, i2_ngbr_mv_y1;
+
+    WORD32 num_mv, ngbr_num_mv;
+
+    num_mv = (PRED_BI == ps_pu->b2_pred_mode) ? 2 : 1;
+    ngbr_num_mv = (PRED_BI == ps_ngbr_pu->b2_pred_mode) ? 2 : 1;
+
+    l0_ref_pic_buf_id = ps_pu->mv.i1_l0_ref_pic_buf_id;
+    l1_ref_pic_buf_id = ps_pu->mv.i1_l1_ref_pic_buf_id;
+    ngbr_l0_ref_pic_buf_id = ps_ngbr_pu->mv.i1_l0_ref_pic_buf_id;
+    ngbr_l1_ref_pic_buf_id = ps_ngbr_pu->mv.i1_l1_ref_pic_buf_id;
+
+
+    i2_mv_x0 = ps_pu->mv.s_l0_mv.i2_mvx;
+    i2_mv_y0 = ps_pu->mv.s_l0_mv.i2_mvy;
+    i2_mv_x1 = ps_pu->mv.s_l1_mv.i2_mvx;
+    i2_mv_y1 = ps_pu->mv.s_l1_mv.i2_mvy;
+
+    i2_ngbr_mv_x0 = ps_ngbr_pu->mv.s_l0_mv.i2_mvx;
+    i2_ngbr_mv_y0 = ps_ngbr_pu->mv.s_l0_mv.i2_mvy;
+    i2_ngbr_mv_x1 = ps_ngbr_pu->mv.s_l1_mv.i2_mvx;
+    i2_ngbr_mv_y1 = ps_ngbr_pu->mv.s_l1_mv.i2_mvy;
+
+
+    /* If two motion vectors are used */
+    if((2 == num_mv) &&
+            (2 == ngbr_num_mv))
+    {
+        if((l0_ref_pic_buf_id == ngbr_l0_ref_pic_buf_id && l1_ref_pic_buf_id == ngbr_l1_ref_pic_buf_id) ||
+                (l0_ref_pic_buf_id == ngbr_l1_ref_pic_buf_id && l1_ref_pic_buf_id == ngbr_l0_ref_pic_buf_id))
+        {
+            if(l0_ref_pic_buf_id != l1_ref_pic_buf_id) /* Different L0 and L1 */
+            {
+                if(l0_ref_pic_buf_id == ngbr_l0_ref_pic_buf_id)
+                {
+                    i4_bs = (ABS(i2_mv_x0 - i2_ngbr_mv_x0) < 4) &&
+                            (ABS(i2_mv_y0 - i2_ngbr_mv_y0) < 4) &&
+                            (ABS(i2_mv_x1 - i2_ngbr_mv_x1) < 4) &&
+                            (ABS(i2_mv_y1 - i2_ngbr_mv_y1) < 4) ? 0 : 1;
+                }
+                else
+                {
+                    i4_bs = (ABS(i2_mv_x0 - i2_ngbr_mv_x1) < 4) &&
+                            (ABS(i2_mv_y0 - i2_ngbr_mv_y1) < 4) &&
+                            (ABS(i2_mv_x1 - i2_ngbr_mv_x0) < 4) &&
+                            (ABS(i2_mv_y1 - i2_ngbr_mv_y0) < 4) ? 0 : 1;
+                }
+            }
+            else /* Same L0 and L1 */
+            {
+                i4_bs = ((ABS(i2_mv_x0 - i2_ngbr_mv_x0) >= 4) ||
+                         (ABS(i2_mv_y0 - i2_ngbr_mv_y0) >= 4) ||
+                         (ABS(i2_mv_x1 - i2_ngbr_mv_x1) >= 4) ||
+                         (ABS(i2_mv_y1 - i2_ngbr_mv_y1) >= 4)) &&
+                                ((ABS(i2_mv_x0 - i2_ngbr_mv_x1) >= 4) ||
+                                 (ABS(i2_mv_y0 - i2_ngbr_mv_y1) >= 4) ||
+                                 (ABS(i2_mv_x1 - i2_ngbr_mv_x0) >= 4) ||
+                                 (ABS(i2_mv_y1 - i2_ngbr_mv_y0) >= 4)) ? 1 : 0;
+            }
+        }
+        else /* If the reference pictures used are different */
+        {
+            i4_bs = 1;
+        }
+    }
+
+    /* If one motion vector is used in both PUs */
+    else if((1 == num_mv) &&
+            (1 == ngbr_num_mv))
+    {
+        WORD16 i2_mv_x, i2_mv_y;
+        WORD16 i2_ngbr_mv_x, i2_ngbr_mv_y;
+        UWORD32 ref_pic_buf_id, ngbr_ref_pic_buf_id;
+
+        if(PRED_L0 == ps_pu->b2_pred_mode)
+        {
+            i2_mv_x = i2_mv_x0;
+            i2_mv_y = i2_mv_y0;
+            ref_pic_buf_id = l0_ref_pic_buf_id;
+        }
+        else
+        {
+            i2_mv_x = i2_mv_x1;
+            i2_mv_y = i2_mv_y1;
+            ref_pic_buf_id = l1_ref_pic_buf_id;
+        }
+
+        if(PRED_L0 == ps_ngbr_pu->b2_pred_mode)
+        {
+            i2_ngbr_mv_x = i2_ngbr_mv_x0;
+            i2_ngbr_mv_y = i2_ngbr_mv_y0;
+            ngbr_ref_pic_buf_id = ngbr_l0_ref_pic_buf_id;
+        }
+        else
+        {
+            i2_ngbr_mv_x = i2_ngbr_mv_x1;
+            i2_ngbr_mv_y = i2_ngbr_mv_y1;
+            ngbr_ref_pic_buf_id = ngbr_l1_ref_pic_buf_id;
+        }
+
+        i4_bs = (ref_pic_buf_id == ngbr_ref_pic_buf_id) &&
+                (ABS(i2_mv_x - i2_ngbr_mv_x) < 4)  &&
+                (ABS(i2_mv_y - i2_ngbr_mv_y) < 4) ? 0 : 1;
+    }
+
+    /* If the no. of motion vectors is not the same */
+    else
+    {
+        i4_bs = 1;
+    }
+
+
+    return i4_bs;
+}
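+/*
+ * Summary of the inter-PU decision above (editorial note): BS is 0 only
+ * when both PUs use the same set of reference pictures and every
+ * corresponding MV component differs by less than 4 (one pixel in
+ * quarter-pel units); it is 1 when the reference pictures differ, the
+ * number of MVs differs, or any MV component differs by 4 or more.
+ * Intra PUs are handled by the callers, which set BS to 2 directly.
+ */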
+
+/* QP is also populated in the same function */
+WORD32 ihevcd_ctb_boundary_strength_islice(bs_ctxt_t *ps_bs_ctxt)
+{
+    pps_t *ps_pps;
+    sps_t *ps_sps;
+    tu_t *ps_tu;
+    UWORD32 *pu4_vert_bs;
+    UWORD32 *pu4_horz_bs;
+    WORD32 vert_bs_strd;
+    WORD32 horz_bs_strd;
+    WORD32 vert_bs0_tmp;
+    WORD32 horz_bs0_tmp;
+    UWORD8 *pu1_qp;
+    WORD32 qp_strd;
+    UWORD32 u4_qp_const_in_ctb;
+    WORD32 ctb_indx;
+    WORD32 i4_tu_cnt;
+    WORD32 log2_ctb_size;
+    WORD32 ctb_size;
+
+    WORD8 i1_loop_filter_across_tiles_enabled_flag;
+    WORD8 i1_loop_filter_across_slices_enabled_flag;
+
+    WORD32 i;
+
+    PROFILE_DISABLE_BOUNDARY_STRENGTH();
+
+    ps_pps = ps_bs_ctxt->ps_pps;
+    ps_sps = ps_bs_ctxt->ps_sps;
+    i1_loop_filter_across_tiles_enabled_flag = ps_pps->i1_loop_filter_across_tiles_enabled_flag;
+    i1_loop_filter_across_slices_enabled_flag = ps_bs_ctxt->ps_slice_hdr->i1_slice_loop_filter_across_slices_enabled_flag;
+    i4_tu_cnt = ps_bs_ctxt->i4_ctb_tu_cnt;
+
+    log2_ctb_size = ps_sps->i1_log2_ctb_size;
+    ctb_size = (1 << log2_ctb_size);
+
+    /* strides are in units of number of bytes */
+    /* ctb_size * ctb_size / 8 / 16 is the number of bytes needed per CTB */
+    vert_bs_strd = ps_sps->i2_pic_wd_in_ctb << (2 * log2_ctb_size - 7);
+    horz_bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) << (2 * log2_ctb_size - 7);
+    pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_bs_ctxt->pu4_pic_vert_bs +
+                    (ps_bs_ctxt->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+                    ps_bs_ctxt->i4_ctb_y * vert_bs_strd);
+    pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_bs_ctxt->pu4_pic_horz_bs +
+                    (ps_bs_ctxt->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+                    ps_bs_ctxt->i4_ctb_y * horz_bs_strd);
+
+    /* ctb_size/8 elements per CTB */
+    qp_strd = ps_sps->i2_pic_wd_in_ctb << (log2_ctb_size - 3);
+    pu1_qp = ps_bs_ctxt->pu1_pic_qp + ((ps_bs_ctxt->i4_ctb_x + ps_bs_ctxt->i4_ctb_y * qp_strd) << (log2_ctb_size - 3));
+
+    ctb_indx = ps_bs_ctxt->i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_bs_ctxt->i4_ctb_y;
+    u4_qp_const_in_ctb = ps_bs_ctxt->pu1_pic_qp_const_in_ctb[ctb_indx >> 3] & (1 << (ctb_indx & 7));
+
+    vert_bs0_tmp = pu4_vert_bs[0] & (0xFFFFFFFF >> (sizeof(UWORD32) * 8 - ctb_size / 2));
+    horz_bs0_tmp = pu4_horz_bs[0] & (0xFFFFFFFF >> (sizeof(UWORD32) * 8 - ctb_size / 2));
+
+    /* ctb_size/8 is the number of edges per CTB
+     * ctb_size/4 is the number of BS values needed per edge
+     * divided by 8 for the number of bytes
+     * 2 is the number of bits needed for each BS value */
+/*
+    memset(pu4_vert_bs, 0, (ctb_size / 8 + 1) * (ctb_size / 4) / 8 * 2 );
+    memset(pu4_horz_bs, 0, (ctb_size / 8) * (ctb_size / 4) / 8 * 2 );
+*/
+    memset(pu4_vert_bs, 0, (1 << (2 * log2_ctb_size - 7)) + ((ctb_size >> 5) << 1));
+    memset(pu4_horz_bs, 0, (1 << (2 * log2_ctb_size - 7)));
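+    /* Worked example (editorial): for ctb_size = 32, log2_ctb_size = 5,
+     * the vertical memset covers (1 << 3) + ((32 >> 5) << 1) = 10 bytes,
+     * i.e. (32/8 + 1) edges * (32/4) BS values * 2 bits / 8, and the
+     * horizontal one covers 8 bytes. */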
+
+    /* pu4_vert_bs[0] has information about the left CTB which is not required when ctb_x = 0 */
+    if(0 != ps_bs_ctxt->i4_ctb_x)
+    {
+        pu4_vert_bs[0] |= vert_bs0_tmp;
+    }
+
+    /* pu4_horz_bs[0] has information about the top CTB which is not required when ctb_y = 0 */
+    if(0 != ps_bs_ctxt->i4_ctb_y)
+    {
+        pu4_horz_bs[0] |= horz_bs0_tmp;
+    }
+
+    ps_tu = ps_bs_ctxt->ps_tu;
+
+    /* Populating the QP array - if const_qp_in_ctb flag is one, set only the first element */
+    if(u4_qp_const_in_ctb)
+        pu1_qp[0] = ps_tu->b7_qp;
+
+    for(i = 0; i < i4_tu_cnt; i++)
+    {
+        WORD32 start_pos_x;
+        WORD32 start_pos_y;
+        WORD32 tu_size;
+
+
+        UWORD32 u4_bs;
+        ps_tu = ps_bs_ctxt->ps_tu + i;
+
+        /* start_pos_x and start_pos_y are in units of min TU size (4x4) */
+        start_pos_x = ps_tu->b4_pos_x;
+        start_pos_y = ps_tu->b4_pos_y;
+
+        tu_size = 1 << (ps_tu->b3_size + 2);
+        tu_size >>= 2; /* TU size divided by 4 */
+
+        u4_bs = DUP_LSB_10(tu_size);
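+        /* Editorial note: DUP_LSB_10 is assumed to replicate the 2-bit
+         * pattern '10' (BS = 2) once per 4-pel position, so for an 8x8 TU
+         * (tu_size = 2 here) u4_bs = 0xA. In an I slice every TU edge is
+         * an intra edge, hence BS 2. */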
+
+        /* Only if the current edge falls on 8 pixel grid set BS */
+        if(0 == (start_pos_x & 1))
+        {
+            WORD32 shift;
+            shift = start_pos_y * 2;
+            /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+             *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+             *  and deblocking is done on 8x8 grid
+             */
+            if(6 != log2_ctb_size)
+                shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+            pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+        }
+        /* Only if the current edge falls on 8 pixel grid set BS */
+        if(0 == (start_pos_y & 1))
+        {
+            WORD32 shift;
+            shift = start_pos_x * 2;
+            /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+             *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+             *  and deblocking is done on 8x8 grid
+             */
+            if(6 != log2_ctb_size)
+                shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+            pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+        }
+
+        /* Populating the QP array */
+        if(0 == u4_qp_const_in_ctb)
+        {
+            if(0 == (start_pos_x & 1) && 0 == (start_pos_y & 1))
+            {
+                WORD32 row, col;
+                for(row = start_pos_y; row < start_pos_y + tu_size; row += 2)
+                {
+                    for(col = start_pos_x; col < start_pos_x + tu_size; col += 2)
+                    {
+                        pu1_qp[(row >> 1) * qp_strd + (col >> 1)] = ps_tu->b7_qp;
+                    }
+                }
+            }
+        }
+
+    }
+    {
+        /* Determine if the slice is dependent, and if its left neighbor belongs to the same slice but lies in a different tile */
+        UWORD32 ctb_addr;
+        WORD32 slice_idx, left_slice_idx = -1, top_slice_idx = -1;
+        /* If left neighbor is not available, then set BS for entire first column to zero */
+        if(!ps_pps->i1_tiles_enabled_flag)
+        {
+            if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_x) ||
+                            (0 == i1_loop_filter_across_slices_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_slice_y) ||
+                            (0 == ps_bs_ctxt->i4_ctb_x))
+            {
+                pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+            }
+        }
+        else
+        {
+            //If across-tiles is disabled
+            if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_x))
+            {
+                pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+            }
+            else
+            {
+                ctb_addr = ps_bs_ctxt->i4_ctb_x + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+                slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+                if(ps_bs_ctxt->i4_ctb_x)
+                {
+                    ctb_addr = (ps_bs_ctxt->i4_ctb_x - 1) + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+                    left_slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+                }
+                /*If the 1st slice in a new tile is a dependent slice*/
+                if(!((ps_bs_ctxt->ps_slice_hdr->i1_dependent_slice_flag == 1) && (slice_idx == left_slice_idx)))
+                {
+                    if((0 == i1_loop_filter_across_slices_enabled_flag && (
+                                    (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_slice_y) || (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_tile_x) ||
+                                    ((0 == ps_bs_ctxt->i4_ctb_tile_x) && (slice_idx != left_slice_idx)))) ||
+                                    (0 == ps_bs_ctxt->i4_ctb_x))
+                    {
+                        pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+                    }
+                }
+            }
+        }
+
+        ctb_addr = ps_bs_ctxt->i4_ctb_x + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+        slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+        if(ps_bs_ctxt->i4_ctb_y)
+        {
+            ctb_addr = (ps_bs_ctxt->i4_ctb_x) + ((ps_bs_ctxt->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb);
+            top_slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+        }
+
+        /* If top neighbor is not available, then set BS for entire first row to zero */
+        if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_y)
+                        || (0 == i1_loop_filter_across_slices_enabled_flag && ((0 == ps_bs_ctxt->i4_ctb_slice_y) || (slice_idx != top_slice_idx)))
+                        || (0 == ps_bs_ctxt->i4_ctb_y))
+        {
+            pu4_horz_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+        }
+    }
+
+    /**
+     *  Set BS of bottom and right frame boundaries to zero if it is an incomplete CTB
+     *   (They might have been set to non-zero values because of the CBF of the current CTB)
+     *   This block might not be needed for I slices */
+    {
+        WORD32 num_rows_remaining = (ps_sps->i2_pic_height_in_luma_samples - (ps_bs_ctxt->i4_ctb_y << log2_ctb_size)) >> 3;
+        WORD32 num_cols_remaining = (ps_sps->i2_pic_width_in_luma_samples - (ps_bs_ctxt->i4_ctb_x << log2_ctb_size)) >> 3;
+        if(num_rows_remaining < (ctb_size >> 3))
+        {
+            /* WORD32 offset = (((num_rows_remaining >> 3) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 4));
+             *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+             *  and deblocking is done on 8x8 grid
+             */
+            WORD32 offset;
+            offset = (num_rows_remaining >> (6 - log2_ctb_size)) << 2;
+            if(6 != log2_ctb_size)
+                offset += (num_rows_remaining & 1) << (log2_ctb_size - 4);
+
+            memset(((UWORD8 *)pu4_horz_bs) + offset, 0, 1 << (log2_ctb_size - 4));
+        }
+
+        if(num_cols_remaining < (ctb_size >> 3))
+        {
+            /* WORD32 offset = (((num_cols_remaining >> 3) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 4));
+             *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+             *  and deblocking is done on 8x8 grid
+             */
+
+            WORD32 offset;
+            offset = (num_cols_remaining >> (6 - log2_ctb_size)) << 2;
+            if(6 != log2_ctb_size)
+                offset += (num_cols_remaining & 1) << (log2_ctb_size - 4);
+
+            memset(((UWORD8 *)pu4_vert_bs) + offset, 0, 1 << (log2_ctb_size - 4));
+        }
+    }
+
+    return 0;
+}
+WORD32 ihevcd_ctb_boundary_strength_pbslice(bs_ctxt_t *ps_bs_ctxt)
+{
+    sps_t *ps_sps;
+    pps_t *ps_pps;
+    WORD32 cur_ctb_idx, next_ctb_idx = 0;
+    WORD32 i4_tu_cnt;
+    WORD32 i4_pu_cnt;
+    tu_t *ps_tu;
+
+    UWORD32 *pu4_vert_bs;
+    UWORD32 *pu4_horz_bs;
+    WORD32 vert_bs_strd;
+    WORD32 horz_bs_strd;
+    WORD32 vert_bs0_tmp;
+    WORD32 horz_bs0_tmp;
+    UWORD8 *pu1_qp;
+    WORD32 qp_strd;
+    UWORD32 u4_qp_const_in_ctb;
+    WORD32 ctb_indx;
+    WORD32 log2_ctb_size;
+    WORD32 ctb_size;
+
+    WORD32 i;
+    WORD8 i1_loop_filter_across_tiles_enabled_flag;
+    WORD8 i1_loop_filter_across_slices_enabled_flag;
+
+    PROFILE_DISABLE_BOUNDARY_STRENGTH();
+
+    ps_sps = ps_bs_ctxt->ps_sps;
+    ps_pps = ps_bs_ctxt->ps_pps;
+
+    log2_ctb_size = ps_sps->i1_log2_ctb_size;
+    ctb_size = (1 << log2_ctb_size);
+
+    /* strides are in units of number of bytes */
+    /* ctb_size * ctb_size / 8 / 16 is the number of bytes needed per CTB */
+    vert_bs_strd = ps_sps->i2_pic_wd_in_ctb << (2 * log2_ctb_size - 7);
+    horz_bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) << (2 * log2_ctb_size - 7);
+    pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_bs_ctxt->pu4_pic_vert_bs +
+                    (ps_bs_ctxt->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+                    ps_bs_ctxt->i4_ctb_y * vert_bs_strd);
+    pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_bs_ctxt->pu4_pic_horz_bs +
+                    (ps_bs_ctxt->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+                    ps_bs_ctxt->i4_ctb_y * horz_bs_strd);
+
+    vert_bs0_tmp = pu4_vert_bs[0] & (0xFFFFFFFF >> (sizeof(UWORD32) * 8 - ctb_size / 2));
+    horz_bs0_tmp = pu4_horz_bs[0] & (0xFFFFFFFF >> (sizeof(UWORD32) * 8 - ctb_size / 2));
+
+    ps_tu = ps_bs_ctxt->ps_tu;
+
+    /* ctb_size/8 elements per CTB */
+    qp_strd = ps_sps->i2_pic_wd_in_ctb << (log2_ctb_size - 3);
+    pu1_qp = ps_bs_ctxt->pu1_pic_qp + ((ps_bs_ctxt->i4_ctb_x + ps_bs_ctxt->i4_ctb_y * qp_strd) << (log2_ctb_size - 3));
+
+    ctb_indx = ps_bs_ctxt->i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_bs_ctxt->i4_ctb_y;
+    u4_qp_const_in_ctb = ps_bs_ctxt->pu1_pic_qp_const_in_ctb[ctb_indx >> 3] & (1 << (ctb_indx & 7));
+
+    i1_loop_filter_across_tiles_enabled_flag = ps_pps->i1_loop_filter_across_tiles_enabled_flag;
+    i1_loop_filter_across_slices_enabled_flag = ps_bs_ctxt->ps_slice_hdr->i1_slice_loop_filter_across_slices_enabled_flag;
+
+    /* ctb_size/8 is the number of edges per CTB
+     * ctb_size/4 is the number of BS values needed per edge
+     * divided by 8 for the number of bytes
+     * 2 is the number of bits needed for each BS value */
+/*
+    memset(pu4_vert_bs, 0, (ctb_size / 8 + 1) * (ctb_size / 4) * 2 / 8 );
+    memset(pu4_horz_bs, 0, (ctb_size / 8) * (ctb_size / 4) * 2 / 8 );
+*/
+    memset(pu4_vert_bs, 0, (1 << (2 * log2_ctb_size - 7)) + (ctb_size >> 4));
+    memset(pu4_horz_bs, 0, (1 << (2 * log2_ctb_size - 7)));
+
+    /* pu4_vert_bs[0] has information about the left CTB which is not required when ctb_x = 0 */
+    if(0 != ps_bs_ctxt->i4_ctb_x)
+    {
+        pu4_vert_bs[0] |= vert_bs0_tmp;
+    }
+
+    /* pu4_horz_bs[0] has information about the top CTB which is not required when ctb_y = 0 */
+    if(0 != ps_bs_ctxt->i4_ctb_y)
+    {
+        pu4_horz_bs[0] |= horz_bs0_tmp;
+    }
+    /* pu4_horz_bs[horz_bs_strd / 4] corresponds to pu4_horz_bs[0] of the bottom CTB */
+    *(UWORD32 *)((UWORD8 *)pu4_horz_bs + horz_bs_strd) = 0;
+
+    cur_ctb_idx = ps_bs_ctxt->i4_ctb_x
+                    + ps_bs_ctxt->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+    next_ctb_idx = ps_bs_ctxt->i4_next_tu_ctb_cnt;
+    if(1 == ps_bs_ctxt->ps_codec->i4_num_cores)
+    {
+        i4_tu_cnt = ps_bs_ctxt->pu4_pic_tu_idx[next_ctb_idx] - ps_bs_ctxt->pu4_pic_tu_idx[cur_ctb_idx % RESET_TU_BUF_NCTB];
+    }
+    else
+    {
+        i4_tu_cnt = ps_bs_ctxt->pu4_pic_tu_idx[next_ctb_idx] - ps_bs_ctxt->pu4_pic_tu_idx[cur_ctb_idx];
+    }
+
+    ps_tu = ps_bs_ctxt->ps_tu;
+    if(u4_qp_const_in_ctb)
+        pu1_qp[0] = ps_tu->b7_qp;
+
+    /* For all TUs in the CTB, for left and top edges, check if there are coded coefficients on either side of the edge */
+    for(i = 0; i < i4_tu_cnt; i++)
+    {
+        WORD32 start_pos_x;
+        WORD32 start_pos_y;
+        WORD32 end_pos_x;
+        WORD32 end_pos_y;
+        WORD32 tu_size;
+        UWORD32 u4_bs;
+        WORD32 intra_flag;
+        UWORD8 *pu1_pic_intra_flag;
+
+        ps_tu = ps_bs_ctxt->ps_tu + i;
+
+        start_pos_x = ps_tu->b4_pos_x;
+        start_pos_y = ps_tu->b4_pos_y;
+
+        tu_size = 1 << (ps_tu->b3_size + 2);
+        tu_size >>= 2;
+
+        end_pos_x = start_pos_x + tu_size;
+        end_pos_y = start_pos_y + tu_size;
+
+        {
+            WORD32 tu_abs_x = (ps_bs_ctxt->i4_ctb_x << log2_ctb_size) + (start_pos_x << 2);
+            WORD32 tu_abs_y = (ps_bs_ctxt->i4_ctb_y << log2_ctb_size) + (start_pos_y << 2);
+
+            WORD32 numbytes_row =  (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+
+            pu1_pic_intra_flag = ps_bs_ctxt->ps_codec->pu1_pic_intra_flag;
+            pu1_pic_intra_flag += (tu_abs_y >> 3) * numbytes_row;
+            pu1_pic_intra_flag += (tu_abs_x >> 6);
+
+            intra_flag = *pu1_pic_intra_flag;
+            intra_flag &= (1 << ((tu_abs_x >> 3) % 8));
+        }
+        if(intra_flag)
+        {
+            u4_bs = DUP_LSB_10(tu_size);
+
+            /* Only if the current edge falls on 8 pixel grid set BS */
+            if(0 == (start_pos_x & 1))
+            {
+                WORD32 shift;
+                shift = start_pos_y * 2;
+                /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+                 *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+                 *  and deblocking is done on 8x8 grid
+                 */
+                if(6 != log2_ctb_size)
+                    shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+                pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+            }
+            /* Only if the current edge falls on 8 pixel grid set BS */
+            if(0 == (start_pos_y & 1))
+            {
+                WORD32 shift;
+                shift = start_pos_x * 2;
+                /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+                 *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+                 *  and deblocking is done on 8x8 grid
+                 */
+                if(6 != log2_ctb_size)
+                    shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+                pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+            }
+        }
+
+
+        /* If the current TU is coded then set both top edge and left edge BS to 1 and go to next TU */
+        if(ps_tu->b1_y_cbf)
+        {
+            u4_bs = DUP_LSB_01(tu_size);
+
+            /* Only if the current edge falls on 8 pixel grid set BS */
+            if(0 == (start_pos_x & 1))
+            {
+                WORD32 shift;
+                shift = start_pos_y * 2;
+                /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+                 *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+                 *  and deblocking is done on 8x8 grid
+                 */
+                if(6 != log2_ctb_size)
+                    shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+                pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+            }
+            /* Only if the current edge falls on 8 pixel grid set BS */
+            if(0 == (start_pos_y & 1))
+            {
+                WORD32 shift;
+                shift = start_pos_x * 2;
+                /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+                 *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+                 *  and deblocking is done on 8x8 grid
+                 */
+                if(6 != log2_ctb_size)
+                    shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+                pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+            }
+            /* Only if the current edge falls on 8 pixel grid set BS */
+            if(0 == (end_pos_x & 1))
+            {
+                if(!(ctb_size / 8 == (end_pos_x >> 1) && ps_bs_ctxt->i4_ctb_x == ps_sps->i2_pic_wd_in_ctb - 1))
+                {
+                    WORD32 shift;
+                    shift = start_pos_y * 2;
+                    shift += (((end_pos_x >> 1) & ((MAX_CTB_SIZE >> log2_ctb_size) - 1)) << (log2_ctb_size - 1));
+                    pu4_vert_bs[end_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+                }
+            }
+            /* Only if the current edge falls on 8 pixel grid set BS */
+            if(0 == (end_pos_y & 1))
+            {
+                /* If end_pos_y corresponds to the bottom of the CTB, write to pu4_horz_bs[0] of the bottom CTB */
+                if(ctb_size / 8 == (end_pos_y >> 1))
+                {
+                    *(UWORD32 *)((UWORD8 *)pu4_horz_bs + horz_bs_strd) |= (u4_bs << (start_pos_x * 2));
+                }
+                else
+                {
+                    WORD32 shift;
+                    shift = start_pos_x * 2;
+                    shift += (((end_pos_y >> 1) & ((MAX_CTB_SIZE >> log2_ctb_size) - 1)) << (log2_ctb_size - 1));
+                    pu4_horz_bs[end_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+                }
+            }
+        }
+
+        if(0 == u4_qp_const_in_ctb)
+        {
+            if(0 == (start_pos_x & 1) && 0 == (start_pos_y & 1))
+            {
+                WORD32 row, col;
+                for(row = start_pos_y; row < start_pos_y + tu_size; row += 2)
+                {
+                    for(col = start_pos_x; col < start_pos_x + tu_size; col += 2)
+                    {
+                        pu1_qp[(row >> 1) * qp_strd + (col >> 1)] = ps_tu->b7_qp;
+                    }
+                }
+            }
+        }
+    }
+
+    /* For all PUs in the CTB, compute BS for the left and top edges */
+
+    cur_ctb_idx = ps_bs_ctxt->i4_ctb_x
+                    + ps_bs_ctxt->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+    {
+        WORD32 next_ctb_idx;
+        next_ctb_idx = ps_bs_ctxt->i4_next_pu_ctb_cnt;
+        i4_pu_cnt = ps_bs_ctxt->pu4_pic_pu_idx[next_ctb_idx] - ps_bs_ctxt->pu4_pic_pu_idx[cur_ctb_idx];
+    }
+
+    for(i = 0; i < i4_pu_cnt; i++)
+    {
+        WORD32 start_pos_x;
+        WORD32 start_pos_y;
+        WORD32 end_pos_x;
+        WORD32 end_pos_y;
+        WORD32 pu_wd, pu_ht;
+        UWORD32 u4_bs;
+        pu_t *ps_pu = ps_bs_ctxt->ps_pu + i;
+        pu_t *ps_ngbr_pu;
+        UWORD32 u4_ngbr_pu_indx;
+
+        start_pos_x = ps_pu->b4_pos_x;
+        start_pos_y = ps_pu->b4_pos_y;
+
+        pu_wd = (ps_pu->b4_wd + 1);
+        pu_ht = (ps_pu->b4_ht + 1);
+
+        end_pos_x = start_pos_x + pu_wd;
+        end_pos_y = start_pos_y + pu_ht;
+
+        /* If the current PU is intra, set Boundary strength as 2 for both top and left edge */
+        /* Need not mask the BS to zero even if it was set to 1 already since BS 2 and 3 are assumed to be the same in leaf level functions */
+        if(ps_pu->b1_intra_flag)
+        {
+            u4_bs = DUP_LSB_10(pu_ht);
+
+            /* Only if the current edge falls on 8 pixel grid set BS */
+            if(0 == (start_pos_x & 1))
+            {
+                WORD32 shift;
+                shift = start_pos_y * 2;
+                /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+                 *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+                 *  and deblocking is done on 8x8 grid
+                 */
+                if(6 != log2_ctb_size)
+                    shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+                pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+            }
+
+            u4_bs = DUP_LSB_10(pu_wd);
+
+            /* Only if the current edge falls on 8 pixel grid set BS */
+            if(0 == (start_pos_y & 1))
+            {
+                WORD32 shift;
+                shift = start_pos_x * 2;
+                /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+                 *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+                 *  and deblocking is done on 8x8 grid
+                 */
+                if(6 != log2_ctb_size)
+                    shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+                pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+            }
+        }
+
+        else
+        {
+            /* Vertical edge */
+            /* Process only if the edge is not a frame edge */
+            if(0 != ps_bs_ctxt->i4_ctb_x + start_pos_x)
+            {
+                do
+                {
+                    WORD32 pu_ngbr_ht;
+                    WORD32 min_pu_ht;
+                    WORD32 ngbr_end_pos_y;
+                    UWORD32 ngbr_pu_idx_strd;
+                    ngbr_pu_idx_strd = MAX_CTB_SIZE / MIN_PU_SIZE + 2;
+                    u4_ngbr_pu_indx = ps_bs_ctxt->pu4_pic_pu_idx_map[(start_pos_y + 1) * ngbr_pu_idx_strd + (start_pos_x)];
+                    ps_ngbr_pu = ps_bs_ctxt->ps_pic_pu + u4_ngbr_pu_indx;
+
+                    pu_ngbr_ht = ps_ngbr_pu->b4_ht + 1;
+                    ngbr_end_pos_y = ps_ngbr_pu->b4_pos_y + pu_ngbr_ht;
+
+                    min_pu_ht = MIN(ngbr_end_pos_y, end_pos_y) - start_pos_y;
+
+                    if(ps_ngbr_pu->b1_intra_flag)
+                    {
+                        u4_bs = DUP_LSB_10(min_pu_ht);
+
+                        /* Only if the current edge falls on 8 pixel grid set BS */
+                        if(0 == (start_pos_x & 1))
+                        {
+                            WORD32 shift;
+                            shift = start_pos_y * 2;
+                            /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+                             *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+                             *  and deblocking is done on 8x8 grid
+                             */
+                            if(6 != log2_ctb_size)
+                                shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+                            pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+                        }
+                    }
+                    else
+                    {
+                        u4_bs = ihevcd_pu_boundary_strength(ps_pu, ps_ngbr_pu);
+                        if(u4_bs)
+                        {
+                            u4_bs = DUP_LSB_01(min_pu_ht);
+                            if(0 == (start_pos_x & 1))
+                            {
+                                WORD32 shift;
+                                shift = start_pos_y * 2;
+                                /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+                                 *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+                                 *  and deblocking is done on 8x8 grid
+                                 */
+                                if(6 != log2_ctb_size)
+                                    shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+                                pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+                            }
+                        }
+                    }
+
+                    pu_ht -= min_pu_ht;
+                    start_pos_y += min_pu_ht;
+                }while(pu_ht > 0);
+
+                /* Reinitialising since the values are updated in the previous loop */
+                pu_ht = ps_pu->b4_ht + 1;
+                start_pos_y = ps_pu->b4_pos_y;
+            }
+
+            /* Horizontal edge */
+            /* Process only if the edge is not a frame edge */
+            if(0 != ps_bs_ctxt->i4_ctb_y + start_pos_y)
+            {
+                do
+                {
+                    WORD32 pu_ngbr_wd;
+                    WORD32 min_pu_wd;
+                    WORD32 ngbr_end_pos_x;
+                    UWORD32 ngbr_pu_idx_strd = MAX_CTB_SIZE / MIN_PU_SIZE + 2;
+                    u4_ngbr_pu_indx = ps_bs_ctxt->pu4_pic_pu_idx_map[(start_pos_y)*ngbr_pu_idx_strd + (start_pos_x + 1)];
+                    ps_ngbr_pu = ps_bs_ctxt->ps_pic_pu + u4_ngbr_pu_indx;
+
+                    pu_ngbr_wd = ps_ngbr_pu->b4_wd + 1;
+                    ngbr_end_pos_x = ps_ngbr_pu->b4_pos_x + pu_ngbr_wd;
+
+                    min_pu_wd = MIN(ngbr_end_pos_x, end_pos_x) - start_pos_x;
+
+                    if(ps_ngbr_pu->b1_intra_flag)
+                    {
+                        u4_bs = DUP_LSB_10(min_pu_wd);
+
+                        /* Only if the current edge falls on 8 pixel grid set BS */
+                        if(0 == (start_pos_y & 1))
+                        {
+                            WORD32 shift;
+                            shift = start_pos_x * 2;
+                            /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+                             *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+                             *  and deblocking is done on 8x8 grid
+                             */
+                            if(6 != log2_ctb_size)
+                                shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+                            pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+                        }
+                    }
+                    else
+                    {
+                        u4_bs = ihevcd_pu_boundary_strength(ps_pu, ps_ngbr_pu);
+                        if(u4_bs)
+                        {
+                            u4_bs = DUP_LSB_01(min_pu_wd);
+
+                            /* Only if the current edge falls on 8 pixel grid set BS */
+                            if(0 == (start_pos_y & 1))
+                            {
+                                WORD32 shift;
+                                shift = start_pos_x * 2;
+                                /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+                                 *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+                                 *  and deblocking is done on 8x8 grid
+                                 */
+                                if(6 != log2_ctb_size)
+                                    shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+                                pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+                            }
+                        }
+                    }
+
+                    pu_wd -= min_pu_wd;
+                    start_pos_x += min_pu_wd;
+                }while(pu_wd > 0);
+
+                /* Reinitialising since the values are updated in the previous loop */
+                pu_wd = ps_pu->b4_wd + 1;
+                start_pos_x = ps_pu->b4_pos_x;
+            }
+        }
+    }
+
+    {
+        /* If left neighbor is not available, then set BS for entire first column to zero */
+        UWORD32 ctb_addr;
+        WORD32 slice_idx, left_slice_idx = -1, top_slice_idx = -1;
+
+        if(!ps_pps->i1_tiles_enabled_flag)
+        {
+            if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_x) ||
+                            (0 == i1_loop_filter_across_slices_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_slice_y) ||
+                            (0 == ps_bs_ctxt->i4_ctb_x))
+            {
+                pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+            }
+        }
+        else
+        {
+            if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_x))
+            {
+                pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+            }
+            else
+            {
+
+                ctb_addr = ps_bs_ctxt->i4_ctb_x + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+                slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+
+                if(ps_bs_ctxt->i4_ctb_x)
+                {
+                    ctb_addr = (ps_bs_ctxt->i4_ctb_x - 1) + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+                    left_slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+                }
+
+                if(!((ps_bs_ctxt->ps_slice_hdr->i1_dependent_slice_flag == 1) && (slice_idx == left_slice_idx)))
+                {
+                    if((0 == i1_loop_filter_across_slices_enabled_flag && (
+                                    (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_slice_y) || (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_tile_x)
+                                    || ((0 == ps_bs_ctxt->i4_ctb_tile_x) && (slice_idx != left_slice_idx)))) || (0 == ps_bs_ctxt->i4_ctb_x))
+                    {
+                        pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+                    }
+                }
+            }
+        }
+
+        ctb_addr = ps_bs_ctxt->i4_ctb_x + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+        slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+        if(ps_bs_ctxt->i4_ctb_y)
+        {
+            ctb_addr = (ps_bs_ctxt->i4_ctb_x) + ((ps_bs_ctxt->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb);
+            top_slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+        }
+        /* If top neighbor is not available, then set BS for entire first row to zero */
+        if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_y)
+                        || (0 == i1_loop_filter_across_slices_enabled_flag && ((0 == ps_bs_ctxt->i4_ctb_slice_y) || (slice_idx != top_slice_idx)))
+                        || (0 == ps_bs_ctxt->i4_ctb_y))
+        {
+            pu4_horz_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+        }
+    }
+
+    /**
+     *  Set BS of bottom and right frame boundaries to zero if it is an incomplete CTB
+     *   (They might have been set to non-zero values because of the CBF of the current CTB) */
+    {
+        WORD32 num_rows_remaining = (ps_sps->i2_pic_height_in_luma_samples - (ps_bs_ctxt->i4_ctb_y << log2_ctb_size)) >> 3;
+        WORD32 num_cols_remaining = (ps_sps->i2_pic_width_in_luma_samples - (ps_bs_ctxt->i4_ctb_x << log2_ctb_size)) >> 3;
+        if(num_rows_remaining < (ctb_size >> 3))
+        {
+            /* WORD32 offset = (((num_rows_remaining >> 3) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 4));
+             *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+             *  and deblocking is done on 8x8 grid
+             */
+            WORD32 offset;
+            offset = (num_rows_remaining >> (6 - log2_ctb_size)) << 2;
+            if(6 != log2_ctb_size)
+                offset += (num_rows_remaining & 1) << (log2_ctb_size - 4);
+
+            memset(((UWORD8 *)pu4_horz_bs) + offset, 0, 1 << (log2_ctb_size - 4));
+        }
+
+        if(num_cols_remaining < (ctb_size >> 3))
+        {
+            /* WORD32 offset = (((num_cols_remaining >> 3) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 4));
+             *  will reduce to the following assuming ctb size is one of 16, 32 and 64
+             *  and deblocking is done on 8x8 grid
+             */
+
+            WORD32 offset;
+            offset = (num_cols_remaining >> (6 - log2_ctb_size)) << 2;
+            if(6 != log2_ctb_size)
+                offset += (num_cols_remaining & 1) << (log2_ctb_size - 4);
+
+            memset(((UWORD8 *)pu4_vert_bs) + offset, 0, 1 << (log2_ctb_size - 4));
+        }
+    }
+    return 0;
+}
diff --git a/decoder/ihevcd_boundary_strength.h b/decoder/ihevcd_boundary_strength.h
new file mode 100644
index 0000000..c2f3e16
--- /dev/null
+++ b/decoder/ihevcd_boundary_strength.h
@@ -0,0 +1,49 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_boundary_strength.h
+*
+* @brief
+*  Header for boundary strength functions
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVCD_BOUNDARY_STRENGTH_H_
+#define IHEVCD_BOUNDARY_STRENGTH_H_
+
+WORD32 ihevcd_ctb_boundary_strength_islice(bs_ctxt_t *ps_bs_ctxt);
+
+WORD32 ihevcd_ctb_boundary_strength_pbslice(bs_ctxt_t *ps_bs_ctxt);
+
+WORD32 ihevcd_pu_boundary_strength(pu_t *ps_pu,
+                                   pu_t *ps_ngbr_pu);
+
+
+
+#endif /* IHEVCD_BOUNDARY_STRENGTH_H_ */
diff --git a/decoder/ihevcd_cabac.c b/decoder/ihevcd_cabac.c
new file mode 100644
index 0000000..07e9e54
--- /dev/null
+++ b/decoder/ihevcd_cabac.c
@@ -0,0 +1,845 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ ******************************************************************************
+ * @file ihevcd_cabac.c
+ *
+ * @brief
+ *    This file contains function definitions related to CABAC parsing
+ *
+ * @author
+ *    Ittiam
+ *
+ *
+ * List of Functions
+ *
+ *   ihevcd_cabac_init()
+ *   ihevcd_cabac_decode_bin()
+ *   ihevcd_cabac_decode_bypass_bin()
+ *   ihevcd_cabac_decode_bypass_bins_tunary()
+ *   ihevcd_cabac_decode_terminate()
+ *   ihevcd_cabac_decode_bin_tunary()
+ *   ihevcd_cabac_decode_bypass_bins()
+ *   ihevcd_cabac_decode_bypass_bins_egk()
+ *   ihevcd_cabac_decode_trunc_rice()
+ *   ihevcd_cabac_flush()
+ *
+ ******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+
+#include "ihevc_debug.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_cabac_tables.h"
+
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_cabac.h"
+#include "ihevcd_trace.h"
+
+#ifdef TRACE
+extern trace_t g_trace;
+#endif
+#if DEBUG_CABAC_RANGE_OFST
+#if FULLRANGE
+#define DEBUG_RANGE_OFST(str, m_range, m_ofst )  \
+{\
+    UWORD32 m_clz, m_range_shift, m_ofst_shift;                           \
+    m_clz = CLZ(m_range);                                                \
+    m_clz -= (32 - RANGE_NUMBITS);                                      \
+    m_range_shift = m_range << m_clz;                                    \
+    m_range_shift = m_range_shift >> RANGE_SHIFT;                                 \
+    m_ofst_shift = m_ofst << m_clz;                                    \
+    m_ofst_shift = m_ofst_shift >> RANGE_SHIFT;                                 \
+    fprintf( g_trace.fp, "%-40s R: %3d O: %3d\n", str, m_range_shift, m_ofst_shift); \
+}
+
+#else
+#define DEBUG_RANGE_OFST(str,  m_range, m_ofst) \
+    fprintf( g_trace.fp, "%-40s R: %3d O: %3d\n", str, m_range, m_ofst);
+#endif
+#else
+#define DEBUG_RANGE_OFST(str, m_range, m_ofst )
+#endif
+/*****************************************************************************/
+/* Function Definitions                                                      */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Initializes the decoder cabac engine
+ *
+ *  @par   Description
+ *  This routine needs to be called at start of slice/frame decode
+ *
+ *  @param[in,out]   ps_cabac_ctxt
+ *  pointer to cabac context (handle)
+ *
+ *  @param[in]   ps_bitstrm
+ *  pointer to bitstream context (handle)
+ *
+ *  @param[in]   qp
+ *  current slice Qp
+ *
+ *  @param[in]   cabac_init_idc
+ *  current slice init idc (range  [0 - 2])
+ *
+ *  @param[in]   pu1_init_ctxt
+ *  Pointer to the cabac context init table to be used
+ *
+ *  @return      success or failure error code
+ *
+ ******************************************************************************
+ */
+IHEVCD_ERROR_T ihevcd_cabac_init(cab_ctxt_t *ps_cabac,
+                                 bitstrm_t *ps_bitstrm,
+                                 WORD32 qp,
+                                 WORD32 cabac_init_idc,
+                                 const UWORD8 *pu1_init_ctxt)
+{
+    /* Sanity checks */
+    ASSERT(ps_cabac != NULL);
+    ASSERT(ps_bitstrm != NULL);
+    ASSERT((qp >= 0) && (qp < 52));
+    ASSERT((cabac_init_idc >= 0) && (cabac_init_idc < 3));
+    UNUSED(qp);
+    UNUSED(cabac_init_idc);
+    /* CABAC engine uses 32 bit range instead of 9 bits as specified by
+     * the spec. This is done to reduce number of renormalizations
+     */
+    /* cabac engine initialization */
+#if FULLRANGE
+    ps_cabac->u4_range = (UWORD32)510 << RANGE_SHIFT;
+    BITS_GET(ps_cabac->u4_ofst, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+                    ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, (9 + RANGE_SHIFT));
+
+#else
+    ps_cabac->u4_range = (UWORD32)510;
+    BITS_GET(ps_cabac->u4_ofst, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+                    ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, 9);
+
+#endif
+
+    /* cabac context initialization based on init idc and slice qp */
+    memcpy(ps_cabac->au1_ctxt_models,
+           pu1_init_ctxt,
+           IHEVC_CAB_CTXT_END);
+    DEBUG_RANGE_OFST("init", ps_cabac->u4_range, ps_cabac->u4_ofst);
+    return ((IHEVCD_ERROR_T)IHEVCD_SUCCESS);
+}
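+
+/* Illustrative usage sketch (not part of the decoder): at the start of each
+ * slice the parser is expected to do roughly the following. The init table
+ * pointer is codec internal; the selection of the table from cabac_init_idc
+ * and slice qp is not shown, and the wrapper name below is hypothetical. */
+#if 0
+static IHEVCD_ERROR_T example_slice_cabac_start(cab_ctxt_t *ps_cabac,
+                                                bitstrm_t *ps_bitstrm,
+                                                WORD32 slice_qp,
+                                                WORD32 cabac_init_idc,
+                                                const UWORD8 *pu1_init_table)
+{
+    /* Copies the init table into the engine and primes range/offset
+     * from the bitstream */
+    return ihevcd_cabac_init(ps_cabac, ps_bitstrm, slice_qp,
+                             cabac_init_idc, pu1_init_table);
+}
+#endif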
+
+IHEVCD_ERROR_T ihevcd_cabac_reset(cab_ctxt_t *ps_cabac,
+                                  bitstrm_t *ps_bitstrm)
+{
+    /* Sanity checks */
+    ASSERT(ps_cabac != NULL);
+    ASSERT(ps_bitstrm != NULL);
+
+    /* CABAC engine uses 32 bit range instead of 9 bits as specified by
+     * the spec. This is done to reduce number of renormalizations
+     */
+    /* cabac engine initialization */
+#if FULLRANGE
+    ps_cabac->u4_range = (UWORD32)510 << RANGE_SHIFT;
+    BITS_GET(ps_cabac->u4_ofst, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+                    ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, (9 + RANGE_SHIFT));
+
+#else
+    ps_cabac->u4_range = (UWORD32)510;
+    BITS_GET(ps_cabac->u4_ofst, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+                    ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, 9);
+
+#endif
+
+    return ((IHEVCD_ERROR_T)IHEVCD_SUCCESS);
+}
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Decodes a bin based on probability and MPS packed context model
+ *
+ *  @par   Description
+ *  Decodes a bin as per Section 9.3.3.2.1 and calls renormalization if
+ *  required as per Section 9.3.3.2.2
+ *  1. Apart from decoding the bin, the context model is updated as per the
+ *     state transition
+ *  2. Range and Low renormalization is done based on the bin and original state
+ *  3. After renormalization the bitstream is updated (if required)
+ *
+ *  @param[in,out]   ps_cabac
+ *  pointer to cabac context (handle)
+ *
+ *  @param[in]   ctxt_index
+ *  index of cabac context model containing pState[bits6-1] | MPS[bit0]
+ *
+ *  @param[in]   ps_bitstrm
+ *  Bitstream context
+ *
+ *  @return      decoded bin (boolean)
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bin(cab_ctxt_t *ps_cabac,
+                                bitstrm_t *ps_bitstrm,
+                                WORD32 ctxt_index)
+{
+    UWORD32 u4_range = ps_cabac->u4_range;
+    UWORD32 u4_ofst = ps_cabac->u4_ofst;
+    UWORD32 u4_rlps;
+    UWORD32 u4_bin;
+    UWORD8 *pu1_ctxt_model = &ps_cabac->au1_ctxt_models[ctxt_index];
+    WORD32 state_mps = *pu1_ctxt_model;
+#if FULLRANGE
+    WORD32 clz;
+#endif
+    UWORD32 u4_qnt_range;
+
+    /* Sanity checks */
+    ASSERT(u4_range >= 256);
+    ASSERT((ctxt_index >= 0) && (ctxt_index < IHEVC_CAB_CTXT_END));
+    ASSERT(state_mps < 128);
+#if FULLRANGE
+    clz = CLZ(u4_range);
+    clz -= (32 - RANGE_NUMBITS);
+    u4_qnt_range = u4_range << clz;
+    u4_qnt_range = (u4_qnt_range >> (RANGE_SHIFT + 6)) & 0x3;
+#else
+    u4_qnt_range = (u4_range >> 6) & 0x3;
+#endif
+    /* Get the lps range from LUT based on quantized range and state */
+    u4_rlps = gau1_ihevc_cabac_rlps[state_mps >> 1][u4_qnt_range];
+#if FULLRANGE
+    u4_rlps = u4_rlps << (RANGE_SHIFT - clz);
+#endif
+    u4_range -= u4_rlps;
+
+    u4_bin = state_mps & 1;
+
+    if(u4_ofst >= u4_range)
+    {
+        u4_bin = 1 - u4_bin;
+        u4_ofst -= u4_range;
+        u4_range = u4_rlps;
+    }
+
+    *pu1_ctxt_model = gau1_ihevc_next_state[(state_mps << 1) | u4_bin];
+
+    /*****************************************************************/
+    /* Re-normalization; calculate bits generated based on range(R)  */
+    /*****************************************************************/
+    if(u4_range < (1 << 8))
+    {
+        UWORD32 u4_bits;
+        WORD32 numbits;
+        numbits = CLZ(u4_range);
+        numbits -= (32 - RANGE_NUMBITS);
+#if !FULLRANGE
+        numbits -= RANGE_SHIFT;
+#endif
+        BITS_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+                 ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, numbits);
+
+        u4_ofst <<= numbits;
+        u4_ofst |= u4_bits;
+        u4_range <<= numbits;
+
+    }
+    /* Update the cabac context */
+    ps_cabac->u4_range = u4_range;
+    ps_cabac->u4_ofst = u4_ofst;
+    DEBUG_RANGE_OFST("bin", ps_cabac->u4_range, ps_cabac->u4_ofst);
+
+    return (u4_bin);
+}
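+
+/* Worked example (illustrative only, non-FULLRANGE 9 bit engine assumed):
+ * with u4_range = 300 and a context model byte of 0x21 (pState 16, MPS 1),
+ * the quantized range is (300 >> 6) & 0x3 = 0 and the LPS range comes from
+ * gau1_ihevc_cabac_rlps[16][0]. The range shrinks by that amount; if the
+ * offset still lies below the shrunken range the MPS (1) is decoded,
+ * otherwise the LPS (0) is decoded and the range collapses to the LPS range.
+ * Renormalization then reads just enough bits to bring the range back to at
+ * least 256. */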
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Decodes a bypass bin (equi-probable 0 / 1)
+ *
+ *  @par   Description
+ *  Decodes a bypass bin as per Section 9.3.3.2.3
+ *
+ *  @param[in,out]  ps_cabac
+ *  pointer to cabac context (handle)
+ *
+ *  @param[in]   ps_bitstrm
+ *  Bitstream context
+ *
+ *  @return      Decoded bypass bin
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bypass_bin(cab_ctxt_t *ps_cabac,
+                                       bitstrm_t *ps_bitstrm)
+{
+
+    UWORD32 u4_bin;
+    UWORD32 u4_range = ps_cabac->u4_range;
+    UWORD32 u4_ofst = ps_cabac->u4_ofst;
+    UWORD32 u4_bits;
+
+    /* Sanity checks */
+    ASSERT(u4_range >= 256);
+
+    BIT_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+            ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word);
+
+    u4_ofst <<= 1;
+    u4_ofst |= u4_bits;
+
+    u4_bin = 0;
+    if(u4_ofst >= u4_range)
+    {
+        u4_bin = 1;
+        u4_ofst -= u4_range;
+    }
+
+    /* Update the cabac context */
+    ps_cabac->u4_ofst = u4_ofst;
+    DEBUG_RANGE_OFST("bypass end", ps_cabac->u4_range, ps_cabac->u4_ofst);
+    return (u4_bin);
+}
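+
+/* Illustrative note: bypass decoding leaves the range untouched; the offset
+ * is doubled, one fresh bitstream bit is appended, and the bin is 1 exactly
+ * when the grown offset reaches u4_range (the range is then subtracted out). */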
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Decodes a terminate bin (1:terminate 0:do not terminate)
+ *
+ *  @par   Description
+ *  Decodes a terminate bin to be called for end_of_slice_flag and pcm_flag
+ *  as per Section : 9.3.3.2.4
+ *
+ *  @param[in,out]  ps_cabac
+ *  pointer to cabac context (handle)
+ *
+ *  @param[in]   ps_bitstrm
+ *  Bitstream context
+ *
+ *  @return    Decoded Bin to indicate whether to terminate or not
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_terminate(cab_ctxt_t *ps_cabac,
+                                      bitstrm_t *ps_bitstrm)
+{
+    UWORD32 u4_range = ps_cabac->u4_range;
+    UWORD32 u4_ofst = ps_cabac->u4_ofst;
+    UWORD32 u4_bin;
+#if FULLRANGE
+    WORD32 clz;
+#endif
+    /* Sanity checks */
+    ASSERT(u4_range >= 256);
+#if FULLRANGE
+    clz = CLZ(u4_range);
+    clz -= (32 - RANGE_NUMBITS);
+    u4_range -= 2 << (RANGE_SHIFT - clz);
+#else
+    u4_range -= 2;
+#endif
+
+    if(u4_ofst >= u4_range)
+    {
+        u4_bin = 1;
+
+#if FULLRANGE
+        /* In case of FULLRANGE extra bits read earlier need to pushed back to the bitstream */
+        {
+            WORD32 clz;
+            WORD32 numbits;
+            clz = CLZ(ps_cabac->u4_range);
+
+            numbits = (32 - clz);
+            numbits -= 9;
+
+            ihevcd_bits_seek(ps_bitstrm, -numbits);
+        }
+#endif
+
+    }
+    else
+    {
+        u4_bin = 0;
+    }
+    if(0 == u4_bin)
+    {
+        UWORD32 u4_bits;
+        WORD32 numbits;
+        numbits = CLZ(u4_range);
+        numbits -= (32 - RANGE_NUMBITS);
+#if !FULLRANGE
+        numbits -= RANGE_SHIFT;
+#endif
+        /* Renormalize if required */
+        if(numbits)
+        {
+            BITS_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+                     ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, numbits);
+
+            u4_ofst <<= numbits;
+            u4_ofst |= u4_bits;
+            u4_range <<= numbits;
+        }
+    }
+    /* bits to be inserted in the bitstream */
+    ps_cabac->u4_range = u4_range;
+    ps_cabac->u4_ofst = u4_ofst;
+    DEBUG_RANGE_OFST("term", ps_cabac->u4_range, ps_cabac->u4_ofst);
+
+    return (u4_bin);
+}
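+
+/* Illustrative note (non-FULLRANGE values assumed): the terminate bin
+ * reserves an LPS range of 2, so with u4_range = 258 the range first drops
+ * to 256. An offset of 256 or more then decodes 1 (terminate); a smaller
+ * offset decodes 0 and, since the shrunken range is still >= 256, no
+ * renormalization bits are read. */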
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Decodes multiple bypass bins (each equi-probable 0 / 1)
+ *
+ *  @par   Description
+ *  Decodes numbins bypass bins as per Section 9.3.3.2.3
+ *
+ *  @param[in,out]  ps_cabac
+ *  pointer to cabac context (handle)
+ *
+ *  @param[in]   ps_bitstrm
+ *  Bitstream context
+ *
+ *  @param[in]   numbins
+ *  Number of bins to be decoded
+ *
+ *  @return      Decoded bypass bins packed into a word, first bin in the MSB
+ *
+ *  @remarks     Tested only for numbins less than 17
+ *
+ ******************************************************************************
+ */
+
+UWORD32 ihevcd_cabac_decode_bypass_bins(cab_ctxt_t *ps_cabac,
+                                        bitstrm_t *ps_bitstrm,
+                                        WORD32 numbins)
+{
+    UWORD32 u4_bins;
+
+
+    UWORD32 u4_range = ps_cabac->u4_range;
+    UWORD32 u4_ofst = ps_cabac->u4_ofst;
+    UWORD32 u4_bits;
+    ASSERT(u4_range >= 256);
+    ASSERT(numbins > 0);
+
+    /* Sanity checks */
+    ASSERT(numbins < 17);
+
+    u4_bins = 0;
+
+    BITS_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+                    ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, numbins);
+
+    do
+    {
+        UWORD32 u4_bit;
+        numbins--;
+        u4_bit = (u4_bits >> numbins) & 1;
+        u4_ofst <<= 1;
+        u4_ofst |= u4_bit;
+
+        u4_bins <<= 1;
+        if(u4_ofst >= u4_range)
+        {
+            u4_bins += 1;
+            u4_ofst -= u4_range;
+        }
+    }while(numbins);
+
+    /* Update the cabac context */
+    ps_cabac->u4_ofst = u4_ofst;
+    DEBUG_RANGE_OFST("bypass", ps_cabac->u4_range, ps_cabac->u4_ofst);
+    return (u4_bins);
+}
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Decodes a truncated unary symbol associated with context model(s)
+ *
+ *  @par   Description
+ *  Decodes symbols coded with TUnary binarization as per sec 9.3.2.2
+ *  This is used for computing symbols like qp_delta,
+ *  last_sig_coeff_prefix_x, last_sig_coeff_prefix_y.
+ *
+ *  The context model associated with each bin is computed as:
+ *   current bin context = "base context idx" + MIN(bin_idx >> shift, ctxt_inc_max)
+ *  where
+ *   1. "base context idx" is the base index for the syntax element
+ *   2. "bin_idx" is the current bin index of the unary code
+ *   3. "shift" is the shift factor associated with this syntax element
+ *
+ *  @param[in,out] ps_cabac
+ *   pointer to cabac context (handle)
+ *
+ *  @param[in]   ps_bitstrm
+ *  Bitstream context
+ *
+ *  @param[in]   c_max
+ *   maximum value of sym (required for tunary binarization)
+ *
+ *  @param[in]   ctxt_index
+ *   base context model index for this syntax element
+ *
+ *  @param[in]   ctxt_shift
+ *   shift factor for context increments associated with this syntax element
+ *
+ *  @param[in]   ctxt_inc_max
+ *   max value of context increment beyond which all bins will use same ctxt
+ *
+ *  @return     syntax element decoded
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bins_tunary(cab_ctxt_t *ps_cabac,
+                                        bitstrm_t *ps_bitstrm,
+                                        WORD32 c_max,
+                                        WORD32 ctxt_index,
+                                        WORD32 ctxt_shift,
+                                        WORD32 ctxt_inc_max)
+{
+    UWORD32 u4_sym;
+    WORD32 bin;
+
+    /* Sanity checks */
+    ASSERT(c_max > 0);
+    ASSERT((ctxt_index >= 0) && (ctxt_index < IHEVC_CAB_CTXT_END));
+    ASSERT((ctxt_index + (c_max >> ctxt_shift)) < IHEVC_CAB_CTXT_END);
+
+    u4_sym = 0;
+    do
+    {
+        WORD32 bin_index;
+        bin_index = ctxt_index + MIN((u4_sym >> ctxt_shift), ctxt_inc_max);
+        IHEVCD_CABAC_DECODE_BIN(bin, ps_cabac, ps_bitstrm,  bin_index);
+        u4_sym++;
+    }while(((WORD32)u4_sym < c_max) && bin);
+
+    u4_sym = u4_sym - 1 + bin;
+
+    return (u4_sym);
+}
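+
+/* Worked example (illustrative only): with ctxt_shift = 1 and
+ * ctxt_inc_max = 2, bins 0 and 1 use context (ctxt_index + 0), bins 2 and 3
+ * use (ctxt_index + 1), and all later bins clamp to (ctxt_index + 2). The
+ * decoded symbol is the number of leading 1 bins, stopping at the first 0
+ * bin or after c_max bins. */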
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Decodes a syntax element as truncated unary bypass bins
+ *
+ *  @par   Description
+ *  Decodes symbols coded with TUnary binarization as per sec 9.3.2.2
+ *  These symbols are coded as bypass bins
+ *   This is used for computing symbols like merge_idx,
+ *  mpm_idx etc
+ *
+ *  @param[in,out]ps_cabac
+ *   pointer to cabac context (handle)
+ *
+ *  @param[in]   ps_bitstrm
+ *  Bitstream context
+ *
+ *  @param[in]   c_max
+ *   maximum value of sym (required for tunary binarization)
+ *
+ *  @return      syntax element decoded
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bypass_bins_tunary(cab_ctxt_t *ps_cabac,
+                                               bitstrm_t *ps_bitstrm,
+                                               WORD32 c_max)
+{
+
+    UWORD32 u4_sym;
+    WORD32 bin;
+    UWORD32 u4_ofst = ps_cabac->u4_ofst;
+    UWORD32 u4_range = ps_cabac->u4_range;
+    UWORD32 u4_bits;
+    /* Sanity checks */
+    ASSERT(c_max > 0);
+    ASSERT(u4_range >= 256);
+    u4_sym = 0;
+    BITS_NXT(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+                    ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, (UWORD32)c_max);
+    u4_bits <<= (32 - c_max);
+    do
+    {
+        u4_ofst <<= 1;
+        u4_ofst |= (u4_bits >> 31);
+        u4_bits <<= 1;
+
+        bin = 0;
+        if(u4_ofst >= u4_range)
+        {
+            bin = 1;
+            u4_ofst -= u4_range;
+        }
+        u4_sym++;
+    }while(((WORD32)u4_sym < c_max) && bin);
+    BITS_FLUSH(ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+                    ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, u4_sym);
+
+    u4_sym = u4_sym - 1 + bin;
+    /* Update the cabac context */
+    ps_cabac->u4_ofst = u4_ofst;
+
+    return (u4_sym);
+}
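+
+/* Implementation note: instead of pulling bins one at a time, c_max bits are
+ * peeked with BITS_NXT, consumed MSB first as bypass bins, and only the bins
+ * actually used (u4_sym of them) are committed back with BITS_FLUSH. */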
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Decodes a syntax element as kth order Exp-Golomb code (EGK)
+ *
+ *  @par   Description
+ *  Decodes a syntax element binarized as kth order Exp-Golomb code (EGK)
+ *  Elements are coded as bypass bins
+ *
+ *  @param[in,out] ps_cabac
+ *   pointer to cabac context (handle)
+ *
+ *  @param[in]   ps_bitstrm
+ *   Bitstream context
+ *
+ *  @param[in]   k
+ *   order of EGk
+ *
+ *  @return      syntax element decoded
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bypass_bins_egk(cab_ctxt_t *ps_cabac,
+                                            bitstrm_t *ps_bitstrm,
+                                            WORD32 k)
+{
+
+    UWORD32 u4_sym;
+    WORD32 numones;
+    WORD32 bin;
+
+    /* Sanity checks */
+    ASSERT((k >= 0));
+
+    numones = k;
+    bin = 1;
+    u4_sym = 0;
+    while(bin)
+    {
+        IHEVCD_CABAC_DECODE_BYPASS_BIN(bin, ps_cabac, ps_bitstrm);
+        u4_sym += bin << numones++;
+    }
+
+    numones -= 1;
+    numones = CLIP3(numones, 0, 16);
+
+    if(numones)
+    {
+        UWORD32 u4_suffix;
+
+        IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_suffix, ps_cabac, ps_bitstrm, numones);
+        u4_sym += u4_suffix;
+    }
+    return (u4_sym);
+}
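+
+/* Worked example (illustrative only): for k = 0, bypass bins 1 1 0 form the
+ * unary prefix, adding (1 << 0) + (1 << 1) = 3 to u4_sym and leaving
+ * numones = 2, so a 2 bin suffix is read; a suffix of binary 10 (= 2) gives
+ * a decoded value of 3 + 2 = 5, matching the EG0 codeword 11010. */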
+
+/**
+ ******************************************************************************
+ *
+ *  @brief Decodes a syntax element as truncated rice code (TR)
+ *
+ *  @par   Description
+ *  Decodes a syntax element as truncated rice code (TR)
+ *  Elements are coded as bypass bins
+ *  This function is used for coeff_abs_level_remaining decoding when the
+ *  level is less than c_rice_max
+ *
+ *  @param[in,out] ps_cabac
+ *   pointer to cabac context (handle)
+ *
+ *  @param[in]   ps_bitstrm
+ *   Bitstream context
+ *
+ *  @param[in]   c_rice_param
+ *    shift factor for truncated unary prefix coding of (u4_sym >> c_rice_param)
+ *
+ *  @param[in]   c_rice_max
+ *    max symbol val below which a suffix is coded as (u4_sym%(1<<c_rice_param))
+ *    This is currently (4 << c_rice_param) for coeff_abs_level_remaining
+ *
+ *  @return      syntax element decoded
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bypass_bins_trunc_rice(cab_ctxt_t *ps_cabac,
+                                                   bitstrm_t *ps_bitstrm,
+                                                   WORD32 c_rice_param,
+                                                   WORD32 c_rice_max)
+{
+    UWORD32 u4_sym;
+    WORD32 bin;
+    WORD32 c_max;
+    UWORD32 u4_suffix;
+    /* Sanity checks */
+    ASSERT((c_rice_param >= 0));
+
+
+    /* Decode prefix coded as TUnary */
+    c_max = c_rice_max >> c_rice_param;
+    u4_sym = 0;
+    do
+    {
+        IHEVCD_CABAC_DECODE_BYPASS_BIN(bin, ps_cabac, ps_bitstrm);
+        u4_sym++;
+
+    }while(((WORD32)u4_sym < c_max) && bin);
+    u4_sym = u4_sym - 1 + bin;
+
+    /* If suffix is present, then decode c_rice_param number of bins */
+    if(c_rice_param)
+    {
+        IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_suffix, ps_cabac, ps_bitstrm, c_rice_param);
+
+        u4_sym = (u4_sym << c_rice_param) | u4_suffix;
+    }
+    return (u4_sym);
+}
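+
+/* Worked example (illustrative only): with c_rice_param = 1 and
+ * c_rice_max = 8, the truncated unary prefix is capped at c_max = 4. Bypass
+ * bins 1 1 0 yield a prefix of 2, and a 1 bin suffix of 1 completes the
+ * value: u4_sym = (2 << 1) | 1 = 5. */
+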
+#if 0
+/**
+ ******************************************************************************
+ *
+ *  @brief Flushes the cabac decoder engine as per section 9.3.4 figure 9-12
+ *
+ *  @par   Description
+ *
+ *
+ *  @param[in,out]   ps_cabac
+ *  pointer to cabac context (handle)
+ *
+ *  @return      success or failure error code
+ *
+ ******************************************************************************
+ */
+IHEVCD_ERROR_T ihevcd_cabac_flush(cab_ctxt_t *ps_cabac)
+{
+    UWORD32 u4_ofst = ps_cabac->u4_ofst;
+    UWORD32 u4_bits_gen = ps_cabac->u4_bits_gen;
+
+    UWORD8 *pu1_strm_buf = ps_cabac->pu1_strm_buffer;
+    UWORD32 u4_strm_buf_offset = ps_cabac->u4_strm_buf_offset;
+    WORD32 zero_run = ps_cabac->i4_zero_bytes_run;
+    UWORD32 u4_out_standing_bytes = ps_cabac->u4_out_standing_bytes;
+
+    /************************************************************************/
+    /* Insert the carry (propagated in previous byte) along with            */
+    /* outstanding bytes (if any) and flush remaining bits                  */
+    /************************************************************************/
+    //TODO: Review this function
+    {
+        /* carry = 1 => putbit(1); carry propagated due to L renorm */
+        WORD32 carry = (u4_ofst >> (u4_bits_gen + CABAC_BITS)) & 0x1;
+        WORD32 last_byte;
+        WORD32 bits_left;
+        WORD32 rem_bits;
+
+        /*********************************************************************/
+        /* Bitstream overflow check                                          */
+        /* NOTE: corner case of epb bytes (max 2 for 32bit word) not handled */
+        /*********************************************************************/
+        if((u4_strm_buf_offset + u4_out_standing_bytes + 1)
+                        >= ps_cabac->u4_max_strm_size)
+        {
+            /* return without corrupting the buffer beyond its size */
+            return (IHEVCD_BITSTREAM_BUFFER_OVERFLOW);
+        }
+
+        if(carry)
+        {
+            /* previous byte carry add will not result in overflow to        */
+            /* u4_strm_buf_offset - 2 as we track 0xff as outstanding bytes  */
+            pu1_strm_buf[u4_strm_buf_offset - 1] += carry;
+            zero_run = 0;
+        }
+
+        /*        Insert outstanding bytes (if any)         */
+        while(u4_out_standing_bytes)
+        {
+            UWORD8 u1_0_or_ff = carry ? 0 : 0xFF;
+
+            PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, u1_0_or_ff, zero_run);
+
+            u4_out_standing_bytes--;
+        }
+
+        /*  clear the carry in low */
+        u4_ofst &= ((1 << (u4_bits_gen + CABAC_BITS)) - 1);
+
+        /* extract the remaining bits;                                   */
+        /* includes additional 2 msb bits of low as per Figure 9-12      */
+        bits_left = u4_bits_gen + 2;
+        rem_bits = (u4_ofst >> (u4_bits_gen + CABAC_BITS - bits_left));
+
+        if(bits_left >= 8)
+        {
+            last_byte = (rem_bits >> (bits_left - 8)) & 0xFF;
+            PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, last_byte, zero_run);
+            bits_left -= 8;
+        }
+
+        /* insert last byte along with rbsp stop bit(1) and 0's in the end */
+        last_byte = (rem_bits << (8 - bits_left)) | (1 << (bits_left - 1));
+        last_byte &= 0xFF;
+        PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, last_byte, zero_run);
+
+        /* update the state variables and return success */
+        ps_cabac->u4_strm_buf_offset = u4_strm_buf_offset;
+        ps_cabac->i4_zero_bytes_run = zero_run;
+        return (IHEVCD_SUCCESS);
+    }
+}
+#endif
diff --git a/decoder/ihevcd_cabac.h b/decoder/ihevcd_cabac.h
new file mode 100644
index 0000000..2c4a543
--- /dev/null
+++ b/decoder/ihevcd_cabac.h
@@ -0,0 +1,286 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+******************************************************************************
+* @file ihevcd_cabac.h
+*
+* @brief
+*  This file contains decoder cabac engine related structures and
+*  interface prototypes
+*
+* @author
+*  Ittiam
+******************************************************************************
+*/
+
+#ifndef _IHEVCD_CABAC_H_
+#define _IHEVCD_CABAC_H_
+
+#include "ihevc_typedefs.h"
+/*****************************************************************************/
+/* Constant Macros                                                           */
+/*****************************************************************************/
+/**
+*******************************************************************************
+@brief Number of bits in the CABAC offset register as per the spec
+*******************************************************************************
+ */
+#define CABAC_BITS  9
+
+/**
+ * Following definitions control whether cabac functions are inlined as macros or
+ * are called as functions. Set these to 0 to debug cabac leaf level functions
+ * Note these macros assume FULLRANGE is 1.
+ */
+#define CABAC_DECODE_BIN            1
+#define CABAC_DECODE_BYPASS_BIN     1
+#define CABAC_DECODE_BYPASS_BINS    1
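+
+/* For example, changing CABAC_DECODE_BIN above to 0 makes
+ * IHEVCD_CABAC_DECODE_BIN() below expand to a plain call to
+ * ihevcd_cabac_decode_bin(), which is easier to breakpoint and trace. */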
+
+/*****************************************************************************/
+/* Function Macros                                                           */
+/*****************************************************************************/
+#if CABAC_DECODE_BIN
+#define IHEVCD_CABAC_DECODE_BIN(u4_bin, ps_cabac, ps_bitstrm, ctxt_index)       \
+{                                                                               \
+    UWORD32 u4_range = ps_cabac->u4_range;                                      \
+    UWORD32 u4_ofst = ps_cabac->u4_ofst;                                        \
+    UWORD32 u4_rlps;                                                            \
+    UWORD8 *pu1_ctxt_model = &ps_cabac->au1_ctxt_models[ctxt_index];            \
+    WORD32 state_mps = *pu1_ctxt_model;                                         \
+    WORD32 clz;                                                                 \
+    UWORD32 u4_qnt_range;                                                       \
+                                                                                \
+    /* Sanity checks */                                                         \
+    ASSERT(FULLRANGE == 1);                                                     \
+    ASSERT(u4_range >= 256);                                                    \
+    ASSERT((ctxt_index >= 0) && (ctxt_index < IHEVC_CAB_CTXT_END));             \
+    ASSERT(state_mps < 128);                                                    \
+    clz = CLZ(u4_range);                                                        \
+    clz -= (32 - RANGE_NUMBITS);                                                \
+    u4_qnt_range = u4_range << clz;                                             \
+    u4_qnt_range = (u4_qnt_range >> (RANGE_SHIFT + 6)) & 0x3;                   \
+    /* Get the lps range from LUT based on quantized range and state */         \
+    u4_rlps = gau1_ihevc_cabac_rlps[state_mps >> 1][u4_qnt_range];              \
+    u4_rlps = u4_rlps << (RANGE_SHIFT - clz);                                   \
+    u4_range -= u4_rlps;                                                        \
+                                                                                \
+    u4_bin = state_mps & 1;                                                     \
+                                                                                \
+    if(u4_ofst >= u4_range)                                                     \
+    {                                                                           \
+        u4_bin = 1 - u4_bin;                                                    \
+        u4_ofst -= u4_range;                                                    \
+        u4_range = u4_rlps;                                                     \
+    }                                                                           \
+                                                                                \
+    *pu1_ctxt_model = gau1_ihevc_next_state[(state_mps << 1) | u4_bin];         \
+                                                                                \
+    /*****************************************************************/         \
+    /* Re-normalization; calculate bits generated based on range(R)  */         \
+    /*****************************************************************/         \
+    if(u4_range < (1 << 8))                                                     \
+    {                                                                           \
+        UWORD32 u4_bits;                                                        \
+        WORD32 numbits;                                                         \
+        numbits = CLZ(u4_range);                                                \
+        numbits -= (32 - RANGE_NUMBITS);                                        \
+        BITS_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,         \
+                 ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, numbits);    \
+                                                                                \
+        u4_ofst <<= numbits;                                                    \
+        u4_ofst |= u4_bits;                                                     \
+        u4_range <<= numbits;                                                   \
+                                                                                \
+    }                                                                           \
+    /* Update the cabac context */                                              \
+    ps_cabac->u4_range = u4_range;                                              \
+    ps_cabac->u4_ofst = u4_ofst;                                                \
+                                                                                \
+}
+#else
+#define IHEVCD_CABAC_DECODE_BIN(u4_bin, ps_cabac, ps_bitstrm, ctxt_index)       \
+        u4_bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_index);
+#endif
+
+#if CABAC_DECODE_BYPASS_BIN
+#define IHEVCD_CABAC_DECODE_BYPASS_BIN(u4_bin, ps_cabac, ps_bitstrm)            \
+{                                                                               \
+                                                                                \
+    UWORD32 u4_range = ps_cabac->u4_range;                                      \
+    UWORD32 u4_ofst = ps_cabac->u4_ofst;                                        \
+    UWORD32 u4_bits;                                                            \
+                                                                                \
+    /* Sanity checks */                                                         \
+    ASSERT(FULLRANGE == 1);                                                     \
+    ASSERT(u4_range >= 256);                                                    \
+                                                                                \
+    BIT_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,              \
+            ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word);                  \
+                                                                                \
+    u4_ofst <<= 1;                                                              \
+    u4_ofst |= u4_bits;                                                         \
+                                                                                \
+    u4_bin = 0;                                                                 \
+    if(u4_ofst >= u4_range)                                                     \
+    {                                                                           \
+        u4_bin = 1;                                                             \
+        u4_ofst -= u4_range;                                                    \
+    }                                                                           \
+                                                                                \
+    /* Update the cabac context */                                              \
+    ps_cabac->u4_ofst = u4_ofst;                                                \
+}
+#else
+
+#define IHEVCD_CABAC_DECODE_BYPASS_BIN(u4_bin, ps_cabac, ps_bitstrm)            \
+                u4_bin = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+#endif
+
+#if CABAC_DECODE_BYPASS_BINS
+#define IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_bins, ps_cabac, ps_bitstrm, numbins) \
+{                                                                               \
+    UWORD32 u4_range = ps_cabac->u4_range;                                      \
+    UWORD32 u4_ofst = ps_cabac->u4_ofst;                                        \
+    UWORD32 u4_bits;                                                            \
+    ASSERT(FULLRANGE == 1);                                                     \
+    ASSERT(u4_range >= 256);                                                    \
+    ASSERT(numbins > 0);                                                        \
+    {                                                                           \
+        WORD32 numbins_tmp = numbins;                                           \
+        /* Sanity checks */                                                     \
+        ASSERT(numbins < 17);                                                   \
+                                                                                \
+        u4_bins = 0;                                                            \
+                                                                                \
+        BITS_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,         \
+                    ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, numbins); \
+        do                                                                      \
+        {                                                                       \
+            UWORD32 u4_bit;                                                     \
+            numbins_tmp--;                                                      \
+            u4_bit = (u4_bits >> numbins_tmp) & 1;                              \
+            u4_ofst <<= 1;                                                      \
+            u4_ofst |= u4_bit;                                                  \
+                                                                                \
+            u4_bins <<= 1;                                                      \
+            if(u4_ofst >= u4_range)                                             \
+            {                                                                   \
+                u4_bins += 1;                                                   \
+                u4_ofst -= u4_range;                                            \
+            }                                                                   \
+        }while(numbins_tmp);                                                    \
+                                                                                \
+        /* Update the cabac context */                                          \
+        ps_cabac->u4_ofst = u4_ofst;                                            \
+    }                                                                           \
+}
+
+
+#else
+
+#define IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_bins, ps_cabac, ps_bitstrm, numbins) \
+      u4_bins = ihevcd_cabac_decode_bypass_bins(ps_cabac, ps_bitstrm, numbins);
+
+#endif
+/*****************************************************************************/
+/* Structures                                                                */
+/*****************************************************************************/
+
+
+
+/*****************************************************************************/
+/* Extern Function Declarations                                              */
+/*****************************************************************************/
+IHEVCD_ERROR_T    ihevcd_cabac_init
+(
+                cab_ctxt_t  *ps_cabac,
+                bitstrm_t   *ps_bitstrm,
+                WORD32      slice_qp,
+                WORD32      cabac_init_idc,
+                const UWORD8      *pu1_init_ctxt
+);
+
+
+
+UWORD32    ihevcd_cabac_decode_bin
+(
+                cab_ctxt_t  *ps_cabac,
+                bitstrm_t *ps_bitstrm,
+                WORD32      ctxt_index
+);
+
+UWORD32    ihevcd_cabac_decode_bypass_bin
+(
+                cab_ctxt_t  *ps_cabac,
+                bitstrm_t *ps_bitstrm
+);
+
+UWORD32    ihevcd_cabac_decode_terminate
+(
+                cab_ctxt_t  *ps_cabac,
+                bitstrm_t *ps_bitstrm
+);
+
+UWORD32    ihevcd_cabac_decode_bypass_bins
+(
+                cab_ctxt_t  *ps_cabac,
+                bitstrm_t *ps_bitstrm,
+                WORD32       num_bins
+);
+
+UWORD32    ihevcd_cabac_decode_bins_tunary
+(
+                cab_ctxt_t  *ps_cabac,
+                bitstrm_t *ps_bitstrm,
+                WORD32       c_max,
+                WORD32       ctxt_index,
+                WORD32       ctxt_shift,
+                WORD32       ctxt_inc_max
+
+);
+
+UWORD32    ihevcd_cabac_decode_bypass_bins_tunary
+(
+                cab_ctxt_t  *ps_cabac,
+                bitstrm_t *ps_bitstrm,
+                WORD32       c_max
+
+);
+
+UWORD32    ihevcd_cabac_decode_bypass_bins_egk
+(
+                cab_ctxt_t  *ps_cabac,
+                bitstrm_t *ps_bitstrm,
+                WORD32       k
+);
+
+UWORD32    ihevcd_cabac_decode_bypass_bins_trunc_rice
+(
+                cab_ctxt_t  *ps_cabac,
+                bitstrm_t *ps_bitstrm,
+                WORD32       c_rice_param,
+                WORD32       c_rice_max
+);
+
+IHEVCD_ERROR_T  ihevcd_cabac_flush(cab_ctxt_t  *ps_cabac);
+
+IHEVCD_ERROR_T ihevcd_cabac_reset(cab_ctxt_t *ps_cabac,
+                                  bitstrm_t *ps_bitstrm);
+
+#endif /* _IHEVCD_CABAC_H_ */
diff --git a/decoder/ihevcd_common_tables.c b/decoder/ihevcd_common_tables.c
new file mode 100644
index 0000000..1f6065b
--- /dev/null
+++ b/decoder/ihevcd_common_tables.c
@@ -0,0 +1,49 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_common_tables.c
+ *
+ * @brief
+ *  Contains common global tables for decoder
+ *
+ * @author
+ *  Naveen S R
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#include "ihevc_typedefs.h"
+#include "ihevcd_common_tables.h"
+#include "ihevc_defs.h"
+
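+/* Luma to chroma QP mapping (4:2:0) as per Table 8-10 of the HEVC spec:
+ * identity for qPi < 30, a compressed mapping for qPi in [30, 43] and
+ * (qPi - 6) for qPi > 43. For example, a luma qPi of 34 maps to chroma QP 33. */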
+WORD16 gai2_ihevcd_chroma_qp[] =
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+    17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29,
+    30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38,
+    39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 };
+
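+/* Candidate chroma intra prediction modes signalled by intra_chroma_pred_mode
+ * (planar, vertical, horizontal, DC); when a candidate equals the derived
+ * luma mode it is replaced by INTRA_ANGULAR(34) as per the spec. */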
+const UWORD8 gau1_intra_pred_chroma_modes[] =
+    { INTRA_PLANAR, INTRA_ANGULAR(26), INTRA_ANGULAR(10), INTRA_DC };
+
diff --git a/decoder/ihevcd_common_tables.h b/decoder/ihevcd_common_tables.h
new file mode 100644
index 0000000..61bc93f
--- /dev/null
+++ b/decoder/ihevcd_common_tables.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/*****************************************************************************/
+/*                                                                           */
+/*  File Name         : ihevcd_common_tables.h                               */
+/*                                                                           */
+/*  Description       : Common tables                                        */
+/*                                                                           */
+/*  List of Functions : None                                                 */
+/*                                                                           */
+/*  Issues / Problems : None                                                 */
+/*                                                                           */
+/*  Revision History  :                                                      */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
+/*         07 09 2012   Harish      Initial Version                          */
+/*****************************************************************************/
+
+#ifndef _IHEVCD_COMMON_TABLES_H_
+#define _IHEVCD_COMMON_TABLES_H_
+
+extern WORD16 gai2_ihevcd_chroma_qp[];
+
+extern const UWORD8 gau1_intra_pred_chroma_modes[];
+
+
+#endif /*_IHEVCD_COMMON_TABLES_H_*/
diff --git a/decoder/ihevcd_cxa.h b/decoder/ihevcd_cxa.h
new file mode 100644
index 0000000..be241c0
--- /dev/null
+++ b/decoder/ihevcd_cxa.h
@@ -0,0 +1,1098 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_cxa.h
+*
+* @brief
+*  This file contains all the necessary structure and enumeration
+*  definitions needed for the Application Program Interface (API) of the
+*  Ittiam HEVC decoder on Cortex Ax
+*
+* @author
+*  Harish
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef __IHEVCD_CXA_H__
+#define __IHEVCD_CXA_H__
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "iv.h"
+#include "ivd.h"
+
+
+/*****************************************************************************/
+/* Constant Macros                                                           */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Function Macros                                                           */
+/*****************************************************************************/
+#define IS_IVD_CONCEALMENT_APPLIED(x)       ((x) & (1 << IVD_APPLIEDCONCEALMENT))
+#define IS_IVD_INSUFFICIENTDATA_ERROR(x)    ((x) & (1 << IVD_INSUFFICIENTDATA))
+#define IS_IVD_CORRUPTEDDATA_ERROR(x)       ((x) & (1 << IVD_CORRUPTEDDATA))
+#define IS_IVD_CORRUPTEDHEADER_ERROR(x)     ((x) & (1 << IVD_CORRUPTEDHEADER))
+#define IS_IVD_UNSUPPORTEDINPUT_ERROR(x)    ((x) & (1 << IVD_UNSUPPORTEDINPUT))
+#define IS_IVD_UNSUPPORTEDPARAM_ERROR(x)    ((x) & (1 << IVD_UNSUPPORTEDPARAM))
+#define IS_IVD_FATAL_ERROR(x)               ((x) & (1 << IVD_FATALERROR))
+#define IS_IVD_INVALID_BITSTREAM_ERROR(x)   ((x) & (1 << IVD_INVALID_BITSTREAM))
+#define IS_IVD_INCOMPLETE_BITSTREAM_ERROR(x) ((x) & (1 << IVD_INCOMPLETE_BITSTREAM))
+
+
+/*****************************************************************************/
+/* API Function Prototype                                                    */
+/*****************************************************************************/
+IV_API_CALL_STATUS_T ihevcd_cxa_api_function(iv_obj_t *ps_handle,
+                                             void *pv_api_ip,
+                                             void *pv_api_op);
+
+/*****************************************************************************/
+/* Enums                                                                     */
+/*****************************************************************************/
+/* Codec Error codes for HEVC  Decoder                                       */
+
+
+typedef enum {
+    /**
+     *  No error
+     */
+    IHEVCD_SUCCESS = 0,
+
+    /**
+     *  Codec calls done without successful init
+     */
+    IHEVCD_INIT_NOT_DONE                        = IVD_DUMMY_ELEMENT_FOR_CODEC_EXTENSIONS,
+
+
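+    /**
+     *  Number of frame buffers is insufficient for decoding the video header
+     */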
+    IHEVCD_CXA_VID_HDR_DEC_NUM_FRM_BUF_NOT_SUFFICIENT,
+
+    /**
+     *  Unsupported level passed as an argument
+     */
+    IHEVCD_LEVEL_UNSUPPORTED,
+    /**
+     *  Unsupported number of reference pictures passed as an argument
+     */
+    IHEVCD_NUM_REF_UNSUPPORTED,
+    /**
+     *  Unsupported number of reorder pictures passed as an argument
+     */
+    IHEVCD_NUM_REORDER_UNSUPPORTED,
+    /**
+     *  Unsupported number of extra display pictures passed as an argument
+     */
+    IHEVCD_NUM_EXTRA_DISP_UNSUPPORTED,
+    /**
+     *  Invalid display stride requested.
+     */
+    IHEVCD_INVALID_DISP_STRD,
+
+    /**
+     * Reached end of sequence
+     */
+    IHEVCD_END_OF_SEQUENCE,
+
+    /**
+     * Width/height greater than max width and max height
+     */
+    IHEVCD_UNSUPPORTED_DIMENSIONS,
+
+    /**
+     *  Buffer size to hold version string is not sufficient
+     *  Allocate more to hold version string
+     */
+    IHEVCD_CXA_VERS_BUF_INSUFFICIENT,
+    /**
+     * Stream chroma format other than YUV420
+     */
+    IHEVCD_UNSUPPORTED_CHROMA_FMT_IDC,
+
+    /**
+     * Generic failure
+     */
+    IHEVCD_FAIL                             = 0x7FFFFFFF
+
+
+}IHEVCD_CXA_ERROR_CODES_T;
+
+/*****************************************************************************/
+/* Extended Structures                                                       */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/*  Get Number of Memory Records                                             */
+/*****************************************************************************/
+
+
+typedef struct {
+
+    /**
+     * ivd_num_mem_rec_ip_t
+     */
+    iv_num_mem_rec_ip_t                    s_ivd_num_mem_rec_ip_t;
+}ihevcd_cxa_num_mem_rec_ip_t;
+
+
+typedef struct {
+
+    /**
+     * ivd_num_mem_rec_op_t
+     */
+    iv_num_mem_rec_op_t                    s_ivd_num_mem_rec_op_t;
+}ihevcd_cxa_num_mem_rec_op_t;
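+
+/* Illustrative usage sketch (not part of the API): the first call an
+ * application typically makes is IV_CMD_GET_NUM_MEM_REC with a NULL handle.
+ * Field and command names below are assumed from iv.h; error handling is
+ * elided and this snippet is a sketch only. */
+#if 0
+static UWORD32 example_get_num_mem_recs(void)
+{
+    ihevcd_cxa_num_mem_rec_ip_t s_ip;
+    ihevcd_cxa_num_mem_rec_op_t s_op;
+
+    s_ip.s_ivd_num_mem_rec_ip_t.u4_size = sizeof(s_ip);
+    s_ip.s_ivd_num_mem_rec_ip_t.e_cmd = IV_CMD_GET_NUM_MEM_REC;
+    s_op.s_ivd_num_mem_rec_op_t.u4_size = sizeof(s_op);
+
+    /* Handle is NULL for commands issued before an instance exists */
+    ihevcd_cxa_api_function(NULL, (void *)&s_ip, (void *)&s_op);
+    return s_op.s_ivd_num_mem_rec_op_t.u4_num_mem_rec;
+}
+#endif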
+
+
+/*****************************************************************************/
+/*  Fill Memory Records                                                      */
+/*****************************************************************************/
+
+
+typedef struct {
+    /**
+     * ivd_fill_mem_rec_ip_t
+     */
+    iv_fill_mem_rec_ip_t                    s_ivd_fill_mem_rec_ip_t;
+
+    /**
+     * level
+     */
+    WORD32                                  i4_level;
+
+    /**
+     * num_reorder_frames
+     */
+    UWORD32                                 u4_num_reorder_frames;
+
+    /**
+     * num_ref_frames
+     */
+    UWORD32                                 u4_num_ref_frames;
+
+    /**
+     * share_disp_buf
+     */
+    UWORD32                                 u4_share_disp_buf;
+
+    /**
+     * format in which codec has to give out frame data for display
+     */
+    IV_COLOR_FORMAT_T                       e_output_format;
+
+    /**
+     * Number of extra display buffers that will be allocated to handle display pipeline depth
+     */
+    UWORD32                                 u4_num_extra_disp_buf;
+
+}ihevcd_cxa_fill_mem_rec_ip_t;
+
+
+typedef struct {
+
+    /**
+     * ivd_fill_mem_rec_op_t
+     */
+
+    iv_fill_mem_rec_op_t                   s_ivd_fill_mem_rec_op_t;
+
+}ihevcd_cxa_fill_mem_rec_op_t;
+
+/*****************************************************************************/
+/*  Retrieve Memory Records                                                  */
+/*****************************************************************************/
+
+
+typedef struct {
+
+    /**
+     * ivd_retrieve_mem_rec_ip_t
+     */
+    iv_retrieve_mem_rec_ip_t               s_ivd_retrieve_mem_rec_ip_t;
+}ihevcd_cxa_retrieve_mem_rec_ip_t;
+
+
+typedef struct {
+
+    /**
+     * ivd_retrieve_mem_rec_op_t
+     */
+    iv_retrieve_mem_rec_op_t               s_ivd_retrieve_mem_rec_op_t;
+}ihevcd_cxa_retrieve_mem_rec_op_t;
+
+
+/*****************************************************************************/
+/*   Initialize decoder                                                      */
+/*****************************************************************************/
+
+
+typedef struct {
+
+    /**
+     * ivd_init_ip_t
+     */
+    ivd_init_ip_t                           s_ivd_init_ip_t;
+
+    /**
+     * level
+     */
+    WORD32                                  i4_level;
+
+    /**
+     * num_reorder_frames
+     */
+    UWORD32                                 u4_num_reorder_frames;
+
+    /**
+     * num_ref_frames
+     */
+    UWORD32                                 u4_num_ref_frames;
+
+    /**
+     * share_disp_buf
+     */
+    UWORD32                                 u4_share_disp_buf;
+
+    /**
+     * Number of extra display buffers that will be allocated to handle display pipeline depth
+     */
+    UWORD32                                 u4_num_extra_disp_buf;
+}ihevcd_cxa_init_ip_t;
+
+
+typedef struct {
+
+    /**
+     * ivd_init_op_t
+     */
+    ivd_init_op_t                           s_ivd_init_op_t;
+}ihevcd_cxa_init_op_t;
+
+
+/*****************************************************************************/
+/*   Video Decode                                                            */
+/*****************************************************************************/
+
+
+typedef struct {
+
+    /**
+     * ivd_video_decode_ip_t
+     */
+    ivd_video_decode_ip_t                   s_ivd_video_decode_ip_t;
+}ihevcd_cxa_video_decode_ip_t;
+
+
+typedef struct {
+
+    /**
+     * ivd_video_decode_op_t
+     */
+    ivd_video_decode_op_t                   s_ivd_video_decode_op_t;
+}ihevcd_cxa_video_decode_op_t;
+
+
+/*****************************************************************************/
+/*   Get Display Frame                                                       */
+/*****************************************************************************/
+
+
+typedef struct
+{
+    /**
+     * ivd_get_display_frame_ip_t
+     */
+    ivd_get_display_frame_ip_t              s_ivd_get_display_frame_ip_t;
+}ihevcd_cxa_get_display_frame_ip_t;
+
+
+typedef struct
+{
+    /**
+     * ivd_get_display_frame_op_t
+     */
+    ivd_get_display_frame_op_t              s_ivd_get_display_frame_op_t;
+}ihevcd_cxa_get_display_frame_op_t;
+
+/*****************************************************************************/
+/*   Set Display Frame                                                       */
+/*****************************************************************************/
+
+
+typedef struct
+{
+    /**
+     * ivd_set_display_frame_ip_t
+     */
+    ivd_set_display_frame_ip_t              s_ivd_set_display_frame_ip_t;
+}ihevcd_cxa_set_display_frame_ip_t;
+
+
+typedef struct
+{
+    /**
+     * ivd_set_display_frame_op_t
+     */
+    ivd_set_display_frame_op_t              s_ivd_set_display_frame_op_t;
+}ihevcd_cxa_set_display_frame_op_t;
+
+/*****************************************************************************/
+/*   Release Display Buffers                                                 */
+/*****************************************************************************/
+
+
+typedef struct
+{
+    /**
+     * ivd_rel_display_frame_ip_t
+     */
+
+    ivd_rel_display_frame_ip_t                  s_ivd_rel_display_frame_ip_t;
+}ihevcd_cxa_rel_display_frame_ip_t;
+
+
+typedef struct
+{
+    /**
+     * ivd_rel_display_frame_op_t
+     */
+    ivd_rel_display_frame_op_t                  s_ivd_rel_display_frame_op_t;
+}ihevcd_cxa_rel_display_frame_op_t;
+
+
+typedef enum
+{
+    /** Set number of cores/threads to be used */
+    IHEVCD_CXA_CMD_CTL_SET_NUM_CORES         = IVD_CMD_CTL_CODEC_SUBCMD_START,
+
+    /** Set processor details */
+    IHEVCD_CXA_CMD_CTL_SET_PROCESSOR         = IVD_CMD_CTL_CODEC_SUBCMD_START + 0x001,
+
+    /** Get display buffer dimensions */
+    IHEVCD_CXA_CMD_CTL_GET_BUFFER_DIMENSIONS = IVD_CMD_CTL_CODEC_SUBCMD_START + 0x100,
+
+    /** Get VUI parameters */
+    IHEVCD_CXA_CMD_CTL_GET_VUI_PARAMS        = IVD_CMD_CTL_CODEC_SUBCMD_START + 0x101,
+
+    /** Enable/disable GPU, supported on select platforms */
+    IHEVCD_CXA_CMD_CTL_GPU_ENABLE_DISABLE    = IVD_CMD_CTL_CODEC_SUBCMD_START + 0x200,
+
+    /** Set degrade level */
+    IHEVCD_CXA_CMD_CTL_DEGRADE               = IVD_CMD_CTL_CODEC_SUBCMD_START + 0x300
+}IHEVCD_CXA_CMD_CTL_SUB_CMDS;
+/*****************************************************************************/
+/*   Video control  Flush                                                    */
+/*****************************************************************************/
+
+
+typedef struct {
+
+    /**
+     * ivd_ctl_flush_ip_t
+     */
+    ivd_ctl_flush_ip_t                      s_ivd_ctl_flush_ip_t;
+}ihevcd_cxa_ctl_flush_ip_t;
+
+
+typedef struct {
+
+    /**
+     * ivd_ctl_flush_op_t
+     */
+    ivd_ctl_flush_op_t                      s_ivd_ctl_flush_op_t;
+}ihevcd_cxa_ctl_flush_op_t;
+
+/*****************************************************************************/
+/*   Video control reset                                                     */
+/*****************************************************************************/
+
+
+typedef struct {
+
+    /**
+     * ivd_ctl_reset_ip_t
+     */
+    ivd_ctl_reset_ip_t                      s_ivd_ctl_reset_ip_t;
+}ihevcd_cxa_ctl_reset_ip_t;
+
+
+typedef struct {
+
+    /**
+     * ivd_ctl_reset_op_t
+     */
+    ivd_ctl_reset_op_t                      s_ivd_ctl_reset_op_t;
+}ihevcd_cxa_ctl_reset_op_t;
+
+
+/*****************************************************************************/
+/*   Video control  Set Params                                               */
+/*****************************************************************************/
+
+
+typedef struct {
+
+    /**
+     *  ivd_ctl_set_config_ip_t
+     */
+    ivd_ctl_set_config_ip_t             s_ivd_ctl_set_config_ip_t;
+}ihevcd_cxa_ctl_set_config_ip_t;
+
+
+typedef struct {
+
+    /**
+     * ivd_ctl_set_config_op_t
+     */
+    ivd_ctl_set_config_op_t             s_ivd_ctl_set_config_op_t;
+}ihevcd_cxa_ctl_set_config_op_t;
+
+/*****************************************************************************/
+/*   Video control:Get Buf Info                                              */
+/*****************************************************************************/
+
+
+typedef struct {
+
+    /**
+     * ivd_ctl_getbufinfo_ip_t
+     */
+    ivd_ctl_getbufinfo_ip_t             s_ivd_ctl_getbufinfo_ip_t;
+}ihevcd_cxa_ctl_getbufinfo_ip_t;
+
+
+
+typedef struct {
+
+    /**
+     * ivd_ctl_getbufinfo_op_t
+     */
+    ivd_ctl_getbufinfo_op_t             s_ivd_ctl_getbufinfo_op_t;
+}ihevcd_cxa_ctl_getbufinfo_op_t;
+
+
+/*****************************************************************************/
+/*   Video control:Getstatus Call                                            */
+/*****************************************************************************/
+
+
+typedef struct {
+
+    /**
+     * ivd_ctl_getstatus_ip_t
+     */
+    ivd_ctl_getstatus_ip_t                  s_ivd_ctl_getstatus_ip_t;
+}ihevcd_cxa_ctl_getstatus_ip_t;
+
+
+
+typedef struct {
+
+    /**
+     * ivd_ctl_getstatus_op_t
+     */
+    ivd_ctl_getstatus_op_t                  s_ivd_ctl_getstatus_op_t;
+}ihevcd_cxa_ctl_getstatus_op_t;
+
+
+/*****************************************************************************/
+/*   Video control:Get Version Info                                          */
+/*****************************************************************************/
+
+
+typedef struct {
+
+    /**
+     *  ivd_ctl_getversioninfo_ip_t
+     */
+    ivd_ctl_getversioninfo_ip_t         s_ivd_ctl_getversioninfo_ip_t;
+}ihevcd_cxa_ctl_getversioninfo_ip_t;
+
+
+
+typedef struct {
+
+    /**
+     *  ivd_ctl_getversioninfo_op_t
+     */
+    ivd_ctl_getversioninfo_op_t         s_ivd_ctl_getversioninfo_op_t;
+}ihevcd_cxa_ctl_getversioninfo_op_t;
+
+
+typedef struct {
+
+    /**
+     * u4_size
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * cmd
+     */
+    IVD_API_COMMAND_TYPE_T                      e_cmd;
+
+    /**
+     * sub_cmd
+     */
+    IVD_CONTROL_API_COMMAND_TYPE_T              e_sub_cmd;
+
+    /**
+     * Pictures that are degraded
+     * 0 : No degrade
+     * 1 : Only on non-reference frames
+     * 2 : Use interval specified by i4_nondegrade_interval
+     * 3 : All non-key frames
+     * 4 : All frames
+     */
+    WORD32                                     i4_degrade_pics;
+
+    /**
+     * Interval for pictures which are completely decoded without any degradation
+     */
+    WORD32                                     i4_nondegrade_interval;
+
+    /**
+     * Type of degradation, as a bitmask (lsb is bit 0):
+     * 0 : Disable SAO
+     * 1 : Disable deblocking
+     * 2 : Faster inter prediction filters
+     * 3 : Fastest inter prediction filters
+     */
+    WORD32                                     i4_degrade_type;
+
+}ihevcd_cxa_ctl_degrade_ip_t;
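+
+/* Example (a sketch of the intended semantics, as read from the field
+ * descriptions above): i4_degrade_pics = 2 with i4_nondegrade_interval = 4
+ * fully decodes one picture in every four and degrades the rest, while
+ * i4_degrade_type = 3 (bits 0 and 1 set) disables both SAO and deblocking
+ * on the degraded pictures. */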
+
+typedef struct
+{
+    /**
+     * u4_size
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * error_code
+     */
+    UWORD32                                     u4_error_code;
+}ihevcd_cxa_ctl_degrade_op_t;
+
+typedef struct
+{
+
+    /**
+     * size
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * cmd
+     */
+    IVD_API_COMMAND_TYPE_T                      e_cmd;
+
+    /**
+     * sub_cmd
+     */
+    IVD_CONTROL_API_COMMAND_TYPE_T              e_sub_cmd;
+
+    /**
+     * num_cores
+     */
+    UWORD32                                     u4_num_cores;
+}ihevcd_cxa_ctl_set_num_cores_ip_t;
+
+typedef struct
+{
+
+    /**
+     * size
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * error_code
+     */
+    UWORD32                                     u4_error_code;
+}ihevcd_cxa_ctl_set_num_cores_op_t;
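+
+/* Illustrative control call for IHEVCD_CXA_CMD_CTL_SET_NUM_CORES (a sketch:
+ * it assumes the ihevcd_cxa_api_function() entry point and the generic
+ * IVD_CMD_VIDEO_CTL command from ivd.h; handle creation is not shown):
+ *
+ *     ihevcd_cxa_ctl_set_num_cores_ip_t s_ip;
+ *     ihevcd_cxa_ctl_set_num_cores_op_t s_op;
+ *     s_ip.u4_size      = sizeof(s_ip);
+ *     s_ip.e_cmd        = IVD_CMD_VIDEO_CTL;
+ *     s_ip.e_sub_cmd    = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_SET_NUM_CORES;
+ *     s_ip.u4_num_cores = 2;
+ *     ihevcd_cxa_api_function(ps_codec_obj, (void *)&s_ip, (void *)&s_op);
+ */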
+
+typedef struct
+{
+    /**
+     * size
+     */
+    UWORD32                                     u4_size;
+    /**
+     * cmd
+     */
+    IVD_API_COMMAND_TYPE_T                      e_cmd;
+    /**
+     * sub cmd
+     */
+    IVD_CONTROL_API_COMMAND_TYPE_T              e_sub_cmd;
+    /**
+     * Processor type
+     */
+    UWORD32                                     u4_arch;
+    /**
+     * SOC type
+     */
+    UWORD32                                     u4_soc;
+
+    /**
+     * num_cores
+     */
+    UWORD32                                     u4_num_cores;
+
+}ihevcd_cxa_ctl_set_processor_ip_t;
+
+typedef struct
+{
+    /**
+     * size
+     */
+    UWORD32                                     u4_size;
+    /**
+     * error_code
+     */
+    UWORD32                                     u4_error_code;
+}ihevcd_cxa_ctl_set_processor_op_t;
+
+typedef struct
+{
+
+    /**
+     * size
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * cmd
+     */
+    IVD_API_COMMAND_TYPE_T                      e_cmd;
+
+    /**
+     * sub cmd
+     */
+    IVD_CONTROL_API_COMMAND_TYPE_T              e_sub_cmd;
+}ihevcd_cxa_ctl_get_frame_dimensions_ip_t;
+
+
+typedef struct {
+
+    /**
+     * size
+     */
+    UWORD32                                     u4_size;
+
+    /**
+     * error_code
+     */
+    UWORD32                                     u4_error_code;
+
+    /**
+     * x_offset[3]
+     */
+    UWORD32                                     u4_x_offset[3];
+
+    /**
+     * y_offset[3]
+     */
+    UWORD32                                     u4_y_offset[3];
+
+    /**
+     * disp_wd[3]
+     */
+    UWORD32                                     u4_disp_wd[3];
+
+    /**
+     * disp_ht[3]
+     */
+    UWORD32                                     u4_disp_ht[3];
+
+    /**
+     * buffer_wd[3]
+     */
+    UWORD32                                     u4_buffer_wd[3];
+
+    /**
+     * buffer_ht[3]
+     */
+    UWORD32                                     u4_buffer_ht[3];
+}ihevcd_cxa_ctl_get_frame_dimensions_op_t;
+
+typedef struct {
+    UWORD32                                     u4_size;
+    IVD_API_COMMAND_TYPE_T                      e_cmd;
+    IVD_CONTROL_API_COMMAND_TYPE_T              e_sub_cmd;
+}ihevcd_cxa_ctl_get_vui_params_ip_t;
+
+typedef struct {
+    UWORD32                                     u4_size;
+    UWORD32                                     u4_error_code;
+
+    /**
+    *  indicates the presence of aspect ratio information
+    */
+    UWORD8 u1_aspect_ratio_info_present_flag;
+
+    /**
+    *  specifies the aspect ratio of the luma samples
+    */
+    UWORD8 u1_aspect_ratio_idc;
+
+    /**
+    *  horizontal size of the sample aspect ratio (in arbitrary units)
+    */
+    UWORD16 u2_sar_width;
+
+    /**
+    *  vertical size of the sample aspect ratio (in arbitrary units)
+    */
+    UWORD16 u2_sar_height;
+
+    /**
+    * if 1, specifies that the overscan_appropriate_flag is present
+    * if 0, the preferred display method for the video signal is unspecified
+    */
+    UWORD8 u1_overscan_info_present_flag;
+
+    /**
+    * if 1,indicates that the cropped decoded pictures output
+    * are suitable for display using overscan
+    */
+    UWORD8 u1_overscan_appropriate_flag;
+
+    /**
+    * if 1, specifies that video_format, video_full_range_flag and
+    * colour_description_present_flag are present
+    */
+    UWORD8 u1_video_signal_type_present_flag;
+
+    /**
+    * indicates the representation of the pictures, as specified in Table E-2 of the HEVC spec
+    */
+    UWORD8 u1_video_format;
+
+    /**
+    * indicates the black level and range of the luma and chroma signals
+    */
+    UWORD8 u1_video_full_range_flag;
+
+    /**
+    * if 1, specifies that colour_primaries, transfer_characteristics
+    * and matrix_coefficients are present
+    */
+    UWORD8 u1_colour_description_present_flag;
+
+    /**
+    * indicates the chromaticity coordinates of the source primaries
+    */
+    UWORD8 u1_colour_primaries;
+
+    /**
+    * indicates the opto-electronic transfer characteristic of the source picture
+    */
+    UWORD8 u1_transfer_characteristics;
+
+    /**
+    * the matrix coefficients used in deriving luma and chroma signals
+    * from the green, blue, and red primaries
+    */
+    UWORD8 u1_matrix_coefficients;
+
+    /**
+    * if 1, specifies that chroma_sample_loc_type_top_field and
+    * chroma_sample_loc_type_bottom_field are present
+    */
+    UWORD8 u1_chroma_loc_info_present_flag;
+
+    /**
+    * location of chroma samples
+    */
+    UWORD8 u1_chroma_sample_loc_type_top_field;
+
+    UWORD8 u1_chroma_sample_loc_type_bottom_field;
+
+    /**
+    * if 1, indicates that the value of all decoded chroma samples is
+    * equal to 1 << ( BitDepthC - 1 )
+    */
+    UWORD8 u1_neutral_chroma_indication_flag;
+
+    /**
+    *  1 indicates that the coded video sequence conveys pictures that represent fields
+    *  0 indicates that the coded video sequence conveys pictures that represent frames
+    */
+    UWORD8 u1_field_seq_flag;
+
+    /**
+    * specifies that picture timing SEI messages are present for every picture
+    */
+    UWORD8 u1_frame_field_info_present_flag;
+
+    /**
+    * 1 indicates that the default display window parameters follow next in the VUI
+    */
+    UWORD8 u1_default_display_window_flag;
+
+    /**
+    * specify the samples of the pictures in the coded video sequence
+    * that are within the default display window,
+    * in terms of a rectangular region specified in picture coordinates for display
+    */
+    UWORD32 u4_def_disp_win_left_offset;
+
+    UWORD32 u4_def_disp_win_right_offset;
+
+    UWORD32 u4_def_disp_win_top_offset;
+
+    UWORD32 u4_def_disp_win_bottom_offset;
+
+    /**
+    *  if 1, specifies that the hrd_parameters syntax structure is present in the vui_parameters syntax structure
+    */
+    UWORD8 u1_vui_hrd_parameters_present_flag;
+
+    /**
+    *   Indicates the presence of the
+    *   num_units_in_tick and time_scale syntax elements
+    */
+    UWORD8 u1_vui_timing_info_present_flag;
+
+    /**
+    *   Number of units that
+    *   correspond to one increment of the
+    *   clock. Indicates the resolution
+    */
+    UWORD32 u4_vui_num_units_in_tick;
+
+    /**
+    *   The number of time units that pass in one second
+    */
+    UWORD32 u4_vui_time_scale;
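+    /* e.g. u4_vui_time_scale = 30000 with u4_vui_num_units_in_tick = 1001
+       corresponds to 30000 / 1001, i.e. ~29.97 pictures per second */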
+    /**
+    * if 1, indicates that the POC of each picture in the coded video sequence (cvs), other than the first picture in decoding order,
+    * is proportional to the output time of the picture relative to that of the first picture in the cvs
+    */
+    UWORD8 u1_poc_proportional_to_timing_flag;
+
+    /**
+    * num_ticks_poc_diff_one_minus1 plus 1 specifies the number of clock ticks
+    * corresponding to a difference of poc values equal to 1
+    */
+    UWORD8 u1_num_ticks_poc_diff_one_minus1;
+
+    /**
+    * if 1, specifies that the following cvs bitstream restriction parameters are present
+    */
+    UWORD8 u1_bitstream_restriction_flag;
+
+    /**
+    *  if 1, indicates that each pps that is active in the cvs has
+    *  the same value of the tile syntax elements
+    */
+    UWORD8 u1_tiles_fixed_structure_flag;
+
+    /**
+    * if 0, indicates that no pel outside the pic boundaries and
+    * no sub-pels derived using pels outside the pic boundaries are used for inter prediction
+    */
+    UWORD8 u1_motion_vectors_over_pic_boundaries_flag;
+
+    /**
+    * if 1, indicates that
+    * all P/B slices belonging to the same pic have an identical refpic list0, and
+    * all B slices that belong to the same picture have an identical refpic list1.
+    */
+    UWORD8 u1_restricted_ref_pic_lists_flag;
+
+    /**
+    * when not 0, establishes a bound on the maximum possible size of distinct coded spatial segmentation regions in the pictures of the cvs
+    */
+    UWORD8 u4_min_spatial_segmentation_idc;
+    /**
+    * Indicates a number of bytes not exceeded by the sum of the sizes of the VCL NAL units
+    * associated with any coded picture
+    */
+    UWORD8 u1_max_bytes_per_pic_denom;
+
+    /**
+    *  Indicates an upper bound for the number of bits of coding_unit() data
+    */
+    UWORD8 u1_max_bits_per_mincu_denom;
+
+    /**
+    * Indicates the maximum absolute value of a decoded horizontal MV component
+    * in quarter-pel luma units
+    */
+    UWORD8 u1_log2_max_mv_length_horizontal;
+
+    /**
+    * Indicates the maximum absolute value of a decoded vertical MV component
+    * in quarter-pel luma units
+    */
+    UWORD8 u1_log2_max_mv_length_vertical;
+
+    /**
+     * HRD parameters
+     */
+
+
+    /**
+    *   Indicates the presence of the
+    *   num_units_in_tick and time_scale syntax elements
+    */
+    UWORD8 u1_timing_info_present_flag;
+
+    /**
+    *   Number of units that
+    *   correspond to one increment of the
+    *   clock. Indicates the resolution
+    */
+    UWORD32 u4_num_units_in_tick;
+
+    /**
+    *   The number of time units that pass in one second
+    */
+    UWORD32 u4_time_scale;
+
+    /**
+    * Nal- hrd parameters flag
+    */
+    UWORD8 u1_nal_hrd_parameters_present_flag;
+
+    /**
+    * VCL- hrd parameters flag
+    */
+    UWORD8 u1_vcl_hrd_parameters_present_flag;
+
+    /**
+    * Indicates the presence of NAL-HRD params or VCL-HRD params
+    * in the bitstream
+    */
+    UWORD8 u1_cpbdpb_delays_present_flag;
+
+    /**
+    * specifies that sub-picture level CPB removal delay parameters are
+    * present in picture timing SEI messages
+    */
+    UWORD8 u1_sub_pic_cpb_params_present_flag;
+
+    /**
+    * specify the clock sub-tick
+    * (the minimum interval of time that can be represented in the coded data when sub_pic_cpb_params_present_flag is equal to 1)
+    */
+    UWORD8 u1_tick_divisor_minus2;
+
+    /**
+    * specifies the length, in bits, of the du cpb delay syntax in pt_sei
+    */
+    UWORD8 u1_du_cpb_removal_delay_increment_length_minus1;
+
+    /**
+    * Indicates presence of sub_pic_cpb_params in pic timing sei
+    */
+    UWORD8 u1_sub_pic_cpb_params_in_pic_timing_sei_flag;
+
+    /**
+     * specifies the length, in bits, of the dpb output delay syntax element for the du
+     */
+    UWORD8 u1_dpb_output_delay_du_length_minus1;
+
+    /**
+    * (together with bit_rate_value_minus1) specifies the
+    * maximum input bit rate of the i-th CPB
+    */
+    UWORD8 u4_bit_rate_scale;
+
+    /**
+    * (together with cpb_size_value_minus1) specifies
+    * CPB size of the i-th CPB when the CPB operates
+    * at the access unit level
+    */
+    UWORD8 u4_cpb_size_scale;
+
+    /**
+    * (together with cpb_size_du_value_minus1) specifies
+    * CPB size of the i-th CPB when the CPB operates
+    * at the sub-picture level
+    */
+    UWORD8 u4_cpb_size_du_scale;
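+
+    /* Per the HEVC spec, the scales derive the actual values as
+     * BitRate[i] = (bit_rate_value_minus1[i] + 1) << (6 + bit_rate_scale) and
+     * CpbSize[i] = (cpb_size_value_minus1[i] + 1) << (4 + cpb_size_scale) */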
+
+
+    /**
+    * specifies the length, in bits, of the initial cpb delay (nal/vcl) syntax in bp sei
+    */
+    UWORD8  u1_initial_cpb_removal_delay_length_minus1;
+
+    /**
+    * specifies the length, in bits, of the au cpb delay syntax in pt_sei
+    */
+    UWORD8  u1_au_cpb_removal_delay_length_minus1;
+
+    /**
+    * specifies the length, in bits, of the pic_dpb_output_delay syntax element in the pt SEI message
+    */
+    UWORD8  u1_dpb_output_delay_length_minus1;
+
+    /**
+    * if 1, for the highest temporal sub-layers, the temporal distance between the HRD output times
+    *  of consecutive pictures in output order is constrained; refer to Table E-6
+    */
+    UWORD8 au1_fixed_pic_rate_general_flag[6];
+
+    UWORD8 au1_fixed_pic_rate_within_cvs_flag[6];
+
+    /**
+    * if 1, for the highest temporal sub-layers, the temporal distance (in clock ticks) between the
+    * elemental units that specify HRD output times of consecutive pictures in output order is constrained;
+    * refer to Table E-6
+    */
+    UWORD8 au1_elemental_duration_in_tc_minus1[6];
+
+    /**
+    * specifies the HRD operational mode
+    */
+    UWORD8 au1_low_delay_hrd_flag[6];
+
+    /**
+    * plus 1 specifies the number of alternative CPB specifications in the
+    * bitstream of the cvs when HighestTid is equal to i
+    */
+    UWORD8 au1_cpb_cnt_minus1[6];
+}ihevcd_cxa_ctl_get_vui_params_op_t;
+
+#ifdef GPU_BUILD
+typedef struct {
+    UWORD32                                     u4_size;
+    IVD_API_COMMAND_TYPE_T                      e_cmd;
+    IVD_CONTROL_API_COMMAND_TYPE_T              e_sub_cmd;
+    UWORD32                                     u4_gpu_enable_diable; // 1 - Enable, 0 - Disable
+}ihevcd_cxa_ctl_gpu_enable_diable_ip_t;
+
+typedef struct {
+    UWORD32                                     u4_size;
+    UWORD32                                     u4_error_code;
+}ihevcd_cxa_ctl_gpu_enable_diable_op_t;
+#endif
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif /* __IHEVCD_CXA_H__ */
diff --git a/decoder/ihevcd_deblk.c b/decoder/ihevcd_deblk.c
new file mode 100644
index 0000000..652bf8c
--- /dev/null
+++ b/decoder/ihevcd_deblk.c
@@ -0,0 +1,864 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_deblk.c
+*
+* @brief
+*  Contains definition for the ctb level deblk function
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*   - ihevc_deblk()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_debug.h"
+
+#include "ihevc_deblk.h"
+#include "ihevc_deblk_tables.h"
+#include "ihevcd_profile.h"
+/**
+*******************************************************************************
+*
+* @brief
+*     Deblock CTB level function.
+*
+* @par Description:
+*     For a given CTB, deblocking on both vertical and
+*     horizontal edges is done. Both the luma and chroma
+*     blocks are processed
+*
+* @param[in] ps_deblk
+*  Pointer to the deblock context
+*
+* @returns
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevcd_deblk_ctb(deblk_ctxt_t *ps_deblk,
+                      WORD32 i4_is_last_ctb_x,
+                      WORD32 i4_is_last_ctb_y)
+{
+    WORD32 ctb_size;
+    WORD32 log2_ctb_size;
+    UWORD32 u4_bs;
+    WORD32 bs_tz; /* Trailing zeros in boundary strength */
+    WORD32 qp_p, qp_q;
+
+    WORD32 filter_p, filter_q;
+
+    UWORD8 *pu1_src;
+    WORD32 qp_strd;
+    UWORD32 *pu4_vert_bs, *pu4_horz_bs;
+    UWORD32 *pu4_ctb_vert_bs, *pu4_ctb_horz_bs;
+    WORD32 vert_bs_strd, horz_bs_strd;
+    WORD32 src_strd;
+    UWORD8 *pu1_qp;
+    UWORD16 *pu2_ctb_no_loop_filter_flag;
+    UWORD16 au2_ctb_no_loop_filter_flag[9];
+
+    WORD32 col, row;
+
+    /* Flag to indicate if QP is constant in CTB
+     * 0 - top_left, 1 - top, 2 - left, 3 - current */
+    UWORD32 u4_qp_const_in_ctb[4] = { 0, 0, 0, 0 };
+    WORD32 ctb_indx;
+    WORD32  chroma_yuv420sp_vu = ps_deblk->is_chroma_yuv420sp_vu;
+    sps_t *ps_sps;
+    pps_t *ps_pps;
+    codec_t *ps_codec;
+    slice_header_t *ps_slice_hdr;
+
+    PROFILE_DISABLE_DEBLK();
+
+    ps_sps = ps_deblk->ps_sps;
+    ps_pps = ps_deblk->ps_pps;
+    ps_codec = ps_deblk->ps_codec;
+    ps_slice_hdr = ps_deblk->ps_slice_hdr;
+
+    log2_ctb_size = ps_sps->i1_log2_ctb_size;
+    ctb_size = (1 << ps_sps->i1_log2_ctb_size);
+
+    /* strides are in units of number of bytes */
+    /* ctb_size * ctb_size / 8 / 16 is the number of bytes needed per CTB */
+    vert_bs_strd = ps_sps->i2_pic_wd_in_ctb << (2 * log2_ctb_size - 7);
+    horz_bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) << (2 * log2_ctb_size - 7);
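+    /* e.g. for a 64x64 CTB: 8 vertical edges x 16 BS values x 2 bits
+       = 256 bits = 32 bytes = 1 << (2 * 6 - 7) per CTB */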
+    pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_deblk->s_bs_ctxt.pu4_pic_vert_bs +
+                    (ps_deblk->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+                    ps_deblk->i4_ctb_y * vert_bs_strd);
+    pu4_ctb_vert_bs = pu4_vert_bs;
+
+    pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_deblk->s_bs_ctxt.pu4_pic_horz_bs +
+                    (ps_deblk->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+                    ps_deblk->i4_ctb_y * horz_bs_strd);
+    pu4_ctb_horz_bs = pu4_horz_bs;
+
+    qp_strd = ps_sps->i2_pic_wd_in_ctb << (log2_ctb_size - 3);
+    pu1_qp = ps_deblk->s_bs_ctxt.pu1_pic_qp + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * qp_strd) << (log2_ctb_size - 3));
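+    /* QP is stored at 8x8 granularity (one byte per 8x8 block), so qp_strd
+       is the picture width in units of 8 pixels */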
+
+    pu2_ctb_no_loop_filter_flag = ps_deblk->au2_ctb_no_loop_filter_flag;
+
+    ctb_indx = ps_deblk->i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_deblk->i4_ctb_y;
+    if(i4_is_last_ctb_y)
+    {
+        pu4_vert_bs = (UWORD32 *)((UWORD8 *)pu4_vert_bs + vert_bs_strd);
+        pu4_ctb_vert_bs = pu4_vert_bs;
+        /* ctb_size/8 is the number of edges per CTB
+         * ctb_size/4 is the number of BS values needed per edge
+         * divided by 8 for the number of bytes
+         * 2 is the number of bits needed for each BS value */
+        memset(pu4_vert_bs, 0, 1 << (2 * log2_ctb_size - 7));
+
+        pu1_qp += (qp_strd << (log2_ctb_size - 3));
+        pu2_ctb_no_loop_filter_flag += (ctb_size >> 3);
+        ctb_indx += ps_sps->i2_pic_wd_in_ctb;
+    }
+
+    if(i4_is_last_ctb_x)
+    {
+        pu4_horz_bs = (UWORD32 *)((UWORD8 *)pu4_horz_bs + (1 << (2 * log2_ctb_size - 7)));
+        pu4_ctb_horz_bs = pu4_horz_bs;
+        memset(pu4_horz_bs, 0, 1 << (2 * log2_ctb_size - 7));
+
+        pu1_qp += (ctb_size >> 3);
+
+        for(row = 0; row < (ctb_size >> 3) + 1; row++)
+            au2_ctb_no_loop_filter_flag[row] = ps_deblk->au2_ctb_no_loop_filter_flag[row] >> (ctb_size >> 3);
+        pu2_ctb_no_loop_filter_flag = au2_ctb_no_loop_filter_flag;
+        ctb_indx += 1;
+    }
+
+    u4_qp_const_in_ctb[3] = ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx) >> 3] & (1 << (ctb_indx & 7));
+
+    if(ps_deblk->i4_ctb_x || i4_is_last_ctb_x)
+    {
+        u4_qp_const_in_ctb[2] = ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx - 1) >> 3] & (1 << ((ctb_indx - 1) & 7));
+    }
+
+    if((ps_deblk->i4_ctb_x || i4_is_last_ctb_x) && (ps_deblk->i4_ctb_y || i4_is_last_ctb_y))
+    {
+        u4_qp_const_in_ctb[0] =
+                        ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx - ps_sps->i2_pic_wd_in_ctb - 1) >> 3] &
+                        (1 << ((ctb_indx - ps_sps->i2_pic_wd_in_ctb - 1) & 7));
+    }
+
+
+
+    if(ps_deblk->i4_ctb_y || i4_is_last_ctb_y)
+    {
+        u4_qp_const_in_ctb[1] =
+                        ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx - ps_sps->i2_pic_wd_in_ctb) >> 3] &
+                        (1 << ((ctb_indx - ps_sps->i2_pic_wd_in_ctb) & 7));
+    }
+
+    src_strd = ps_codec->i4_strd;
+
+    /* Luma Vertical Edge */
+
+    if(0 == i4_is_last_ctb_x)
+    {
+        /* Top CTB's slice header */
+        slice_header_t *ps_slice_hdr_top;
+#ifdef GPU_BUILD
+//TODO GPU : Later define it for ARM only version as well
+        {
+            WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+            if(i4_is_last_ctb_y)
+                cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb;
+            ps_slice_hdr_top = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb];
+        }
+#else
+        {
+            WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+            if(i4_is_last_ctb_y)
+                cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb;
+            ps_slice_hdr_top = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb];
+        }
+#endif
+
+        pu1_src = ps_deblk->pu1_cur_pic_luma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd) << (log2_ctb_size));
+        pu1_src += i4_is_last_ctb_y ? ps_deblk->ps_codec->i4_strd << log2_ctb_size : 0;
+
+        /** Deblocking is done on a shifted CTB -
+         *  Vertical edge processing is done by shifting the CTB up by four pixels */
+        pu1_src -= 4 * src_strd;
+
+        for(col = 0; col < ctb_size / 8; col++)
+        {
+            WORD32 shift = 0;
+
+            /* downshift vert_bs by ctb_size/2 for each column
+             *  shift = (col & ((MAX_CTB_SIZE >> log2_ctb_size) - 1)) << (log2_ctb_size - 1);
+             *  which will reduce to the following assuming ctb size is one of 16, 32 and 64
+             *  and deblocking is done on 8x8 grid
+             */
+            if(6 != log2_ctb_size)
+                shift = (col & 1) << (log2_ctb_size - 1);
+
+            /* BS for the column - Last row is excluded and the top row is included*/
+            u4_bs = (pu4_vert_bs[0] >> shift) << 2;
+
+            if(ps_deblk->i4_ctb_y || i4_is_last_ctb_y)
+            {
+                /* Picking the last BS of the previous CTB corresponding to the same column */
+                UWORD32 *pu4_vert_bs_top = (UWORD32 *)((UWORD8 *)pu4_vert_bs - vert_bs_strd);
+                UWORD32 u4_top_bs = (*pu4_vert_bs_top) >> (shift + (1 << (log2_ctb_size - 1)) - 2);
+                u4_bs |= u4_top_bs & 3;
+            }
+
+            for(row = 0; row < ctb_size / 4;)
+            {
+                WORD8 i1_beta_offset_div2 = ps_slice_hdr->i1_beta_offset_div2;
+                WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2;
+
+                /* Trailing zeros are computed and the corresponding rows are not processed */
+                bs_tz = CTZ(u4_bs) >> 1;
+                if(0 != bs_tz)
+                {
+                    u4_bs = u4_bs >> (bs_tz << 1);
+                    if((row + bs_tz) >= (ctb_size / 4))
+                        pu1_src += 4 * (ctb_size / 4 - row) * src_strd;
+                    else
+                        pu1_src += 4 * bs_tz  * src_strd;
+
+                    row += bs_tz;
+                    continue;
+                }
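+
+                /* e.g. if u4_bs = 0x40 (binary 0100 0000), CTZ returns 6 and
+                   bs_tz = 3: the next three 4-pel segments have BS 0 and are
+                   skipped by the block above */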
+
+                if(0 == row)
+                {
+                    i1_beta_offset_div2 = ps_slice_hdr_top->i1_beta_offset_div2;
+                    i1_tc_offset_div2 = ps_slice_hdr_top->i1_tc_offset_div2;
+
+                    if(0 == col)
+                    {
+                        qp_p = u4_qp_const_in_ctb[0] ?
+                                        pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] :
+                                        pu1_qp[-qp_strd - 1];
+                    }
+                    else
+                    {
+                        qp_p = u4_qp_const_in_ctb[1] ?
+                                        pu1_qp[-ctb_size / 8 * qp_strd] :
+                                        pu1_qp[col - 1 - qp_strd];
+                    }
+
+                    qp_q = u4_qp_const_in_ctb[1] ?
+                                    pu1_qp[-ctb_size / 8 * qp_strd] :
+                                    pu1_qp[col - qp_strd];
+                }
+                else
+                {
+                    if(0 == col)
+                    {
+                        qp_p = u4_qp_const_in_ctb[2] ?
+                                        pu1_qp[-ctb_size / 8] :
+                                        pu1_qp[((row - 1) >> 1) * qp_strd - 1];
+                    }
+                    else
+                    {
+                        qp_p = u4_qp_const_in_ctb[3] ?
+                                        pu1_qp[0] :
+                                        pu1_qp[((row - 1) >> 1) * qp_strd + col - 1];
+                    }
+
+                    qp_q = u4_qp_const_in_ctb[3] ?
+                                    pu1_qp[0] :
+                                    pu1_qp[((row - 1) >> 1) * qp_strd + col];
+                }
+
+                filter_p = (pu2_ctb_no_loop_filter_flag[(row + 1) >> 1] >> col) & 1;
+                filter_q = (pu2_ctb_no_loop_filter_flag[(row + 1) >> 1] >> col) & 2;
+                /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */
+                filter_p = !filter_p;
+                filter_q = !filter_q;
+
+                if(filter_p || filter_q)
+                {
+#if DEBUG_DEBLK_LEAF_LEVEL
+                    {
+                        DUMP_DEBLK_LUMA_VERT(pu1_src, src_strd,
+                                             u4_bs & 3, qp_p, qp_q,
+                                             ps_slice_hdr->i1_beta_offset_div2,
+                                             ps_slice_hdr->i1_tc_offset_div2,
+                                             filter_p, filter_q);
+                    }
+#endif
+                    ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr(pu1_src, src_strd,
+                                                                         u4_bs & 3, qp_p, qp_q,
+                                                                         i1_beta_offset_div2,
+                                                                         i1_tc_offset_div2,
+                                                                         filter_p, filter_q);
+                }
+
+                pu1_src += 4 * src_strd;
+                u4_bs = u4_bs >> 2;
+                row++;
+            }
+
+            if((64 == ctb_size) ||
+                            ((32 == ctb_size) && (col & 1)))
+            {
+                pu4_vert_bs++;
+            }
+            pu1_src -= (src_strd << log2_ctb_size);
+            pu1_src += 8;
+        }
+        pu4_vert_bs = pu4_ctb_vert_bs;
+    }
+
+
+    /* Luma Horizontal Edge */
+
+    if(0 == i4_is_last_ctb_y)
+    {
+
+        /* Left CTB's slice header */
+        slice_header_t *ps_slice_hdr_left;
+#ifdef GPU_BUILD
+//TODO GPU : Later define it for ARM only version as well
+        {
+            WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+            if(i4_is_last_ctb_x)
+                cur_ctb_indx += 1;
+            ps_slice_hdr_left = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1];
+        }
+#else
+        {
+            WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+            if(i4_is_last_ctb_x)
+                cur_ctb_indx += 1;
+            ps_slice_hdr_left = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1];
+        }
+#endif
+        pu1_src = ps_deblk->pu1_cur_pic_luma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd) << log2_ctb_size);
+        pu1_src += i4_is_last_ctb_x ? ctb_size : 0;
+
+        /** Deblocking is done on a shifted CTB -
+         *  Horizontal edge processing is done by shifting the CTB left by four pixels */
+        pu1_src -= 4;
+        for(row = 0; row < ctb_size / 8; row++)
+        {
+            WORD32 shift = 0;
+
+            /* downshift horz_bs by ctb_size/2 for each row
+             *  shift = (row & (MAX_CTB_SIZE / ctb_size - 1)) * ctb_size / 2;
+             *  which will reduce to the following assuming ctb size is one of 16, 32 and 64
+             *  and deblocking is done on 8x8 grid
+             */
+            if(6 != log2_ctb_size)
+                shift = (row & 1) << (log2_ctb_size - 1);
+
+            /* BS for the row - Last column is excluded and the left column is included*/
+            u4_bs = (pu4_horz_bs[0] >> shift) << 2;
+
+            if(ps_deblk->i4_ctb_x || i4_is_last_ctb_x)
+            {
+                /** Picking the last BS of the previous CTB corresponding to the same row
+                * UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
+                */
+                UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (1 << (2 * log2_ctb_size - 7)));
+                UWORD32 u4_left_bs = (*pu4_horz_bs_left) >> (shift + (1 << (log2_ctb_size - 1)) - 2);
+                u4_bs |= u4_left_bs & 3;
+            }
+
+            for(col = 0; col < ctb_size / 4;)
+            {
+                WORD8 i1_beta_offset_div2 = ps_slice_hdr->i1_beta_offset_div2;
+                WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2;
+
+                bs_tz = CTZ(u4_bs) >> 1;
+                if(0 != bs_tz)
+                {
+                    u4_bs = u4_bs >> (bs_tz << 1);
+
+                    if((col + bs_tz) >= (ctb_size / 4))
+                        pu1_src += 4 * (ctb_size / 4 - col);
+                    else
+                        pu1_src += 4 * bs_tz;
+
+                    col += bs_tz;
+                    continue;
+                }
+
+                if(0 == col)
+                {
+                    i1_beta_offset_div2 = ps_slice_hdr_left->i1_beta_offset_div2;
+                    i1_tc_offset_div2 = ps_slice_hdr_left->i1_tc_offset_div2;
+
+                    if(0 == row)
+                    {
+                        qp_p = u4_qp_const_in_ctb[0] ?
+                                        pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] :
+                                        pu1_qp[-qp_strd - 1];
+                    }
+                    else
+                    {
+                        qp_p = u4_qp_const_in_ctb[2] ?
+                                        pu1_qp[-ctb_size / 8] :
+                                        pu1_qp[(row - 1) * qp_strd - 1];
+                    }
+
+                    qp_q = u4_qp_const_in_ctb[2] ?
+                                    pu1_qp[-ctb_size / 8] :
+                                    pu1_qp[row * qp_strd - 1];
+                }
+                else
+                {
+                    if(0 == row)
+                    {
+                        qp_p = u4_qp_const_in_ctb[1] ?
+                                        pu1_qp[-ctb_size / 8 * qp_strd] :
+                                        pu1_qp[((col - 1) >> 1) - qp_strd];
+                    }
+                    else
+                    {
+                        qp_p = u4_qp_const_in_ctb[3] ?
+                                        pu1_qp[0] :
+                                        pu1_qp[((col - 1) >> 1) + (row - 1) * qp_strd];
+                    }
+
+                    qp_q = u4_qp_const_in_ctb[3] ?
+                                    pu1_qp[0] :
+                                    pu1_qp[((col - 1) >> 1) + row * qp_strd];
+                }
+
+                filter_p = (pu2_ctb_no_loop_filter_flag[row] >> ((col + 1) >> 1)) & 1;
+                filter_q = (pu2_ctb_no_loop_filter_flag[row + 1] >> ((col + 1) >> 1)) & 1;
+                /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */
+                filter_p = !filter_p;
+                filter_q = !filter_q;
+
+                if(filter_p || filter_q)
+                {
+#if DEBUG_DEBLK_LEAF_LEVEL
+                    {
+                        DUMP_DEBLK_LUMA_HORZ(pu1_src, src_strd,
+                                             u4_bs & 3, qp_p, qp_q,
+                                             ps_slice_hdr->i1_beta_offset_div2,
+                                             ps_slice_hdr->i1_tc_offset_div2,
+                                             filter_p, filter_q);
+                    }
+#endif
+                    ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr(pu1_src, src_strd,
+                                                                         u4_bs & 3, qp_p, qp_q,
+                                                                         i1_beta_offset_div2,
+                                                                         i1_tc_offset_div2, filter_p, filter_q);
+                }
+
+                pu1_src += 4;
+                u4_bs = u4_bs >> 2;
+                col++;
+            }
+
+            if((64 == ctb_size) ||
+                            ((32 == ctb_size) && (row & 1)))
+            {
+                pu4_horz_bs++;
+            }
+            pu1_src -= ctb_size;
+            pu1_src += (src_strd << 3);
+        }
+        pu4_horz_bs = pu4_ctb_horz_bs;
+    }
+
+
+    /* Chroma Vertical Edge */
+
+    if(0 == i4_is_last_ctb_x)
+    {
+
+        /* Top CTB's slice header */
+        slice_header_t *ps_slice_hdr_top;
+#ifdef GPU_BUILD
+//TODO GPU : Later define it for ARM only version as well
+        {
+            WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+            if(i4_is_last_ctb_y)
+                cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb;
+            ps_slice_hdr_top = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb];
+        }
+#else
+        {
+            WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+            if(i4_is_last_ctb_y)
+                cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb;
+            ps_slice_hdr_top = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb];
+        }
+#endif
+
+        pu1_src = ps_deblk->pu1_cur_pic_chroma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd / 2) << log2_ctb_size);
+        pu1_src += i4_is_last_ctb_y ? (ps_deblk->ps_codec->i4_strd / 2) << log2_ctb_size : 0;
+
+        /** Deblocking is done on a shifted CTB -
+         *  Vertical edge processing is done by shifting the CTB up by four pixels */
+        pu1_src -= 4 * src_strd;
+
+        for(col = 0; col < ctb_size / 16; col++)
+        {
+
+            /* BS for the column - Last row is excluded and the top row is included*/
+            u4_bs = pu4_vert_bs[0] << 2;
+
+            if(ps_deblk->i4_ctb_y || i4_is_last_ctb_y)
+            {
+                /* Picking the last BS of the previous CTB corresponding to the same column */
+                UWORD32 *pu4_vert_bs_top = (UWORD32 *)((UWORD8 *)pu4_vert_bs - vert_bs_strd);
+                UWORD32 u4_top_bs = (*pu4_vert_bs_top) >> ((1 << (log2_ctb_size - 1)) - 2);
+                u4_bs |= u4_top_bs & 3;
+            }
+
+            /* Every alternate boundary strength value is used for chroma */
+            u4_bs &= 0x22222222;
+
+            for(row = 0; row < ctb_size / 8;)
+            {
+                WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2;
+
+                bs_tz = CTZ(u4_bs) >> 2;
+                if(0 != bs_tz)
+                {
+                    if((row + bs_tz) >= (ctb_size / 8))
+                        pu1_src += 4 * (ctb_size / 8 - row) * src_strd;
+                    else
+                        pu1_src += 4 * bs_tz  * src_strd;
+                    row += bs_tz;
+                    u4_bs = u4_bs >> (bs_tz << 2);
+                    continue;
+                }
+
+                if(0 == row)
+                {
+                    i1_tc_offset_div2 = ps_slice_hdr_top->i1_tc_offset_div2;
+
+                    if(0 == col)
+                    {
+                        qp_p = u4_qp_const_in_ctb[0] ?
+                                        pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] :
+                                        pu1_qp[-qp_strd - 1];
+                    }
+                    else
+                    {
+                        qp_p = u4_qp_const_in_ctb[1] ?
+                                        pu1_qp[-ctb_size / 8 * qp_strd] :
+                                        pu1_qp[2 * col - 1 - qp_strd];
+                    }
+
+                    qp_q = u4_qp_const_in_ctb[1] ?
+                                    pu1_qp[-ctb_size / 8 * qp_strd] :
+                                    pu1_qp[2 * col - qp_strd];
+                }
+                else
+                {
+                    if(0 == col)
+                    {
+                        qp_p = u4_qp_const_in_ctb[2] ?
+                                        pu1_qp[-ctb_size / 8] :
+                                        pu1_qp[(row - 1) * qp_strd - 1];
+                    }
+                    else
+                    {
+                        qp_p = u4_qp_const_in_ctb[3] ?
+                                        pu1_qp[0] :
+                                        pu1_qp[(row - 1) * qp_strd + 2 * col - 1];
+                    }
+
+                    qp_q = u4_qp_const_in_ctb[3] ?
+                                    pu1_qp[0] :
+                                    pu1_qp[(row - 1) * qp_strd + 2 * col];
+                }
+
+                filter_p = (pu2_ctb_no_loop_filter_flag[row] >> (col << 1)) & 1;
+                filter_q = (pu2_ctb_no_loop_filter_flag[row] >> (col << 1)) & 2;
+                /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */
+                filter_p = !filter_p;
+                filter_q = !filter_q;
+
+                if(filter_p || filter_q)
+                {
+                    ASSERT(1 == ((u4_bs & 3) >> 1));
+#if DEBUG_DEBLK_LEAF_LEVEL
+                    {
+                        DUMP_DEBLK_CHROMA_VERT(pu1_src, src_strd,
+                                               u4_bs & 3, qp_p, qp_q,
+                                               ps_pps->i1_pic_cb_qp_offset,
+                                               ps_pps->i1_pic_cr_qp_offset,
+                                               ps_slice_hdr->i1_tc_offset_div2,
+                                               filter_p, filter_q);
+                    }
+#endif
+                    if(chroma_yuv420sp_vu)
+                    {
+                        ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr(pu1_src,
+                                                                               src_strd,
+                                                                               qp_q,
+                                                                               qp_p,
+                                                                               ps_pps->i1_pic_cr_qp_offset,
+                                                                               ps_pps->i1_pic_cb_qp_offset,
+                                                                               i1_tc_offset_div2,
+                                                                               filter_q,
+                                                                               filter_p);
+                    }
+                    else
+                    {
+                        ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr(pu1_src,
+                                                                               src_strd,
+                                                                               qp_p,
+                                                                               qp_q,
+                                                                               ps_pps->i1_pic_cb_qp_offset,
+                                                                               ps_pps->i1_pic_cr_qp_offset,
+                                                                               i1_tc_offset_div2,
+                                                                               filter_p,
+                                                                               filter_q);
+                    }
+                }
+
+                pu1_src += 4 * src_strd;
+                u4_bs = u4_bs >> 4;
+                row++;
+            }
+
+            pu4_vert_bs += (64 == ctb_size) ? 2 : 1;
+            pu1_src -= ((src_strd / 2) << log2_ctb_size);
+            pu1_src += 16;
+        }
+    }
+
+    /* Chroma Horizontal Edge */
+
+    if(0 == i4_is_last_ctb_y)
+    {
+
+        /* Left CTB's slice header */
+        slice_header_t *ps_slice_hdr_left;
+#ifdef GPU_BUILD
+//TODO GPU : Later define it for ARM only version as well
+        {
+            WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+            if(i4_is_last_ctb_x)
+                cur_ctb_indx += 1;
+            ps_slice_hdr_left = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1];
+        }
+#else
+        {
+            WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+            if(i4_is_last_ctb_x)
+                cur_ctb_indx += 1;
+            ps_slice_hdr_left = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1];
+        }
+#endif
+
+        pu1_src = ps_deblk->pu1_cur_pic_chroma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd / 2) << log2_ctb_size);
+        pu1_src += i4_is_last_ctb_x ? ctb_size : 0;
+
+        /** Deblocking is done on a shifted CTB -
+         * Horizontal edge processing is done by shifting the CTB left by four pixels (8 bytes here because UV samples are interleaved) */
+        pu1_src -= 8;
+        for(row = 0; row < ctb_size / 16; row++)
+        {
+            /* BS for the row - Last column is excluded and the left column is included*/
+            u4_bs = pu4_horz_bs[0] << 2;
+
+            if(ps_deblk->i4_ctb_x || i4_is_last_ctb_x)
+            {
+                /** Picking the last BS of the previous CTB corresponding to the same row
+                * UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
+                */
+                UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (1 << (2 * log2_ctb_size - 7)));
+                UWORD32 u4_left_bs = (*pu4_horz_bs_left) >> ((1 << (log2_ctb_size - 1)) - 2);
+                u4_bs |= u4_left_bs & 3;
+            }
+
+            /* Every alternate boundary strength value is used for chroma */
+            u4_bs &= 0x22222222;
+
+            for(col = 0; col < ctb_size / 8;)
+            {
+                WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2;
+
+                bs_tz = CTZ(u4_bs) >> 2;
+                if(0 != bs_tz)
+                {
+                    u4_bs = u4_bs >> (bs_tz << 2);
+
+                    if((col + bs_tz) >= (ctb_size / 8))
+                        pu1_src += 8 * (ctb_size / 8 - col);
+                    else
+                        pu1_src += 8 * bs_tz;
+
+                    col += bs_tz;
+                    continue;
+                }
+
+                if(0 == col)
+                {
+                    i1_tc_offset_div2 = ps_slice_hdr_left->i1_tc_offset_div2;
+
+                    if(0 == row)
+                    {
+                        qp_p = u4_qp_const_in_ctb[0] ?
+                                        pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] :
+                                        pu1_qp[-qp_strd - 1];
+                    }
+                    else
+                    {
+                        qp_p = u4_qp_const_in_ctb[2] ?
+                                        pu1_qp[-ctb_size / 8] :
+                                        pu1_qp[(2 * row - 1) * qp_strd - 1];
+                    }
+
+                    qp_q = u4_qp_const_in_ctb[2] ?
+                                    pu1_qp[-ctb_size / 8] :
+                                    pu1_qp[(2 * row) * qp_strd - 1];
+                }
+                else
+                {
+                    if(0 == row)
+                    {
+                        qp_p = u4_qp_const_in_ctb[1] ?
+                                        pu1_qp[-ctb_size / 8 * qp_strd] :
+                                        pu1_qp[col - 1 - qp_strd];
+                    }
+                    else
+                    {
+                        qp_p = u4_qp_const_in_ctb[3] ?
+                                        pu1_qp[0] :
+                                        pu1_qp[(col - 1) +  (2 * row - 1) * qp_strd];
+                    }
+
+                    qp_q = u4_qp_const_in_ctb[3] ?
+                                    pu1_qp[0] :
+                                    pu1_qp[(col - 1) + 2 * row * qp_strd];
+                }
+
+                filter_p = (pu2_ctb_no_loop_filter_flag[row << 1] >> col) & 1;
+                filter_q = (pu2_ctb_no_loop_filter_flag[(row << 1) + 1] >> col) & 1;
+                /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */
+                filter_p = !filter_p;
+                filter_q = !filter_q;
+
+                if(filter_p || filter_q)
+                {
+                    ASSERT(1 == ((u4_bs & 3) >> 1));
+#if DEBUG_DEBLK_LEAF_LEVEL
+                    {
+                        DUMP_DEBLK_CHROMA_HORZ(pu1_src, src_strd,
+                                               u4_bs & 3, qp_p, qp_q,
+                                               ps_pps->i1_pic_cb_qp_offset,
+                                               ps_pps->i1_pic_cr_qp_offset,
+                                               ps_slice_hdr->i1_tc_offset_div2,
+                                               filter_p, filter_q);
+                    }
+#endif
+                    if(chroma_yuv420sp_vu)
+                    {
+                        ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr(pu1_src,
+                                                                               src_strd,
+                                                                               qp_q,
+                                                                               qp_p,
+                                                                               ps_pps->i1_pic_cr_qp_offset,
+                                                                               ps_pps->i1_pic_cb_qp_offset,
+                                                                               i1_tc_offset_div2,
+                                                                               filter_q,
+                                                                               filter_p);
+                    }
+                    else
+                    {
+                        ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr(pu1_src,
+                                                                               src_strd,
+                                                                               qp_p,
+                                                                               qp_q,
+                                                                               ps_pps->i1_pic_cb_qp_offset,
+                                                                               ps_pps->i1_pic_cr_qp_offset,
+                                                                               i1_tc_offset_div2,
+                                                                               filter_p,
+                                                                               filter_q);
+                    }
+                }
+
+                pu1_src += 8;
+                u4_bs = u4_bs >> 4;
+                col++;
+            }
+
+            pu4_horz_bs += (64 == ctb_size) ? 2 : 1;
+            pu1_src -= ctb_size;
+            pu1_src += 8 * src_strd;
+
+        }
+    }
+}
diff --git a/decoder/ihevcd_deblk.h b/decoder/ihevcd_deblk.h
new file mode 100644
index 0000000..1c9f7c8
--- /dev/null
+++ b/decoder/ihevcd_deblk.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_deblk.h
+*
+* @brief
+*  Declarations for the ctb level deblocking function
+*
+* @author
+*  Srinivas T
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVCD_DEBLK_H_
+#define _IHEVCD_DEBLK_H_
+
+void ihevcd_deblk_ctb(deblk_ctxt_t *ps_deblk,
+                      WORD32 i4_is_last_ctb_x,
+                      WORD32 i4_is_last_ctb_y);
+
+
+#endif /*_IHEVCD_DEBLK_H_*/
diff --git a/decoder/ihevcd_debug.c b/decoder/ihevcd_debug.c
new file mode 100644
index 0000000..8e6a79f
--- /dev/null
+++ b/decoder/ihevcd_debug.c
@@ -0,0 +1,1090 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_debug.c
+*
+* @brief
+*  Functions used for codec debugging
+*
+* @author
+*  Ittiam
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_common_tables.h"
+#include "ihevc_error.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_debug.h"
+
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#if DEBUG_CODE
+
+void ihevcd_debug_dump_mv_map(codec_t *ps_codec)
+{
+
+    process_ctxt_t *ps_proc;
+    sps_t *ps_sps = ps_codec->s_parse.ps_sps;
+    WORD32 num_minpu_in_ctb, ctb_size, x, y, cur_pu_idx, cur_ctb_idx, pu_idx_start_ctb;
+    UWORD8 *pu1_pic_pu_map_ctb;
+    pu_t *ps_pu;
+    WORD32 i;
+    FILE *fp_mv_map, *fp_pu_idx_map, *fp_pu, *fp_mv_print, *fp_mv_print_1;
+    char l0_mvx[50], l0_mvy[50], l1_mvx[50], l1_mvy[50];
+    UWORD32 *pu4_pu_done, num_pu_done = 0, is_pu_done;
+
+    pu4_pu_done = malloc(MAX_HT * MAX_WD / 4 / 4 * sizeof(UWORD32));
+    if(NULL == pu4_pu_done)
+        return;
+
+    ctb_size = (1 << ps_sps->i1_log2_ctb_size);
+
+    num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+
+    ps_pu = &ps_codec->s_parse.ps_pic_pu[0];
+    fp_mv_map = fopen("d:\\dump\\mv_map.txt", "a");
+    fp_mv_print = fopen("d:\\dump\\mv_print.txt", "a");
+    fp_mv_print_1 = fopen("d:\\dump\\mv_print_1.txt", "a");
+    if((NULL == fp_mv_map) || (NULL == fp_mv_print) || (NULL == fp_mv_print_1))
+    {
+        printf("\n Couldn't open mv dump files");
+    }
+    else
+    {
+#if 0
+        fp_pu_idx_map = fopen("d:\\dump\\pu_idx_map.txt", "ab");
+        fp_pu = fopen("d:\\dump\\pu.txt", "ab");
+        {
+            WORD32 last_ctb_idx, last_pu_idx;
+            last_ctb_idx = ps_sps->i2_pic_ht_in_ctb * ps_sps->i2_pic_wd_in_ctb * num_minpu_in_ctb;
+            fwrite(ps_codec->s_parse.pu1_pic_pu_map,last_ctb_idx,sizeof(UWORD32),fp_pu_idx_map );
+            fwrite(ps_codec->s_parse.pu4_pic_pu_idx,last_ctb_idx * num_minpu_in_ctb, sizeof(UWORD8),fp_pu_idx_map );
+
+            last_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx[last_ctb_idx];
+            fwrite(ps_codec->s_parse.ps_pic_pu,last_pu_idx , sizeof(pu_t),fp_pu );
+        }
+#endif
+        fprintf(fp_mv_map, "\nPOC=%d\n", ps_codec->ps_slice_hdr_base[0].i4_abs_pic_order_cnt);
+
+        {
+            WORD32 last_ctb_idx, last_ctb_pu_idx, last_pu_idx;
+            last_ctb_idx = ps_sps->i2_pic_ht_in_ctb * ps_sps->i2_pic_wd_in_ctb;
+            last_ctb_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx[last_ctb_idx];
+
+            pu1_pic_pu_map_ctb = ps_codec->s_parse.pu1_pic_pu_map
+                            + last_ctb_idx * num_minpu_in_ctb;
+
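+            /* pu1_pic_pu_map stores, for every 4x4 (MIN_PU_SIZE) block of a CTB,
+             * the offset of the PU covering it; adding the CTB's first PU index
+             * (from pu4_pic_pu_idx) gives the picture-level index of the PU
+             * covering the last 4x4 block of the last CTB. */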
+            last_pu_idx  = last_ctb_pu_idx + pu1_pic_pu_map_ctb[(((ps_sps->i2_pic_wd_in_ctb * ctb_size - 1) & (ctb_size - 1)) >> 2) + ((((ps_sps->i2_pic_ht_in_ctb * ctb_size - 1) & (ctb_size - 1))) >> 2) * (ctb_size >> 2)];
+
+            for(i = 0; i < last_pu_idx; i++)
+            {
+                ps_pu = &ps_codec->s_parse.ps_pic_pu[i];
+
+                fprintf(fp_mv_print_1, "\n-----------------------");
+
+                fprintf(fp_mv_print_1, "\n pu_x = %d, pu_y = %d",
+                        (ps_pu->b4_pos_x << 2), (ps_pu->b4_pos_y << 2));
+                fprintf(fp_mv_print_1, "\n pu_wd = %d, pu_ht = %d", ((ps_pu->b4_wd + 1) << 2), ((ps_pu->b4_ht + 1) << 2));
+                if(ps_pu->b2_pred_mode == PRED_L0)
+                    fprintf(fp_mv_print_1, "\n Pred = 0,Ref_idx = %d, MV l0 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.s_l0_mv.i2_mvx,
+                            ps_pu->mv.s_l0_mv.i2_mvy);
+                else if(ps_pu->b2_pred_mode == PRED_L1)
+                    fprintf(fp_mv_print_1, "\n Pred = 1,Ref_idx = %d,  MV l1 = %4d %4d", ps_pu->mv.i1_l1_ref_idx, ps_pu->mv.s_l1_mv.i2_mvx,
+                            ps_pu->mv.s_l1_mv.i2_mvy);
+                else
+                    fprintf(fp_mv_print_1, "\n Pred = 2,Ref_idx = %d,Ref_idx = %d, MV l0 = %4d %4d, MV l1 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.i1_l1_ref_idx,
+                            ps_pu->mv.s_l0_mv.i2_mvx, ps_pu->mv.s_l0_mv.i2_mvy,
+                            ps_pu->mv.s_l1_mv.i2_mvx, ps_pu->mv.s_l1_mv.i2_mvy);
+            }
+        }
+        for(y = 0; y < (ps_sps->i2_pic_height_in_luma_samples / MIN_PU_SIZE); y++)
+        {
+            for(x = 0; x < (ps_sps->i2_pic_width_in_luma_samples / MIN_PU_SIZE); x++)
+            {
+                cur_ctb_idx = (x * MIN_PU_SIZE / ctb_size) + (y * MIN_PU_SIZE / ctb_size) * ps_sps->i2_pic_wd_in_ctb;
+                pu_idx_start_ctb = ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+
+                pu1_pic_pu_map_ctb = ps_codec->s_parse.pu1_pic_pu_map
+                                + cur_ctb_idx * num_minpu_in_ctb;
+
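+                /* x and y are in MIN_PU_SIZE units; masking the pixel position
+                 * by (ctb_size - 1) locates the 4x4 block inside its CTB before
+                 * indexing the per-CTB PU map. */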
+                cur_pu_idx  = pu_idx_start_ctb + pu1_pic_pu_map_ctb[(((x * 4) & (ctb_size - 1)) >> 2) + ((((y * 4) & (ctb_size - 1))) >> 2) * (ctb_size >> 2)];
+
+                ps_pu = &ps_codec->s_parse.ps_pic_pu[cur_pu_idx];
+
+                is_pu_done = 0;
+                for(i = 0; i < num_pu_done; i++)
+                {
+                    if(pu4_pu_done[num_pu_done - i - 1] == cur_pu_idx)
+                    {
+                        is_pu_done = 1;
+                        break;
+                    }
+                }
+
+                if(is_pu_done)
+                {
+                    fprintf(fp_mv_map, ",");
+                }
+                else
+                {
+                    sprintf(l0_mvx, "%d", ps_pu->mv.s_l0_mv.i2_mvx);
+                    sprintf(l0_mvy, "%d", ps_pu->mv.s_l0_mv.i2_mvy);
+                    sprintf(l1_mvx, "%d", ps_pu->mv.s_l1_mv.i2_mvx);
+                    sprintf(l1_mvy, "%d", ps_pu->mv.s_l1_mv.i2_mvy);
+                    fprintf(fp_mv_map, "(%s:%s)(%s:%s),", l0_mvx, l0_mvy, l1_mvx, l1_mvy);
+
+                    fprintf(fp_mv_print, "\n-----------------------");
+
+/*
+                    printf("\n CTB X = %d, Y = %d",
+                           (x*MIN_PU_SIZE / ctb_size), (y*MIN_PU_SIZE / ctb_size));
+*/
+
+                    fprintf(fp_mv_print, "\n pu_x = %d, pu_y = %d",
+                            (ps_pu->b4_pos_x << 2), (ps_pu->b4_pos_y << 2));
+                    fprintf(fp_mv_print, "\n pu_wd = %d, pu_ht = %d", ((ps_pu->b4_wd + 1) << 2), ((ps_pu->b4_ht + 1) << 2));
+                    if(ps_pu->b2_pred_mode == PRED_L0)
+                        fprintf(fp_mv_print, "\n Pred = 0,Ref_idx = %d, MV l0 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.s_l0_mv.i2_mvx,
+                                ps_pu->mv.s_l0_mv.i2_mvy);
+                    else if(ps_pu->b2_pred_mode == PRED_L1)
+                        fprintf(fp_mv_print, "\n Pred = 1,Ref_idx = %d,  MV l1 = %4d %4d", ps_pu->mv.i1_l1_ref_idx, ps_pu->mv.s_l1_mv.i2_mvx,
+                                ps_pu->mv.s_l1_mv.i2_mvy);
+                    else
+                        fprintf(fp_mv_print, "\n Pred = 2,Ref_idx = %d,Ref_idx = %d, MV l0 = %4d %4d, MV l1 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.i1_l1_ref_idx,
+                                ps_pu->mv.s_l0_mv.i2_mvx, ps_pu->mv.s_l0_mv.i2_mvy,
+                                ps_pu->mv.s_l1_mv.i2_mvx, ps_pu->mv.s_l1_mv.i2_mvy);
+
+                    pu4_pu_done[num_pu_done] = cur_pu_idx;
+                    num_pu_done++;
+                }
+            }
+            fprintf(fp_mv_map, "\n");
+        }
+    }
+    if(fp_mv_map)
+        fclose(fp_mv_map);
+    if(fp_mv_print)
+        fclose(fp_mv_print);
+    if(fp_mv_print_1)
+        fclose(fp_mv_print_1);
+//            fclose(fp_pu_idx_map);
+//            fclose(fp_pu);
+    free(pu4_pu_done);
+}
+
+void ihevcd_debug_assert(WORD32 x)
+{
+    if(!x)
+    {
+        printf("Assert failed.. Exiting \n");
+        exit(-1);
+    }
+}
+
+void ihevcd_debug_dump_pic_buffers(codec_t *ps_codec)
+{
+    FILE *fp_pic, *fp_pic_b;
+    sps_t *ps_sps = ps_codec->s_parse.ps_sps;
+    static WORD32 file_open = 0;
+    WORD32 vert_bs_size, horz_bs_size;
+    WORD32 qp_size;
+    WORD32 qp_const_flag_size;
+    WORD32 loop_filter_size;
+    WORD32 loop_filter_buffer;
+    WORD32 pic_intra_flag_size;
+
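+    /* Max Number of vertical edges - extra MAX_CTB_SIZE / 8 for the shifted CTB processing */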
+    vert_bs_size = ps_codec->i4_max_wd / 8 + MAX_CTB_SIZE / 8;
+
+    /* Max Number of horizontal edges - extra MAX_CTB_SIZE / 8 to handle the last 4 rows separately(shifted CTB processing) */
+    vert_bs_size *= (ps_codec->i4_max_ht + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+    /* Number of bytes */
+    vert_bs_size /= 8;
+
+    /* Two bits per edge */
+    vert_bs_size *= 2;
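+
+    /* Worked example (assuming MAX_CTB_SIZE = 64 and MIN_TU_SIZE = 4): for a
+     * 1920x1088 stream, vert_bs_size = (240 + 8) * ((1088 + 64) / 4) = 71424
+     * edges, which at two bits per edge packs into 17856 bytes. */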
+
+    /* Max Number of horizontal edges */
+    horz_bs_size = ps_codec->i4_max_ht / 8 + MAX_CTB_SIZE / 8;
+
+    /* Max Number of vertical edges - extra MAX_CTB_SIZE / 8 to handle the last 4 columns separately(shifted CTB processing) */
+    horz_bs_size *= (ps_codec->i4_max_wd + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+    /* Number of bytes */
+    horz_bs_size /= 8;
+
+    /* Two bits per edge */
+    horz_bs_size *= 2;
+
+    qp_size = (ps_codec->i4_max_ht * ps_codec->i4_max_wd) / (MIN_CU_SIZE * MIN_CU_SIZE);
+
+    /* Max CTBs in a row */
+    qp_const_flag_size = ps_codec->i4_max_wd / MIN_CTB_SIZE;
+
+    /* Max CTBs in a column */
+    qp_const_flag_size *= ps_codec->i4_max_ht / MIN_CTB_SIZE;
+
+    /* Number of bytes */
+    qp_const_flag_size /= 8;
+
+    loop_filter_size = ((ps_codec->i4_max_wd  + 64) / MIN_CU_SIZE) * ((ps_codec->i4_max_ht + 64) / MIN_CU_SIZE) / 8;
+
+    loop_filter_buffer = (ps_codec->i4_max_wd + 63) >> 6;
+    loop_filter_buffer += 1;
+
+    loop_filter_size -= loop_filter_buffer;
+
+    pic_intra_flag_size = (ps_codec->i4_max_wd / MIN_CU_SIZE) * (ps_codec->i4_max_ht / MIN_CU_SIZE) / 8;
+
+    if(0 == file_open)
+    {
+        fp_pic = fopen("D:\\dump\\pic_dump.txt", "w");
+        fp_pic_b = fopen("D:\\dump\\pic_dump_b.txt", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        fp_pic = fopen("D:\\dump\\pic_dump.txt", "a");
+        fp_pic_b = fopen("D:\\dump\\pic_dump_b.txt", "ab");
+    }
+
+    {
+        WORD32 i, j;
+
+        fwrite(ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp, 1, qp_size, fp_pic_b);
+
+        fprintf(fp_pic, " Frame num :%d \n", ps_codec->u4_pic_cnt);
+
+        for(i = 0; i < ps_codec->i4_max_ht / MIN_CU_SIZE; i++)
+        {
+            for(j = 0; j < ps_codec->i4_max_wd / MIN_CU_SIZE; j++)
+            {
+                UWORD8 u1_qp;
+                WORD32 qp_strd;
+                qp_strd = ps_codec->i4_max_wd / MIN_CU_SIZE;
+                u1_qp = ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp[j + i * qp_strd];
+                fprintf(fp_pic, "%d \t", u1_qp);
+            }
+            fprintf(fp_pic, "\n");
+        }
+    }
+/*
+    fwrite(ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs, 1, vert_bs_size, fp_pic);
+    fwrite(ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs, 1, horz_bs_size, fp_pic);
+    fwrite(ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp_const_in_ctb, 1, qp_const_flag_size, fp_pic);
+    fwrite(ps_codec->s_parse.s_deblk_ctxt.pu1_pic_no_loop_filter_flag, 1, loop_filter_size, fp_pic);
+    fwrite(ps_codec->s_parse.pu1_pic_intra_flag, 1, pic_intra_flag_size, fp_pic);
+*/
+
+    //fwrite(au1_pic_avail_ctb_flags, 1, ps_sps->i2_pic_wd_in_ctb * ps_sps->i2_pic_ht_in_ctb, fp_pic);
+    //fwrite(au4_pic_ctb_slice_xy, 4, ps_sps->i2_pic_wd_in_ctb * ps_sps->i2_pic_ht_in_ctb, fp_pic);
+
+    fclose(fp_pic);
+    fclose(fp_pic_b);
+
+}
+
+
+void ihevcd_debug_dump_pic_pu(codec_t *ps_codec)
+{
+    FILE *fp_pic_pu;
+    FILE *fp_pic_pu_idx;
+    static WORD32 file_open = 0;
+    WORD32 num_pu_in_frame;
+    sps_t *ps_sps;
+
+    if(0 == file_open)
+    {
+        fp_pic_pu = fopen("D:\\dump\\pic_pu.txt", "wb");
+        fp_pic_pu_idx = fopen("D:\\dump\\pic_pu_idx.txt", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        /* Only the first frame's PU info is dumped */
+        return;
+    }
+    ps_sps = ps_codec->s_parse.ps_sps;
+    num_pu_in_frame = ps_codec->s_parse.pu4_pic_pu_idx[ps_sps->i4_pic_size_in_ctb];
+
+    fwrite(ps_codec->s_parse.ps_pic_pu, sizeof(pu_t), num_pu_in_frame, fp_pic_pu);
+    fwrite(ps_codec->s_parse.pu4_pic_pu_idx, sizeof(UWORD32), ps_sps->i4_pic_size_in_ctb + 1, fp_pic_pu_idx);
+
+    fclose(fp_pic_pu);
+    fclose(fp_pic_pu_idx);
+
+}
+
+
+void ihevcd_debug_init_tmp_buf(UWORD8 *pu1_buf_luma, UWORD8 *pu1_buf_chroma)
+{
+    memset(pu1_buf_luma, 0, 4 * MAX_CTB_SIZE * MAX_CTB_SIZE * sizeof(UWORD8));
+    memset(pu1_buf_chroma, 0, 4 * MAX_CTB_SIZE * MAX_CTB_SIZE * sizeof(UWORD8));
+}
+
+void ihevcd_debug_process_tmp_buf(UWORD8 *pu1_buf_luma, UWORD8 *pu1_buf_chroma)
+{
+    WORD32 row, col;
+    UWORD8 *pu1_tmp_buf_luma;
+    UWORD8 *pu1_tmp_buf_chroma;
+
+    FILE *fp_luma, *fp_chroma;
+
+    pu1_tmp_buf_luma = (UWORD8 *)calloc(4 * MAX_CTB_SIZE * MAX_CTB_SIZE, 1);
+    pu1_tmp_buf_chroma = (UWORD8 *)calloc(4 * MAX_CTB_SIZE * MAX_CTB_SIZE, 1);
+
+    for(row = 0; row < 2 * MAX_CTB_SIZE; row++)
+    {
+        for(col = 0; col < 2 * MAX_CTB_SIZE; col++)
+        {
+            if(0 != pu1_buf_luma[row * 2 * MAX_CTB_SIZE + col])
+                pu1_tmp_buf_luma[row * 2 * MAX_CTB_SIZE + col] = 0xFF;
+            if(0 != pu1_buf_chroma[row * 2 * MAX_CTB_SIZE + col])
+                pu1_tmp_buf_chroma[row * 2 * MAX_CTB_SIZE + col] = 0xFF;
+        }
+    }
+
+    fp_luma = fopen("D:\\dump\\win_sao_tmp_buf_luma.yuv", "wb");
+    fp_chroma = fopen("D:\\dump\\win_sao_tmp_buf_chroma.yuv", "wb");
+
+    fwrite(pu1_tmp_buf_luma, 4 * MAX_CTB_SIZE * MAX_CTB_SIZE, 1, fp_luma);
+    fwrite(pu1_tmp_buf_chroma, 4 * MAX_CTB_SIZE * MAX_CTB_SIZE, 1, fp_chroma);
+
+    fclose(fp_luma);
+    fclose(fp_chroma);
+
+    free(pu1_tmp_buf_luma);
+    free(pu1_tmp_buf_chroma);
+}
+
+void ihevcd_debug_print_struct_sizes(void)
+{
+    printf("sizeof(tu_t) %d\n", (WORD32)sizeof(tu_t));
+    printf("sizeof(pu_t) %d\n", (WORD32)sizeof(pu_t));
+    printf("sizeof(pu_mv_t) %d\n", (WORD32)sizeof(pu_mv_t));
+    printf("sizeof(vps_t) %d\n", (WORD32)sizeof(vps_t));
+    printf("sizeof(sps_t) %d\n", (WORD32)sizeof(sps_t));
+    printf("sizeof(pps_t) %d\n", (WORD32)sizeof(pps_t));
+    printf("sizeof(slice_header_t) %d\n", (WORD32)sizeof(slice_header_t));
+
+    printf("sizeof(codec_t) %d\n", (WORD32)sizeof(codec_t));
+    printf("sizeof(parse_ctxt_t) %d\n", (WORD32)sizeof(parse_ctxt_t));
+    printf("sizeof(process_ctxt_t) %d\n", (WORD32)sizeof(process_ctxt_t));
+    printf("sizeof(cab_ctxt_t) %d\n", (WORD32)sizeof(cab_ctxt_t));
+    return;
+}
+
+void ihevcd_debug_dump_pic(UWORD8 *pu1_cur_pic_luma,
+                           UWORD8 *pu1_cur_pic_chroma,
+                           WORD32 pic_wd,
+                           WORD32 pic_ht,
+                           WORD32 pic_strd)
+{
+    FILE *fp;
+    static WORD32 file_open = 0;
+    WORD32 row;
+
+    if(file_open == 0)
+    {
+        fp = fopen("D:\\dump\\win_pre_ilf_dec_order.yuv", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        fp = fopen("D:\\dump\\win_pre_ilf_dec_order.yuv", "ab");
+    }
+
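+    /* Frames are appended in decode order: the full luma plane first, then
+     * the interleaved 420SP chroma plane at half the luma height. */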
+    for(row = 0; row < pic_ht; row++)
+    {
+        fwrite(pu1_cur_pic_luma, sizeof(UWORD8), pic_wd, fp);
+        pu1_cur_pic_luma += pic_strd;
+    }
+    for(row = 0; row < pic_ht / 2; row++)
+    {
+        fwrite(pu1_cur_pic_chroma, sizeof(UWORD8), pic_wd, fp);
+        pu1_cur_pic_chroma += pic_strd;
+    }
+
+    fclose(fp);
+}
+
+void ihevcd_debug_dump_bs(UWORD32 *pu4_pic_vert_bs,
+                          UWORD32 *pu4_pic_horz_bs,
+                          WORD32 vert_size_in_bytes,
+                          WORD32 horz_size_in_bytes)
+{
+    FILE *fp_vert, *fp_horz;
+    static WORD32 files_open = 0;
+
+    if(files_open == 0)
+    {
+        fp_vert = fopen("D:\\dump\\win_vert_bs_dec_order.txt", "wb");
+        fp_horz = fopen("D:\\dump\\win_horz_bs_dec_order.txt", "wb");
+        files_open = 1;
+    }
+    else
+    {
+        fp_vert = fopen("D:\\dump\\win_vert_bs_dec_order.txt", "ab");
+        fp_horz = fopen("D:\\dump\\win_horz_bs_dec_order.txt", "ab");
+    }
+
+    fwrite(pu4_pic_vert_bs, sizeof(UWORD8), vert_size_in_bytes, fp_vert);
+    fwrite(pu4_pic_horz_bs, sizeof(UWORD8), horz_size_in_bytes, fp_horz);
+
+    fclose(fp_vert);
+    fclose(fp_horz);
+}
+
+void ihevcd_debug_dump_qp(UWORD8 *pu1_qp, WORD32 size_in_bytes)
+{
+    FILE *fp;
+    static WORD32 file_open = 0;
+
+    if(file_open == 0)
+    {
+        fp = fopen("D:\\dump\\win_qp_dec_order.txt", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        fp = fopen("D:\\dump\\win_qp_dec_order.txt", "ab");
+    }
+
+    fwrite(pu1_qp, sizeof(UWORD8), size_in_bytes, fp);
+
+    fclose(fp);
+}
+
+void ihevcs_dump_qp_const_in_ctb(UWORD8 *pu1_qp_const_in_ctb, WORD32 size_in_bytes)
+{
+    FILE *fp;
+    static WORD32 file_open = 0;
+
+    if(file_open == 0)
+    {
+        fp = fopen("D:\\dump\\win_qp_const_ctb_dec_order.txt", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        fp = fopen("D:\\dump\\win_qp_const_ctb_dec_order.txt", "ab");
+    }
+
+    fwrite(pu1_qp_const_in_ctb, sizeof(UWORD8), size_in_bytes, fp);
+
+    fclose(fp);
+}
+
+
+void ihevcd_debug_dump_no_loop_filter(UWORD8 *pu1_pic_no_loop_filter, WORD32 size_in_bytes)
+{
+    FILE *fp;
+    static WORD32 file_open = 0;
+
+    if(file_open == 0)
+    {
+        fp = fopen("D:\\dump\\win_no_loop_filter_dec_order.txt", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        fp = fopen("D:\\dump\\win_no_loop_filter_dec_order.txt", "ab");
+    }
+
+    fwrite(pu1_pic_no_loop_filter, sizeof(UWORD8), size_in_bytes, fp);
+
+    fclose(fp);
+}
+
+void ihevcd_debug_dump_offsets(WORD32 beta_offset_div_2, WORD32 tc_offset_div_2, WORD32 qp_offset_u, WORD32 qp_offset_v)
+{
+    FILE *fp;
+    static WORD32 file_open = 0;
+
+    if(file_open == 0)
+    {
+        fp = fopen("D:\\dump\\win_offsets.txt", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        fp = fopen("D:\\dump\\win_offsets.txt", "ab");
+    }
+
+    fwrite(&beta_offset_div_2, sizeof(WORD32), 1, fp);
+    fwrite(&tc_offset_div_2, sizeof(WORD32), 1, fp);
+    fwrite(&qp_offset_u, sizeof(WORD32), 1, fp);
+    fwrite(&qp_offset_v, sizeof(WORD32), 1, fp);
+
+    fclose(fp);
+
+}
+
+/* Debugging POC values */
+void ihevcd_debug_print_ref_list_pocs(WORD32 i4_pic_order_cnt_val,
+                                      slice_header_t *ps_slice_hdr,
+                                      dpb_mgr_t *ps_dpb_mgr,
+                                      UWORD32 u4_num_st_curr_before,
+                                      UWORD32 u4_num_st_curr_after,
+                                      UWORD32 u4_num_st_foll,
+                                      UWORD32 u4_num_lt_curr,
+                                      UWORD32 u4_num_lt_foll,
+                                      WORD32 *pi4_poc_st_curr_before,
+                                      WORD32 *pi4_poc_st_curr_after,
+                                      WORD32 *pi4_poc_st_foll,
+                                      WORD32 *pi4_poc_lt_curr,
+                                      WORD32 *pi4_poc_lt_foll)
+{
+    WORD32 i, j;
+    pic_buf_t *ps_pic_buf;
+    printf("\n------------------------\nCurrent POC: %d\n", i4_pic_order_cnt_val);
+    printf("\nPOCs present in Reference List L0:\n");
+    for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+    {
+        ps_pic_buf = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list0[i].pv_pic_buf));
+        printf("POC: %d\n", ps_pic_buf->i4_abs_poc);
+        printf("Longterm Reference = %d\n", ps_pic_buf->u1_used_as_ref);
+    }
+
+    if(ps_slice_hdr->i1_slice_type  == BSLICE)
+    {
+        printf("\nPOCs present in Reference List L1:\n");
+        for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+        {
+            ps_pic_buf = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list1[i].pv_pic_buf));
+            printf("POC: %d\n", ps_pic_buf->i4_abs_poc);
+            printf("POC LSB: %d\n", ps_pic_buf->i4_poc_lsb);
+            printf("Longterm Reference = %d\n", ps_pic_buf->u1_used_as_ref);
+        }
+    }
+
+    printf("\nPOCs that are to be released from DPB:\n");
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        if(ps_dpb_mgr->as_dpb_info[i].ps_pic_buf)
+        {
+            WORD32 poc_found = 0;
+            ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+
+            for(j = 0; j < u4_num_st_curr_before && 0 == poc_found; j++)
+            {
+                if(ps_pic_buf->i4_abs_poc == pi4_poc_st_curr_before[j])
+                {
+                    poc_found++;
+                    break;
+                }
+            }
+            for(j = 0; j < u4_num_st_curr_after && 0 == poc_found; j++)
+            {
+                if(ps_pic_buf->i4_abs_poc == pi4_poc_st_curr_after[j])
+                {
+                    poc_found++;
+                    break;
+                }
+            }
+            for(j = 0; j < u4_num_st_foll && 0 == poc_found; j++)
+            {
+                if(ps_pic_buf->i4_abs_poc == pi4_poc_st_foll[j])
+                {
+                    poc_found++;
+                    break;
+                }
+            }
+            for(j = 0; j < u4_num_lt_curr && 0 == poc_found; j++)
+            {
+                if(ps_pic_buf->i4_abs_poc == pi4_poc_lt_curr[j])
+                {
+                    poc_found++;
+                    break;
+                }
+            }
+            for(j = 0; j < u4_num_lt_foll && 0 == poc_found; j++)
+            {
+                if(ps_pic_buf->i4_abs_poc == pi4_poc_lt_foll[j])
+                {
+                    poc_found++;
+                    break;
+                }
+            }
+
+            if(0 == poc_found)
+                printf("POC: %d\n", ps_pic_buf->i4_abs_poc);
+        }
+    }
+}
+
+void ihevcd_debug_validate_padded_region(process_ctxt_t *ps_proc)
+{
+    sps_t *ps_sps;
+    codec_t *ps_codec;
+    UWORD8 *pu1_src;
+    UWORD16 *pu2_src;
+    UWORD8 *pu1_validate;
+    UWORD16 *pu2_validate;
+    WORD32 i, j;
+    WORD32 pic_ht, pic_wd;
+    WORD32 src_strd;
+
+    FILE *fp;
+    static WORD32 file_open = 0;
+
+    if(file_open == 0)
+    {
+        fp = fopen("D:\\dump\\debug_padding.yuv", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        fp = fopen("D:\\dump\\debug_padding.yuv", "ab");
+    }
+
+
+    if(NULL == fp)
+    {
+        printf("\nCannot Open file\n\n");
+        return;
+    }
+
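+    /* A marker image covering the padded frame is built: any padding pixel
+     * that does not replicate its nearest edge pixel is flagged (255 for
+     * luma, 0xFFFF for an interleaved chroma pair) and dumped for viewing. */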
+    /* pu2_src and pu2_validate are for chroma */
+
+    ps_sps = ps_proc->ps_sps;
+    ps_codec = ps_proc->ps_codec;
+    pu1_src = ps_proc->pu1_cur_pic_luma;
+    pu2_src = (UWORD16 *)ps_proc->pu1_cur_pic_chroma;
+    pic_ht = ps_sps->i2_pic_height_in_luma_samples;
+    pic_wd = ps_sps->i2_pic_width_in_luma_samples;
+    src_strd = ps_codec->i4_strd;
+
+    pu1_validate = (UWORD8 *)calloc((pic_wd + PAD_LEFT + PAD_RIGHT) * (pic_ht + PAD_TOP + PAD_BOT) * 3 / 2, 1);
+    pu2_validate = (UWORD16 *)(pu1_validate + (pic_wd + PAD_LEFT + PAD_RIGHT) * (pic_ht + PAD_TOP + PAD_BOT));
+
+    for(i = 0; i < pic_ht; i++)
+    {
+        for(j = 0; j < PAD_LEFT; j++)
+        {
+            if(pu1_src[j - PAD_LEFT] != pu1_src[0])
+            {
+                pu1_validate[j + (PAD_TOP + i) * src_strd] = 255;
+            }
+        }
+
+        for(j = 0; j < PAD_RIGHT; j++)
+        {
+            if(pu1_src[pic_wd + j] != pu1_src[pic_wd - 1])
+            {
+                pu1_validate[pic_wd + j + PAD_LEFT + (PAD_TOP + i) * src_strd] = 255;
+            }
+        }
+
+        pu1_src += src_strd;
+    }
+
+    pu1_src = ps_proc->pu1_cur_pic_luma - PAD_LEFT;
+    for(i = 0; i < pic_wd + PAD_LEFT + PAD_RIGHT; i++)
+    {
+        for(j = 0; j < PAD_TOP; j++)
+        {
+            if(pu1_src[(j - PAD_TOP) * src_strd] != pu1_src[0])
+            {
+                pu1_validate[i + j * src_strd] = 255;
+            }
+        }
+
+        for(j = 0; j < PAD_BOT; j++)
+        {
+            if(pu1_src[(pic_ht + j) * src_strd] != pu1_src[(pic_ht - 1) * src_strd])
+            {
+                pu1_validate[i + (j + pic_ht + PAD_TOP) * src_strd] = 255;
+            }
+        }
+
+        pu1_src += 1;
+    }
+
+    for(i = 0; i < pic_ht / 2; i++)
+    {
+        for(j = 0; j < PAD_LEFT / 2; j++)
+        {
+            if(pu2_src[j - PAD_LEFT / 2] != pu2_src[0])
+            {
+                pu2_validate[j + (PAD_TOP / 2 + i) * src_strd / 2] = 0xFFFF;
+            }
+        }
+
+        for(j = 0; j < PAD_RIGHT / 2; j++)
+        {
+            if(pu2_src[pic_wd / 2 + j] != pu2_src[pic_wd / 2 - 1])
+            {
+                pu2_validate[pic_wd / 2 + j + PAD_LEFT / 2 + (PAD_TOP / 2 + i) * src_strd / 2] = 0xFFFF;
+            }
+        }
+
+        pu2_src += src_strd / 2;
+    }
+    fwrite(pu1_validate, 1, (pic_wd + PAD_LEFT + PAD_RIGHT) * (pic_ht + PAD_TOP + PAD_BOT) * 3 / 2, fp);
+
+    free(pu1_validate);
+    fclose(fp);
+}
+
+void ihevcd_debug_print_nal_info(codec_t *ps_codec, WORD32 nal_type)
+{
+    FILE *fp;
+    static WORD32 file_open = 0;
+    slice_header_t *ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base + (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+    WORD32 frame_start_flag = 0;
+    WORD32 frame_poc = 0;
+
+    if(0 == file_open)
+    {
+        fp = fopen("nal_info.txt", "w");
+        file_open = 1;
+    }
+    else
+    {
+        fp = fopen("nal_info.txt", "a");
+    }
+
+    if(NULL == fp)
+    {
+        printf("Cannot open NAL info file.. Exiting\n");
+        exit(-1);
+    }
+
+    /* If slice NAL, update start of frame flag */
+    switch(nal_type)
+    {
+        case NAL_BLA_W_LP    :
+        case NAL_BLA_W_DLP   :
+        case NAL_BLA_N_LP    :
+        case NAL_IDR_W_LP    :
+        case NAL_IDR_N_LP    :
+        case NAL_CRA         :
+        case NAL_TRAIL_N     :
+        case NAL_TRAIL_R     :
+        case NAL_TSA_N       :
+        case NAL_TSA_R       :
+        case NAL_STSA_N      :
+        case NAL_STSA_R      :
+        case NAL_RADL_N      :
+        case NAL_RADL_R      :
+        case NAL_RASL_N      :
+        case NAL_RASL_R      :
+            frame_start_flag = ps_slice_hdr->i1_first_slice_in_pic_flag;
+            frame_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+            ps_codec->i4_first_pic_done = 1;
+            break;
+
+        default:
+            frame_start_flag = 0;
+            frame_poc = 0;
+            break;
+    }
+    fprintf(fp, "NALType=%d;NumBytes=%d;POC=%d;FrameStart=%d\n",
+            nal_type,
+            ps_codec->i4_nal_ofst + ps_codec->i4_nal_len,
+            frame_poc,
+            frame_start_flag);
+
+    fclose(fp);
+}
+
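+/* Records mirroring the leaf-level deblock-filter call arguments, written
+ * out as raw binary so the dumps can be compared field by field offline. */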
+typedef struct
+{
+    UWORD8 au1_src[8 * 4];
+    WORD32 src_strd;
+    WORD32 bs;
+    WORD32 qp_p;
+    WORD32 qp_q;
+    WORD32 beta_offset_div_2;
+    WORD32 tc_offset_div_2;
+    WORD32 filter_p;
+    WORD32 filter_q;
+}deblk_luma_t;
+
+typedef struct
+{
+    UWORD8 au1_src[8 * 4];
+    WORD32 src_strd;
+    WORD32 bs;
+    WORD32 qp_p;
+    WORD32 qp_q;
+    WORD32 qp_offset_u;
+    WORD32 qp_offset_v;
+    WORD32 tc_offset_div_2;
+    WORD32 filter_p;
+    WORD32 filter_q;
+}deblk_chroma_t;
+
+
+void ihevcd_debug_deblk_luma_vert(UWORD8 *pu1_src,
+                                  WORD32 src_strd,
+                                  WORD32 bs,
+                                  WORD32 quant_param_p,
+                                  WORD32 quant_param_q,
+                                  WORD32 beta_offset_div2,
+                                  WORD32 tc_offset_div2,
+                                  WORD32 filter_flag_p,
+                                  WORD32 filter_flag_q)
+{
+    FILE *fp;
+    static WORD32 file_open = 0;
+    WORD32 row, col;
+    deblk_luma_t s_deblk_luma;
+
+    pu1_src -= 4;
+
+    if(file_open == 0)
+    {
+        fp = fopen("D:\\dump\\win_deblk_luma_vert.txt", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        fp = fopen("D:\\dump\\win_deblk_luma_vert.txt", "ab");
+    }
+
+    for(row = 0; row < 4; row++)
+    {
+        for(col = 0; col < 8; col++)
+        {
+            s_deblk_luma.au1_src[row * 8 + col] = pu1_src[row * src_strd + col];
+        }
+    }
+    s_deblk_luma.src_strd = src_strd;
+    s_deblk_luma.bs = bs;
+    s_deblk_luma.qp_p = quant_param_p;
+    s_deblk_luma.qp_q = quant_param_q;
+    s_deblk_luma.beta_offset_div_2 = beta_offset_div2;
+    s_deblk_luma.tc_offset_div_2 = tc_offset_div2;
+    s_deblk_luma.filter_p = filter_flag_p;
+    s_deblk_luma.filter_q = filter_flag_q;
+
+    fwrite(&s_deblk_luma, sizeof(deblk_luma_t), 1, fp);
+
+    fclose(fp);
+}
+
+void ihevcd_debug_deblk_luma_horz(UWORD8 *pu1_src,
+                                  WORD32 src_strd,
+                                  WORD32 bs,
+                                  WORD32 quant_param_p,
+                                  WORD32 quant_param_q,
+                                  WORD32 beta_offset_div2,
+                                  WORD32 tc_offset_div2,
+                                  WORD32 filter_flag_p,
+                                  WORD32 filter_flag_q)
+{
+    FILE *fp;
+    static WORD32 file_open = 0;
+    WORD32 row, col;
+    deblk_luma_t s_deblk_luma;
+
+    pu1_src -= 4 * src_strd;
+
+    if(file_open == 0)
+    {
+        fp = fopen("D:\\dump\\win_deblk_luma_horz.txt", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        fp = fopen("D:\\dump\\win_deblk_luma_horz.txt", "ab");
+    }
+
+    for(row = 0; row < 8; row++)
+    {
+        for(col = 0; col < 4; col++)
+        {
+            s_deblk_luma.au1_src[row * 4 + col] = pu1_src[row * src_strd + col];
+        }
+    }
+    s_deblk_luma.src_strd = src_strd;
+    s_deblk_luma.bs = bs;
+    s_deblk_luma.qp_p = quant_param_p;
+    s_deblk_luma.qp_q = quant_param_q;
+    s_deblk_luma.beta_offset_div_2 = beta_offset_div2;
+    s_deblk_luma.tc_offset_div_2 = tc_offset_div2;
+    s_deblk_luma.filter_p = filter_flag_p;
+    s_deblk_luma.filter_q = filter_flag_q;
+
+    fwrite(&s_deblk_luma, sizeof(deblk_luma_t), 1, fp);
+
+    fclose(fp);
+}
+
+void ihevcd_debug_deblk_chroma_vert(UWORD8 *pu1_src,
+                                    WORD32 src_strd,
+                                    WORD32 bs,
+                                    WORD32 quant_param_p,
+                                    WORD32 quant_param_q,
+                                    WORD32 qp_offset_u,
+                                    WORD32 qp_offset_v,
+                                    WORD32 tc_offset_div2,
+                                    WORD32 filter_flag_p,
+                                    WORD32 filter_flag_q)
+{
+    FILE *fp;
+    static WORD32 file_open = 0;
+    WORD32 row, col;
+    deblk_chroma_t s_deblk_chroma;
+
+    pu1_src -= 4;
+
+    if(file_open == 0)
+    {
+        fp = fopen("D:\\dump\\win_deblk_chroma_vert.txt", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        fp = fopen("D:\\dump\\win_deblk_chroma_vert.txt", "ab");
+    }
+
+    for(row = 0; row < 4; row++)
+    {
+        for(col = 0; col < 8; col++)
+        {
+            s_deblk_chroma.au1_src[row * 8 + col] = pu1_src[row * src_strd + col];
+        }
+    }
+    s_deblk_chroma.src_strd = src_strd;
+    s_deblk_chroma.bs = bs;
+    s_deblk_chroma.qp_p = quant_param_p;
+    s_deblk_chroma.qp_q = quant_param_q;
+    s_deblk_chroma.qp_offset_u = qp_offset_u;
+    s_deblk_chroma.qp_offset_v = qp_offset_v;
+    s_deblk_chroma.tc_offset_div_2 = tc_offset_div2;
+    s_deblk_chroma.filter_p = filter_flag_p;
+    s_deblk_chroma.filter_q = filter_flag_q;
+
+    fwrite(&s_deblk_chroma, sizeof(deblk_chroma_t), 1, fp);
+
+    fclose(fp);
+}
+
+
+void ihevcd_debug_deblk_chroma_horz(UWORD8 *pu1_src,
+                                    WORD32 src_strd,
+                                    WORD32 bs,
+                                    WORD32 quant_param_p,
+                                    WORD32 quant_param_q,
+                                    WORD32 qp_offset_u,
+                                    WORD32 qp_offset_v,
+                                    WORD32 tc_offset_div2,
+                                    WORD32 filter_flag_p,
+                                    WORD32 filter_flag_q)
+{
+    FILE *fp;
+    static WORD32 file_open = 0;
+    WORD32 row, col;
+    deblk_chroma_t s_deblk_chroma;
+
+    pu1_src -= 2 * src_strd;
+
+    if(file_open == 0)
+    {
+        fp = fopen("D:\\dump\\win_deblk_chroma_horz.txt", "wb");
+        file_open = 1;
+    }
+    else
+    {
+        fp = fopen("D:\\dump\\win_deblk_chroma_horz.txt", "ab");
+    }
+
+    for(row = 0; row < 4; row++)
+    {
+        for(col = 0; col < 8; col++)
+        {
+            s_deblk_chroma.au1_src[row * 8 + col] = pu1_src[row * src_strd + col];
+        }
+    }
+    s_deblk_chroma.src_strd = src_strd;
+    s_deblk_chroma.bs = bs;
+    s_deblk_chroma.qp_p = quant_param_p;
+    s_deblk_chroma.qp_q = quant_param_q;
+    s_deblk_chroma.qp_offset_u = qp_offset_u;
+    s_deblk_chroma.qp_offset_v = qp_offset_v;
+    s_deblk_chroma.tc_offset_div_2 = tc_offset_div2;
+    s_deblk_chroma.filter_p = filter_flag_p;
+    s_deblk_chroma.filter_q = filter_flag_q;
+
+    fwrite(&s_deblk_chroma, sizeof(deblk_chroma_t), 1, fp);
+
+    fclose(fp);
+}
+
+#if DEBUG_PRINT_IQ_IT_RECON
+void print_coeff(WORD16 *pi2_tu_coeff, WORD32 trans_size)
+{
+    WORD32 row, col;
+    for(row = 0; row < trans_size; row++)
+    {
+        for(col = 0; col < trans_size; col++)
+        {
+            printf("%d\t", pi2_tu_coeff[row * trans_size + col]);
+        }
+        printf("\n");
+    }
+}
+
+void print_dst(UWORD8 *pu1_dst,
+               WORD32 dst_strd,
+               WORD32 trans_size,
+               WORD32 is_luma)
+{
+    WORD32 row, col;
+    WORD32 inc;
+    inc = is_luma == 1 ? 1 : 2;
+
+    for(row = 0; row < trans_size; row++)
+    {
+        for(col = 0; col < trans_size; col++)
+        {
+            printf("%d\t", pu1_dst[row * dst_strd + inc * col]);
+        }
+        printf("\n");
+    }
+}
+#endif
+#endif
diff --git a/decoder/ihevcd_debug.h b/decoder/ihevcd_debug.h
new file mode 100644
index 0000000..af6a0d0
--- /dev/null
+++ b/decoder/ihevcd_debug.h
@@ -0,0 +1,176 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_debug.h
+*
+* @brief
+*  Debug defs
+*
+* @author
+*  Naveen S R
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_DEBUG_H_
+#define _IHEVCD_DEBUG_H_
+
+#define DEBUG_REF_LIST 0
+#define DEBUG_PADDED_REGION 0
+#define DEBUG_DUMP_PRE_ILF 0
+#define DEBUG_PRINT_IQ_IT_RECON 0
+#define DEBUG_PRINT_MV 0
+#define DEBUG_DEBLK_LEAF_LEVEL 0
+#define DEBUG_NAL_TYPE 0
+#define DEBUG_SAO_TMP_BUF 0
+#define DEBUG_BREAK_AFTER_SLICE_NAL 0
+#define DEBUG_DUMP_FRAME_BUFFERS_INFO 0
+#define DEBUG_DUMP_FRAME_PU_INFO 0
+#define DEBUG_MV_MAP 0
+
+#if (DEBUG_REF_LIST||DEBUG_PADDED_REGION||DEBUG_DUMP_PRE_ILF||DEBUG_PRINT_IQ_IT_RECON||DEBUG_PRINT_MV||DEBUG_DEBLK_LEAF_LEVEL||DEBUG_NAL_TYPE||DEBUG_SAO_TMP_BUF||DEBUG_BREAK_AFTER_SLICE_NAL || DEBUG_DUMP_FRAME_BUFFERS_INFO || DEBUG_DUMP_FRAME_PU_INFO || DEBUG_MV_MAP)
+#define DEBUG_CODE 1
+#else
+#define DEBUG_CODE 0
+#endif
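+
+/* To use any of the dumps below, set its flag above to 1 and rebuild: any
+ * non-zero flag turns on DEBUG_CODE, which compiles in the corresponding
+ * ihevcd_debug_* implementations in ihevcd_debug.c. */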
+
+
+#if DEBUG_DUMP_FRAME_PU_INFO
+#define DEBUG_DUMP_PIC_PU(ps_codec) ihevcd_debug_dump_pic_pu(ps_codec);
+#else
+#define DEBUG_DUMP_PIC_PU(ps_codec)
+#endif
+
+
+#if DEBUG_DUMP_FRAME_BUFFERS_INFO
+UWORD8 au1_pic_avail_ctb_flags[MAX_WD * MAX_HT / MIN_CTB_SIZE / MIN_CTB_SIZE];
+UWORD32 au4_pic_ctb_slice_xy[MAX_WD * MAX_HT / MIN_CTB_SIZE / MIN_CTB_SIZE];
+
+#define DEBUG_DUMP_PIC_BUFFERS(ps_codec) ihevcd_debug_dump_pic_buffers(ps_codec);
+#else
+#define DEBUG_DUMP_PIC_BUFFERS(ps_codec)
+#endif
+
+#if DEBUG_BREAK_AFTER_SLICE_NAL
+#define BREAK_AFTER_SLICE_NAL()                 \
+                if(ps_codec->i4_header_done)    \
+                    break;
+#else
+#define BREAK_AFTER_SLICE_NAL()     ;
+#endif
+
+
+#if DEBUG_SAO_TMP_BUF
+#define DEBUG_INIT_TMP_BUF(pu1_buf_luma, pu1_buf_chroma) ihevcd_debug_init_tmp_buf(pu1_buf_luma, pu1_buf_chroma)
+#define DEBUG_PROCESS_TMP_BUF(pu1_buf_luma, pu1_buf_chroma) ihevcd_debug_process_tmp_buf(pu1_buf_luma, pu1_buf_chroma)
+#else
+#define DEBUG_INIT_TMP_BUF(pu1_buf_luma, pu1_buf_chroma)
+#define DEBUG_PROCESS_TMP_BUF(pu1_buf_luma, pu1_buf_chroma)
+#endif
+
+#if DEBUG_NAL_TYPE
+
+#define DEBUG_PRINT_NAL_INFO(ps_codec, nal_type) ihevcd_debug_print_nal_info(ps_codec, nal_type); \
+    break;
+#define RETURN_IF_NAL_INFO return;
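+
+/* Note: the trailing break and return are intentional; when NAL tracing is
+ * on, the caller logs the NAL info and skips the actual decode step. */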
+
+#else
+
+#define DEBUG_PRINT_NAL_INFO(ps_codec, nal_type)
+#define RETURN_IF_NAL_INFO
+
+#endif
+
+#if DEBUG_REF_LIST
+
+#define DEBUG_PRINT_REF_LIST_POCS(i4_pic_order_cnt_val, ps_slice_hdr, ps_dpb_mgr, u4_num_st_curr_before, u4_num_st_curr_after, u4_num_st_foll, u4_num_lt_curr, u4_num_lt_foll, ai4_poc_st_curr_before, ai4_poc_st_curr_after, ai4_poc_st_foll, ai4_poc_lt_curr, ai4_poc_lt_foll)  \
+    ihevcd_debug_print_ref_list_pocs(i4_pic_order_cnt_val, ps_slice_hdr, ps_dpb_mgr, u4_num_st_curr_before, u4_num_st_curr_after, u4_num_st_foll, u4_num_lt_curr, u4_num_lt_foll, ai4_poc_st_curr_before, ai4_poc_st_curr_after, ai4_poc_st_foll, ai4_poc_lt_curr, ai4_poc_lt_foll);
+
+#else
+
+#define DEBUG_PRINT_REF_LIST_POCS(i4_pic_order_cnt_val, ps_slice_hdr, ps_dpb_mgr, u4_num_st_curr_before, u4_num_st_curr_after, u4_num_st_foll, u4_num_lt_curr, u4_num_lt_foll, ai4_poc_st_curr_before, ai4_poc_st_curr_after, ai4_poc_st_foll, ai4_poc_lt_curr, ai4_poc_lt_foll)
+
+#endif
+
+#if DEBUG_PADDED_REGION
+
+#define DEBUG_VALIDATE_PADDED_REGION(ps_proc) ihevcd_debug_validate_padded_region(ps_proc);
+
+#else
+
+#define DEBUG_VALIDATE_PADDED_REGION(ps_proc)
+
+#endif
+
+#if DEBUG_DUMP_PRE_ILF
+
+#define DUMP_PRE_ILF(pu1_cur_pic_luma, pu1_cur_pic_chroma, pic_wd, pic_ht, pic_strd) ihevcd_debug_dump_pic(pu1_cur_pic_luma, pu1_cur_pic_chroma, pic_wd, pic_ht, pic_strd)
+#define DUMP_BS(pu4_pic_vert_bs, pu4_pic_horz_bs, vert_size_in_bytes, horz_size_in_bytes) ihevcd_debug_dump_bs(pu4_pic_vert_bs, pu4_pic_horz_bs, vert_size_in_bytes, horz_size_in_bytes)
+#define DUMP_QP(pu1_qp, size_in_bytes) ihevcd_debug_dump_qp(pu1_qp, size_in_bytes)
+#define DUMP_QP_CONST_IN_CTB(pu1_qp_const_in_ctb, size_in_bytes) ihevcs_dump_qp_const_in_ctb(pu1_qp_const_in_ctb, size_in_bytes)
+#define DUMP_NO_LOOP_FILTER(pu1_pic_no_loop_filter, size_in_bytes) ihevcd_debug_dump_no_loop_filter(pu1_pic_no_loop_filter, size_in_bytes)
+#define DUMP_OFFSETS(beta_offset_div_2, tc_offset_div_2, qp_offset_u, qp_offset_v) ihevcd_debug_dump_offsets(beta_offset_div_2, tc_offset_div_2, qp_offset_u, qp_offset_v)
+
+#else
+
+#define DUMP_PRE_ILF(pu1_cur_pic_luma, pu1_cur_pic_chroma, pic_wd, pic_ht, pic_strd)
+#define DUMP_BS(pu4_pic_vert_bs, pu4_pic_horz_bs, vert_size_in_bytes, horz_size_in_bytes)
+#define DUMP_QP(pu1_qp, size_in_bytes)
+#define DUMP_QP_CONST_IN_CTB(pu1_qp_const_in_ctb, size_in_bytes)
+#define DUMP_NO_LOOP_FILTER(pu1_pic_no_loop_filter, size_in_bytes)
+#define DUMP_OFFSETS(beta_offset_div_2, tc_offset_div_2, qp_offset_u, qp_offset_v)
+
+#endif
+
+
+#if DEBUG_DEBLK_LEAF_LEVEL
+
+#define DUMP_DEBLK_LUMA_VERT(pu1_src, src_strd, u4_bs, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q) ihevcd_debug_deblk_luma_vert(pu1_src, src_strd, u4_bs, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q);
+#define DUMP_DEBLK_LUMA_HORZ(pu1_src, src_strd, u4_bs, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q) ihevcd_debug_deblk_luma_horz(pu1_src, src_strd, u4_bs, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q);
+#define DUMP_DEBLK_CHROMA_VERT(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q) ihevcd_debug_deblk_chroma_vert(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q)
+#define DUMP_DEBLK_CHROMA_HORZ(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q) ihevcd_debug_deblk_chroma_horz(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q)
+
+#else
+
+#define DUMP_DEBLK_LUMA_VERT(pu1_src, src_strd, u4_bs3, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q)
+#define DUMP_DEBLK_LUMA_HORZ(pu1_src, src_strd, u4_bs3, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q)
+#define DUMP_DEBLK_CHROMA_VERT(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q)
+#define DUMP_DEBLK_CHROMA_HORZ(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q)
+
+#endif
+
+#if DEBUG_MV_MAP
+#define DEBUG_DUMP_MV_MAP(ps_codec) ihevcd_debug_dump_mv_map(ps_codec);
+#else
+#define DEBUG_DUMP_MV_MAP(ps_codec)
+#endif
+void print_coeff(WORD16 *pi2_tu_coeff, WORD32 trans_size);
+
+void print_dst(UWORD8 *pu1_dst,
+               WORD32 dst_strd,
+               WORD32 trans_size,
+               WORD32 is_luma);
+
+#endif /* _IHEVCD_DEBUG_H_ */
diff --git a/decoder/ihevcd_decode.c b/decoder/ihevcd_decode.c
new file mode 100644
index 0000000..b2a834a
--- /dev/null
+++ b/decoder/ihevcd_decode.c
@@ -0,0 +1,859 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_decode.c
+ *
+ * @brief
+ *  Contains the codec's main decode function
+ *
+ * @author
+ *  Harish
+ *
+ * @par List of Functions:
+ * - ihevcd_map_error()
+ * - ihevcd_fill_outargs()
+ * - ihevcd_decode()
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_process_slice.h"
+#include "ihevcd_ittiam_logo.h"
+#include "ihevcd_profile.h"
+
+#define NUM_FRAMES_LIMIT_ENABLED 0
+
+#if NUM_FRAMES_LIMIT_ENABLED
+#define NUM_FRAMES_LIMIT 3600
+#else
+#define NUM_FRAMES_LIMIT 0x7FFFFFFF
+#endif
+
+IHEVCD_ERROR_T ihevcd_fmt_conv(codec_t *ps_codec,
+                               process_ctxt_t *ps_proc,
+                               UWORD8 *pu1_y_dst,
+                               UWORD8 *pu1_u_dst,
+                               UWORD8 *pu1_v_dst,
+                               WORD32 cur_row,
+                               WORD32 num_rows);
+WORD32 ihevcd_init(codec_t *ps_codec);
+/*****************************************************************************/
+/* Function Prototypes                                                       */
+/*****************************************************************************/
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief Maps decoder error codes to API error codes
+ *
+ * @par   Description
+ * Sets the fatal-error bit for unrecoverable errors; other error codes are
+ * returned unchanged
+ *
+ * @param[in] e_error
+ * Decoder error code
+ *
+ * @returns Mapped API error code
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+static UWORD32 ihevcd_map_error(IHEVCD_ERROR_T e_error)
+{
+    UWORD32 error_code = 0;
+    error_code = e_error;
+    switch(error_code)
+    {
+        case IHEVCD_SUCCESS :
+            break;
+        case IHEVCD_INIT_NOT_DONE:
+        case IHEVCD_LEVEL_UNSUPPORTED:
+        case IHEVCD_NUM_REF_UNSUPPORTED:
+        case IHEVCD_NUM_REORDER_UNSUPPORTED:
+        case IHEVCD_NUM_EXTRA_DISP_UNSUPPORTED:
+        case IHEVCD_INSUFFICIENT_MEM_MVBANK:
+        case IHEVCD_INSUFFICIENT_MEM_PICBUF:
+            error_code |= 1 << IVD_FATALERROR;
+            break;
+        case IHEVCD_INVALID_DISP_STRD:
+        case IHEVCD_CXA_VERS_BUF_INSUFFICIENT:
+        case IHEVCD_UNSUPPORTED_VPS_ID:
+        case IHEVCD_UNSUPPORTED_SPS_ID:
+        case IHEVCD_UNSUPPORTED_PPS_ID:
+        case IHEVCD_UNSUPPORTED_CHROMA_FMT_IDC:
+        case IHEVCD_UNSUPPORTED_BIT_DEPTH:
+        case IHEVCD_BUF_MGR_ERROR:
+        case IHEVCD_NO_FREE_MVBANK:
+        case IHEVCD_NO_FREE_PICBUF:
+        case IHEVCD_SLICE_IN_HEADER_MODE:
+        case IHEVCD_END_OF_SEQUENCE:
+            break;
+        default:
+            break;
+    }
+    return error_code;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief Fills output arguments for decode process
+ *
+ * @par   Description
+ * Fills elements in the output structure based on the current state
+ *
+ * @param[in] ps_codec
+ * Codec context
+ *
+ * @param[in] ps_dec_ip
+ * Pointer to input structure
+ *
+ * @param[in] ps_dec_op
+ * Pointer to output structure
+ *
+ * @returns none
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+static void ihevcd_fill_outargs(codec_t *ps_codec,
+                                ivd_video_decode_ip_t *ps_dec_ip,
+                                ivd_video_decode_op_t *ps_dec_op)
+{
+
+    ps_dec_op->u4_error_code = ihevcd_map_error((IHEVCD_ERROR_T)ps_codec->i4_error_code);
+    ps_dec_op->u4_num_bytes_consumed = ps_dec_ip->u4_num_Bytes
+                    - ps_codec->i4_bytes_remaining;
+    if(ps_codec->i4_sps_done)
+    {
+        ps_dec_op->u4_pic_wd = ps_codec->i4_disp_wd;
+        ps_dec_op->u4_pic_ht = ps_codec->i4_disp_ht;
+    }
+    else
+    {
+        ps_dec_op->u4_pic_wd = 0;
+        ps_dec_op->u4_pic_ht = 0;
+    }
+
+    ps_dec_op->e_pic_type = ps_codec->e_dec_pic_type;
+    ps_dec_op->u4_frame_decoded_flag = ps_codec->i4_pic_present;
+    ps_dec_op->u4_new_seq = 0;
+
+    ps_dec_op->u4_output_present = 0;
+    ps_dec_op->u4_progressive_frame_flag = 1;
+    ps_dec_op->u4_is_ref_flag = 1;
+    ps_dec_op->e_output_format = ps_codec->e_chroma_fmt;
+
+    ps_dec_op->e4_fld_type = IV_FLD_TYPE_DEFAULT;
+
+    ps_dec_op->u4_ts = (UWORD32)(-1);
+    ps_dec_op->u4_disp_buf_id = ps_codec->i4_disp_buf_id;
+    if(ps_codec->i4_flush_mode)
+    {
+        ps_dec_op->u4_num_bytes_consumed = 0;
+        /* In the case of flush, since no frame is decoded, set pic type as invalid */
+        ps_dec_op->u4_is_ref_flag = 0;
+        ps_dec_op->e_pic_type = IV_NA_FRAME;
+        ps_dec_op->u4_frame_decoded_flag = 0;
+
+    }
+    /* If there is a display buffer */
+    if(ps_codec->ps_disp_buf)
+    {
+        pic_buf_t *ps_disp_buf = ps_codec->ps_disp_buf;
+
+        ps_dec_op->u4_output_present = 1;
+        PROFILE_DIS_PROCESS_CTB_SET_NOOUTPUT();
+        ps_dec_op->u4_ts = ps_disp_buf->u4_ts;
+        if((ps_codec->i4_flush_mode == 0) && (ps_codec->s_parse.i4_end_of_frame == 0))
+            ps_dec_op->u4_output_present = 0;
+        ps_dec_op->s_disp_frm_buf.u4_y_wd = ps_codec->i4_disp_wd;
+        ps_dec_op->s_disp_frm_buf.u4_y_ht = ps_codec->i4_disp_ht;
+
+        if(ps_codec->i4_share_disp_buf)
+        {
+            ps_dec_op->s_disp_frm_buf.pv_y_buf = ps_disp_buf->pu1_luma;
+            if(ps_codec->e_chroma_fmt != IV_YUV_420P)
+            {
+                ps_dec_op->s_disp_frm_buf.pv_u_buf = ps_disp_buf->pu1_chroma;
+                ps_dec_op->s_disp_frm_buf.pv_v_buf = NULL;
+            }
+            else
+            {
+                ps_dec_op->s_disp_frm_buf.pv_u_buf =
+                                ps_dec_ip->s_out_buffer.pu1_bufs[1];
+                ps_dec_op->s_disp_frm_buf.pv_v_buf =
+                                ps_dec_ip->s_out_buffer.pu1_bufs[2];
+
+            }
+            ps_dec_op->s_disp_frm_buf.u4_y_strd = ps_codec->i4_strd;
+
+        }
+        else
+        {
+            ps_dec_op->s_disp_frm_buf.pv_y_buf =
+                            ps_dec_ip->s_out_buffer.pu1_bufs[0];
+            ps_dec_op->s_disp_frm_buf.pv_u_buf =
+                            ps_dec_ip->s_out_buffer.pu1_bufs[1];
+            ps_dec_op->s_disp_frm_buf.pv_v_buf =
+                            ps_dec_ip->s_out_buffer.pu1_bufs[2];
+            ps_dec_op->s_disp_frm_buf.u4_y_strd = ps_codec->i4_disp_strd;
+        }
+
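+        /* Semi-planar outputs carry a single interleaved UV plane that shares
+         * the luma stride and width; planar 420P carries separate U and V
+         * planes at half the luma dimensions. */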
+        if((IV_YUV_420SP_VU == ps_codec->e_chroma_fmt)
+                        || (IV_YUV_420SP_UV == ps_codec->e_chroma_fmt))
+        {
+            ps_dec_op->s_disp_frm_buf.u4_u_strd =
+                            ps_dec_op->s_disp_frm_buf.u4_y_strd;
+            ps_dec_op->s_disp_frm_buf.u4_v_strd = 0;
+            ps_dec_op->s_disp_frm_buf.u4_u_wd =
+                            ps_dec_op->s_disp_frm_buf.u4_y_wd;
+            ps_dec_op->s_disp_frm_buf.u4_v_wd = 0;
+            ps_dec_op->s_disp_frm_buf.u4_u_ht =
+                            ps_dec_op->s_disp_frm_buf.u4_y_ht / 2;
+            ps_dec_op->s_disp_frm_buf.u4_v_ht = 0;
+        }
+        else if(IV_YUV_420P == ps_codec->e_chroma_fmt)
+        {
+            ps_dec_op->s_disp_frm_buf.u4_u_strd =
+                            ps_dec_op->s_disp_frm_buf.u4_y_strd / 2;
+            ps_dec_op->s_disp_frm_buf.u4_v_strd =
+                            ps_dec_op->s_disp_frm_buf.u4_y_strd / 2;
+            ps_dec_op->s_disp_frm_buf.u4_u_wd =
+                            ps_dec_op->s_disp_frm_buf.u4_y_wd / 2;
+            ps_dec_op->s_disp_frm_buf.u4_v_wd =
+                            ps_dec_op->s_disp_frm_buf.u4_y_wd / 2;
+            ps_dec_op->s_disp_frm_buf.u4_u_ht =
+                            ps_dec_op->s_disp_frm_buf.u4_y_ht / 2;
+            ps_dec_op->s_disp_frm_buf.u4_v_ht =
+                            ps_dec_op->s_disp_frm_buf.u4_y_ht / 2;
+        }
+
+    }
+    else if(ps_codec->i4_flush_mode)
+    {
+        ps_dec_op->u4_error_code = IHEVCD_END_OF_SEQUENCE;
+        /* Come out of flush mode */
+        ps_codec->i4_flush_mode = 0;
+    }
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Codec process call
+ *
+ * @par Description:
+ *  Codec process call. Performs basic error checks, handles flush and
+ *  decode-header modes, parses the bitstream for start codes and calls the
+ *  decode-NAL function for each NAL unit. Once a complete frame is decoded
+ *  (in frame-decode mode), fills the output arguments and returns.
+ *
+ * @param[in] ps_codec_obj
+ *  Pointer to codec object at API level
+ *
+ * @param[in] pv_api_ip
+ *  Pointer to input argument structure
+ *
+ * @param[in] pv_api_op
+ *  Pointer to output argument structure
+ *
+ * @returns  Status
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ihevcd_decode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
+{
+    WORD32 ret = IV_SUCCESS;
+    codec_t *ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+    ivd_video_decode_ip_t *ps_dec_ip;
+    ivd_video_decode_op_t *ps_dec_op;
+
+    WORD32 proc_idx = 0;
+    WORD32 prev_proc_idx = 0;
+
+    /* Initialize error code */
+    ps_codec->i4_error_code = 0;
+
+    ps_dec_ip = (ivd_video_decode_ip_t *)pv_api_ip;
+    ps_dec_op = (ivd_video_decode_op_t *)pv_api_op;
+
+    memset(ps_dec_op, 0, sizeof(ivd_video_decode_op_t));
+    if(ps_codec->i4_init_done != 1)
+    {
+        ps_dec_op->u4_error_code |= 1 << IVD_FATALERROR;
+        ps_dec_op->u4_error_code |= IHEVCD_INIT_NOT_DONE;
+        return IV_FAIL;
+    }
+
+    if(ps_codec->u4_pic_cnt >= NUM_FRAMES_LIMIT)
+    {
+        ps_dec_op->u4_error_code |= 1 << IVD_FATALERROR;
+        ps_dec_op->u4_error_code |= IHEVCD_NUM_FRAMES_LIMIT_REACHED;
+        return IV_FAIL;
+    }
+
+    /* If reset flag is set, flush the existing buffers */
+    if(ps_codec->i4_reset_flag)
+    {
+        ps_codec->i4_flush_mode = 1;
+    }
+
+    /* Data memory barrier instruction, so that the bitstream write by the application is complete */
+    //arm_dsb();
+    /* In case the decoder is not in flush mode check for input buffer validity */
+    if(0 == ps_codec->i4_flush_mode)
+    {
+        if(ps_dec_ip->pv_stream_buffer == NULL)
+        {
+            ps_dec_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+            ps_dec_op->u4_error_code |= IVD_DEC_FRM_BS_BUF_NULL;
+            return IV_FAIL;
+        }
+        if(ps_dec_ip->u4_num_Bytes <= MIN_START_CODE_LEN)
+        {
+            if((WORD32)ps_dec_ip->u4_num_Bytes > 0)
+                ps_dec_op->u4_num_bytes_consumed = ps_dec_ip->u4_num_Bytes;
+            else
+                ps_dec_op->u4_num_bytes_consumed = 0;
+
+            ps_dec_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+            ps_dec_op->u4_error_code |= IVD_DEC_NUMBYTES_INV;
+            return IV_FAIL;
+
+        }
+    }
+
+#ifdef APPLY_CONCEALMENT
+    {
+        WORD32 num_mbs;
+
+        num_mbs = (ps_codec->i4_wd * ps_codec->i4_ht + 255) >> 8;
+        /* Reset MB Count at the beginning of every process call */
+        ps_codec->mb_count = 0;
+        memset(ps_codec->mb_map, 0, ((num_mbs + 7) >> 3));
+    }
+#endif
+
+    if(0 == ps_codec->i4_share_disp_buf && ps_codec->i4_header_mode == 0)
+    {
+        UWORD32 i;
+        if(ps_dec_ip->s_out_buffer.u4_num_bufs == 0)
+        {
+            ps_dec_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+            ps_dec_op->u4_error_code |= IVD_DISP_FRM_ZERO_OP_BUFS;
+            return IV_FAIL;
+        }
+
+        for(i = 0; i < ps_dec_ip->s_out_buffer.u4_num_bufs; i++)
+        {
+            if(ps_dec_ip->s_out_buffer.pu1_bufs[i] == NULL)
+            {
+                ps_dec_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                ps_dec_op->u4_error_code |= IVD_DISP_FRM_OP_BUF_NULL;
+                return IV_FAIL;
+            }
+
+            if(ps_dec_ip->s_out_buffer.u4_min_out_buf_size[i] == 0)
+            {
+                ps_dec_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+                ps_dec_op->u4_error_code |= IVD_DISP_FRM_ZERO_OP_BUF_SIZE;
+                return IV_FAIL;
+            }
+        }
+    }
+
+    ps_codec->ps_out_buffer = &ps_dec_ip->s_out_buffer;
+    ps_codec->u4_ts = ps_dec_ip->u4_ts;
+    if(ps_codec->i4_flush_mode)
+    {
+
+        ps_dec_op->u4_pic_wd = ps_codec->i4_disp_wd;
+        ps_dec_op->u4_pic_ht = ps_codec->i4_disp_ht;
+
+        ps_dec_op->u4_new_seq = 0;
+
+        ps_codec->ps_disp_buf = (pic_buf_t *)ihevc_disp_mgr_get(
+                        (disp_mgr_t *)ps_codec->pv_disp_buf_mgr, &ps_codec->i4_disp_buf_id);
+        /* In case of non-shared mode, then convert/copy the frame to output buffer */
+        /* Only if the codec is in non-shared mode or in shared mode but needs 420P output */
+        if((ps_codec->ps_disp_buf)
+                        && ((0 == ps_codec->i4_share_disp_buf)
+                                        || (IV_YUV_420P
+                                                        == ps_codec->e_chroma_fmt)))
+        {
+
+            process_ctxt_t *ps_proc = &ps_codec->as_process[prev_proc_idx];
+            if(0 == ps_proc->i4_init_done)
+            {
+                ihevcd_init_proc_ctxt(ps_proc, 0);
+            }
+
+            /* Set remaining number of rows to be processed */
+            ret = ihevcd_fmt_conv(ps_codec, &ps_codec->as_process[prev_proc_idx],
+                                  ps_dec_ip->s_out_buffer.pu1_bufs[0],
+                                  ps_dec_ip->s_out_buffer.pu1_bufs[1],
+                                  ps_dec_ip->s_out_buffer.pu1_bufs[2], 0,
+                                  ps_codec->i4_disp_ht);
+
+            ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_pic_buf_mgr,
+                                  ps_codec->i4_disp_buf_id, BUF_MGR_DISP);
+        }
+
+        ihevcd_fill_outargs(ps_codec, ps_dec_ip, ps_dec_op);
+
+        if(1 == ps_dec_op->u4_output_present)
+        {
+            WORD32 xpos = ps_codec->i4_disp_wd - 32 - LOGO_WD;
+            WORD32 ypos = ps_codec->i4_disp_ht - 32 - LOGO_HT;
+
+            if(ypos < 0)
+                ypos = 0;
+
+            if(xpos < 0)
+                xpos = 0;
+
+            INSERT_LOGO(ps_dec_ip->s_out_buffer.pu1_bufs[0],
+                        ps_dec_ip->s_out_buffer.pu1_bufs[1],
+                        ps_dec_ip->s_out_buffer.pu1_bufs[2], ps_codec->i4_disp_strd,
+                        xpos,
+                        ypos,
+                        ps_codec->e_chroma_fmt,
+                        ps_codec->i4_disp_wd,
+                        ps_codec->i4_disp_ht);
+        }
+
+
+        if(NULL == ps_codec->ps_disp_buf)
+        {
+            /* If in flush mode and there are no more buffers to flush,
+             * check for the reset flag and reset the decoder */
+            if(ps_codec->i4_reset_flag)
+            {
+                ihevcd_init(ps_codec);
+            }
+            return (IV_FAIL);
+        }
+
+        return (IV_SUCCESS);
+
+    }
+    /* In case of shared mode, check if there is a free buffer for reconstruction */
+    if((0 == ps_codec->i4_header_mode) && (1 == ps_codec->i4_share_disp_buf))
+    {
+        WORD32 buf_status;
+        buf_status = 1;
+        if(ps_codec->pv_pic_buf_mgr)
+            buf_status = ihevc_buf_mgr_check_free((buf_mgr_t *)ps_codec->pv_pic_buf_mgr);
+
+        /* If there is no free buffer, then return with an error code */
+        if(0 == buf_status)
+        {
+            ps_dec_op->u4_error_code = IVD_DEC_REF_BUF_NULL;
+            ps_dec_op->u4_error_code |= (1 << IVD_UNSUPPORTEDPARAM);
+            return IV_FAIL;
+        }
+    }
+    ps_codec->i4_bytes_remaining = ps_dec_ip->u4_num_Bytes;
+    ps_codec->pu1_inp_bitsbuf = (UWORD8 *)ps_dec_ip->pv_stream_buffer;
+    ps_codec->s_parse.i4_end_of_frame = 0;
+
+    ps_codec->i4_pic_present = 0;
+    ps_codec->i4_slice_error = 0;
+    ps_codec->ps_disp_buf = NULL;
+
+    if(ps_codec->i4_num_cores > 1)
+    {
+        ithread_set_affinity(0);
+    }
+    while(MIN_START_CODE_LEN < ps_codec->i4_bytes_remaining)
+    {
+        WORD32 nal_len;
+        WORD32 nal_ofst;
+        WORD32 bits_len;
+
+        if(ps_codec->i4_slice_error)
+        {
+            slice_header_t *ps_slice_hdr_next = ps_codec->s_parse.ps_slice_hdr_base + (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+            WORD32 next_slice_addr = ps_slice_hdr_next->i2_ctb_x +
+                            ps_slice_hdr_next->i2_ctb_y * ps_codec->s_parse.ps_sps->i2_pic_wd_in_ctb;
+            if(ps_codec->s_parse.i4_next_ctb_indx == next_slice_addr)
+                ps_codec->i4_slice_error = 0;
+        }
+
+        nal_ofst = ihevcd_nal_search_start_code(ps_codec->pu1_inp_bitsbuf,
+                                                ps_codec->i4_bytes_remaining);
+
+        ps_codec->i4_nal_ofst = nal_ofst;
+        {
+            WORD32 bytes_remaining = ps_codec->i4_bytes_remaining - nal_ofst;
+
+            bytes_remaining = MIN(bytes_remaining, ps_codec->u4_bitsbuf_size);
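+            /*
+             * The call below removes HEVC emulation prevention bytes: the
+             * escape sequence 00 00 03 drops the 03 byte, so e.g. the input
+             * bytes 00 00 03 01 become 00 00 01 in pu1_bitsbuf. nal_len is
+             * the count of input bytes consumed and bits_len the count of
+             * output bytes after removal.
+             */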
+            ihevcd_nal_remv_emuln_bytes(ps_codec->pu1_inp_bitsbuf + nal_ofst,
+                                        ps_codec->pu1_bitsbuf,
+                                        bytes_remaining,
+                                        &nal_len, &bits_len);
+        }
+        /* Number of emulation prevention bytes removed; this may be used to
+         * update entry point offsets for tiles and entropy sync rows */
+        ps_codec->i4_num_emln_bytes = nal_len - bits_len;
+        ps_codec->i4_nal_len = nal_len;
+
+        ihevcd_bits_init(&ps_codec->s_parse.s_bitstrm, ps_codec->pu1_bitsbuf,
+                         bits_len);
+
+        ret = ihevcd_nal_unit(ps_codec);
+
+        /* If the frame is incomplete and
+         * there are almost no bytes remaining or a header is received,
+         * complete the frame, treating it as being in error */
+        if(ps_codec->i4_pic_present &&
+                        (ps_codec->s_parse.i4_next_ctb_indx != ps_codec->s_parse.ps_sps->i4_pic_size_in_ctb))
+        {
+            if((ps_codec->i4_bytes_remaining - (nal_len + nal_ofst) <= MIN_START_CODE_LEN) ||
+                            (ps_codec->i4_header_in_slice_mode))
+            {
+                slice_header_t *ps_slice_hdr_next;
+
+                ps_codec->s_parse.i4_cur_slice_idx--;
+                if(ps_codec->s_parse.i4_cur_slice_idx < 0)
+                    ps_codec->s_parse.i4_cur_slice_idx = 0;
+
+                ps_slice_hdr_next = ps_codec->s_parse.ps_slice_hdr_base + ((ps_codec->s_parse.i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
+                ps_slice_hdr_next->i2_ctb_x = 0;
+                ps_slice_hdr_next->i2_ctb_y = ps_codec->s_parse.ps_sps->i2_pic_ht_in_ctb;
+                ps_codec->i4_slice_error = 1;
+                continue;
+            }
+        }
+
+        if(IHEVCD_IGNORE_SLICE == ret)
+        {
+            ps_codec->pu1_inp_bitsbuf += (nal_ofst + nal_len);
+            ps_codec->i4_bytes_remaining -= (nal_ofst + nal_len);
+
+            continue;
+        }
+
+        if((IHEVCD_FAIL == ret) &&
+                        (ps_codec->i4_error_code == IVD_RES_CHANGED))
+        {
+            break;
+        }
+
+        /* Update bytes remaining, bytes consumed and the input bitstream pointer */
+        /* Do not consume the NAL in the following cases: */
+        /* - Slice header reached during header decode mode */
+        /* - TODO: Next picture's slice reached */
+        if(ret != IHEVCD_SLICE_IN_HEADER_MODE)
+        {
+            if((0 == ps_codec->i4_slice_error) ||
+                            (ps_codec->i4_bytes_remaining - (nal_len + nal_ofst) <= MIN_START_CODE_LEN))
+            {
+                ps_codec->pu1_inp_bitsbuf += (nal_ofst + nal_len);
+                ps_codec->i4_bytes_remaining -= (nal_ofst + nal_len);
+            }
+            if(ret != IHEVCD_SUCCESS)
+                break;
+
+            if(ps_codec->s_parse.i4_end_of_frame)
+                break;
+        }
+        else
+        {
+            ret = IHEVCD_SUCCESS;
+            break;
+        }
+
+        BREAK_AFTER_SLICE_NAL();
+    }
+
+    if((ps_codec->u4_pic_cnt == 0) && (ret != IHEVCD_SUCCESS))
+    {
+        ps_codec->i4_error_code = ret;
+
+        ihevcd_fill_outargs(ps_codec, ps_dec_ip, ps_dec_op);
+        return IV_FAIL;
+    }
+
+    if(1 == ps_codec->i4_pic_present)
+    {
+        WORD32 i;
+        sps_t *ps_sps = ps_codec->s_parse.ps_sps;
+        ps_codec->i4_first_pic_done = 1;
+
+        /* TODO: temporary fix - end_of_frame is checked before adding format conversion to the job queue */
+        if(ps_codec->i4_num_cores > 1 && ps_codec->s_parse.i4_end_of_frame)
+        {
+
+            /* Queue a format conversion / frame copy job for each CTB row */
+            /* Only if the codec is in non-shared mode, or in shared mode but needs 420P output */
+            process_ctxt_t *ps_proc;
+
+            /* i4_num_cores - 1 contexts are currently being used by other threads */
+            ps_proc = &ps_codec->as_process[ps_codec->i4_num_cores - 1];
+
+            if((ps_codec->ps_disp_buf) &&
+               ((0 == ps_codec->i4_share_disp_buf) || (IV_YUV_420P == ps_codec->e_chroma_fmt)))
+            {
+                /* If format conversion jobs were not issued in pic_init() add them here */
+                if((0 == ps_codec->u4_enable_fmt_conv_ahead) ||
+                                (ps_codec->i4_disp_buf_id == ps_proc->i4_cur_pic_buf_id))
+                {
+                    for(i = 0; i < ps_sps->i2_pic_ht_in_ctb; i++)
+                    {
+                        proc_job_t s_job;
+                        IHEVCD_ERROR_T ret;
+                        s_job.i4_cmd = CMD_FMTCONV;
+                        s_job.i2_ctb_cnt = 0;
+                        s_job.i2_ctb_x = 0;
+                        s_job.i2_ctb_y = i;
+                        s_job.i2_slice_idx = 0;
+                        s_job.i4_tu_coeff_data_ofst = 0;
+                        ret = ihevcd_jobq_queue((jobq_t *)ps_codec->s_parse.pv_proc_jobq,
+                                                &s_job, sizeof(proc_job_t), 1);
+                        if(ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+                            return (WORD32)ret;
+                    }
+                }
+            }
+            /* Reached end of frame : Signal terminate */
+            /* The terminate flag is checked only after all the jobs are dequeued */
+            ret = ihevcd_jobq_terminate((jobq_t *)ps_codec->s_parse.pv_proc_jobq);
+
+            while(1)
+            {
+                IHEVCD_ERROR_T ret;
+                proc_job_t s_job;
+                process_ctxt_t *ps_proc;
+
+                /* i4_num_cores - 1 contexts are currently being used by other threads */
+                ps_proc = &ps_codec->as_process[ps_codec->i4_num_cores - 1];
+
+                ret = ihevcd_jobq_dequeue((jobq_t *)ps_proc->pv_proc_jobq, &s_job,
+                                          sizeof(proc_job_t), 1);
+                if((IHEVCD_ERROR_T)IHEVCD_SUCCESS != ret)
+                    break;
+
+                ps_proc->i4_ctb_cnt = s_job.i2_ctb_cnt;
+                ps_proc->i4_ctb_x = s_job.i2_ctb_x;
+                ps_proc->i4_ctb_y = s_job.i2_ctb_y;
+                ps_proc->i4_cur_slice_idx = s_job.i2_slice_idx;
+
+                if(CMD_PROCESS == s_job.i4_cmd)
+                {
+                    ihevcd_init_proc_ctxt(ps_proc, s_job.i4_tu_coeff_data_ofst);
+#ifdef GPU_BUILD
+                    if(s_job.i2_wait)
+                    {
+                        ihevcd_gpu_mc_wait(ps_proc, s_job.i2_granularity_idx);
+                    }
+
+#endif
+
+                    ihevcd_process(ps_proc);
+                }
+                else if(CMD_FMTCONV == s_job.i4_cmd)
+                {
+                    sps_t *ps_sps = ps_codec->s_parse.ps_sps;
+                    WORD32 num_rows = 1 << ps_sps->i1_log2_ctb_size;
+                    if(0 == ps_proc->i4_init_done)
+                    {
+                        ihevcd_init_proc_ctxt(ps_proc, 0);
+                    }
+
+                    num_rows = MIN(num_rows, (ps_codec->i4_disp_ht - (s_job.i2_ctb_y << ps_sps->i1_log2_ctb_size)));
+                    if(num_rows < 0)
+                        num_rows = 0;
+
+                    ihevcd_fmt_conv(ps_codec, ps_proc,
+                                    ps_dec_ip->s_out_buffer.pu1_bufs[0],
+                                    ps_dec_ip->s_out_buffer.pu1_bufs[1],
+                                    ps_dec_ip->s_out_buffer.pu1_bufs[2],
+                                    s_job.i2_ctb_y << ps_sps->i1_log2_ctb_size,
+                                    num_rows);
+                }
+            }
+        }
+        /* When running in single core mode, convert/copy the frame to the output buffer here */
+        /* Only if the codec is in non-shared mode, or in shared mode but needs 420P output */
+        else if((ps_codec->ps_disp_buf) && ((0 == ps_codec->i4_share_disp_buf) ||
+                                            (IV_YUV_420P == ps_codec->e_chroma_fmt)) &&
+                        (ps_codec->s_parse.i4_end_of_frame))
+        {
+            process_ctxt_t *ps_proc = &ps_codec->as_process[proc_idx];
+            /* Set remaining number of rows to be processed */
+            ps_codec->s_fmt_conv.i4_num_rows = ps_codec->i4_disp_ht
+                            - ps_codec->s_fmt_conv.i4_cur_row;
+            if(0 == ps_proc->i4_init_done)
+            {
+                ihevcd_init_proc_ctxt(ps_proc, 0);
+            }
+
+            if(ps_codec->s_fmt_conv.i4_num_rows < 0)
+                ps_codec->s_fmt_conv.i4_num_rows = 0;
+
+            ret = ihevcd_fmt_conv(ps_codec, ps_proc,
+                                  ps_dec_ip->s_out_buffer.pu1_bufs[0],
+                                  ps_dec_ip->s_out_buffer.pu1_bufs[1],
+                                  ps_dec_ip->s_out_buffer.pu1_bufs[2],
+                                  ps_codec->s_fmt_conv.i4_cur_row,
+                                  ps_codec->s_fmt_conv.i4_num_rows);
+            ps_codec->s_fmt_conv.i4_cur_row += ps_codec->s_fmt_conv.i4_num_rows;
+
+        }
+#ifdef GPU_BUILD
+        {
+            /*
+             * Add the picture to the display buffer manager and free the MV buffers.
+             */
+            {
+
+                ihevc_disp_mgr_add(ps_codec->pv_disp_buf_mgr,
+                                   ps_codec->as_process[proc_idx].i4_cur_pic_buf_id,
+                                   ps_codec->as_process[proc_idx].ps_slice_hdr->i4_abs_pic_order_cnt,
+                                   ps_codec->as_process[proc_idx].ps_cur_pic);
+            }
+            ihevcd_free_ref_mv_buffers(ps_codec);
+            ihevcd_gpu_mc_pic_deinit(ps_codec);
+
+        }
+#endif
+
+
+        DEBUG_DUMP_MV_MAP(ps_codec);
+
+        /* Mark MV Buf as needed for reference */
+        ihevc_buf_mgr_set_status((buf_mgr_t *)ps_codec->pv_mv_buf_mgr,
+                                 ps_codec->as_process[proc_idx].i4_cur_mv_bank_buf_id,
+                                 BUF_MGR_REF);
+
+        /* Mark pic buf as needed for reference */
+        ihevc_buf_mgr_set_status((buf_mgr_t *)ps_codec->pv_pic_buf_mgr,
+                                 ps_codec->as_process[proc_idx].i4_cur_pic_buf_id,
+                                 BUF_MGR_REF);
+
+        /* Mark pic buf as needed for display */
+        ihevc_buf_mgr_set_status((buf_mgr_t *)ps_codec->pv_pic_buf_mgr,
+                                 ps_codec->as_process[proc_idx].i4_cur_pic_buf_id,
+                                 BUF_MGR_DISP);
+
+        /* Insert the current picture as short term reference */
+        ihevc_dpb_mgr_insert_ref((dpb_mgr_t *)ps_codec->pv_dpb_mgr,
+                                 ps_codec->as_process[proc_idx].ps_cur_pic,
+                                 ps_codec->as_process[proc_idx].i4_cur_pic_buf_id);
+
+        /* If a frame was displayed (in non-shared mode), release it from the display manager */
+        if((0 == ps_codec->i4_share_disp_buf) && (ps_codec->ps_disp_buf))
+            ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_pic_buf_mgr,
+                                  ps_codec->i4_disp_buf_id, BUF_MGR_DISP);
+
+        /* Wait for threads */
+        for(i = 0; i < (ps_codec->i4_num_cores - 1); i++)
+        {
+            if(ps_codec->ai4_process_thread_created[i])
+            {
+                ithread_join(ps_codec->apv_process_thread_handle[i], NULL);
+                ps_codec->ai4_process_thread_created[i] = 0;
+            }
+        }
+
+        DEBUG_VALIDATE_PADDED_REGION(&ps_codec->as_process[proc_idx]);
+        if(ps_codec->u4_pic_cnt > 0)
+        {
+            DEBUG_DUMP_PIC_PU(ps_codec);
+        }
+        DEBUG_DUMP_PIC_BUFFERS(ps_codec);
+
+        /* Increment the number of pictures decoded */
+        ps_codec->u4_pic_cnt++;
+    }
+    ihevcd_fill_outargs(ps_codec, ps_dec_ip, ps_dec_op);
+
+    if(1 == ps_dec_op->u4_output_present)
+    {
+        WORD32 xpos = ps_codec->i4_disp_wd - 32 - LOGO_WD;
+        WORD32 ypos = ps_codec->i4_disp_ht - 32 - LOGO_HT;
+
+        if(ypos < 0)
+            ypos = 0;
+
+        if(xpos < 0)
+            xpos = 0;
+
+        INSERT_LOGO(ps_dec_ip->s_out_buffer.pu1_bufs[0],
+                    ps_dec_ip->s_out_buffer.pu1_bufs[1],
+                    ps_dec_ip->s_out_buffer.pu1_bufs[2], ps_codec->i4_disp_strd,
+                    xpos,
+                    ypos,
+                    ps_codec->e_chroma_fmt,
+                    ps_codec->i4_disp_wd,
+                    ps_codec->i4_disp_ht);
+    }
+
+
+    return ret;
+}
+
diff --git a/decoder/ihevcd_decode.h b/decoder/ihevcd_decode.h
new file mode 100644
index 0000000..dfe6d5f
--- /dev/null
+++ b/decoder/ihevcd_decode.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_decode.h
+*
+* @brief
+*  Header for main decode function
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_DECODE_H_
+#define _IHEVCD_DECODE_H_
+
+WORD32 ihevcd_decode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op);
+
+#endif /* _IHEVCD_DECODE_H_ */
diff --git a/decoder/ihevcd_defs.h b/decoder/ihevcd_defs.h
new file mode 100644
index 0000000..dec341c
--- /dev/null
+++ b/decoder/ihevcd_defs.h
@@ -0,0 +1,481 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_defs.h
+*
+* @brief
+*  Definitions used in the decoder
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_DEFS_H_
+#define _IHEVCD_DEFS_H_
+
+
+/*****************************************************************************/
+/* Width and height restrictions                                             */
+/*****************************************************************************/
+/**
+ * Minimum width supported by codec
+ */
+#define MIN_WD   64
+
+/**
+ * Maximum width supported by codec
+ */
+
+#define MAX_WD   8448
+
+/**
+ * Minimum height supported by codec
+ */
+#define MIN_HT   64
+
+/**
+ * Maximum height supported by codec
+ */
+
+#define MAX_HT   4320
+
+/*****************************************************************************/
+/* Padding sizes                                                             */
+/*****************************************************************************/
+/**
+ * Padding used for top of the frame
+ */
+#define PAD_TOP     80
+
+/**
+ * Padding used for bottom of the frame
+ */
+#define PAD_BOT     80
+
+/**
+ * Padding used at left of the frame
+ */
+#define PAD_LEFT    80
+
+/**
+ * Padding used at right of the frame
+ */
+#define PAD_RIGHT   80
+/**
+ * Padding for width
+ */
+#define PAD_WD      (PAD_LEFT + PAD_RIGHT)
+/**
+ * Padding for height
+ */
+#define PAD_HT      (PAD_TOP  + PAD_BOT)
+
+/*****************************************************************************/
+/* Number of frame restrictions                                              */
+/*****************************************************************************/
+/**
+ *  Maximum number of reference buffers in DPB manager
+ */
+#define MAX_REF_CNT  32
+
+/*****************************************************************************/
+/* Num cores releated defs                                                   */
+/*****************************************************************************/
+/**
+ *  Maximum number of cores
+ */
+#define MAX_NUM_CORES       8
+
+/**
+ *  Maximum number of threads for pixel processing
+ */
+#define MAX_PROCESS_THREADS MAX_NUM_CORES
+
+/*****************************************************************************/
+/* Profile and level restrictions                                            */
+/*****************************************************************************/
+/**
+ * Max level supported by the codec
+ */
+#define MAX_LEVEL  IHEVC_LEVEL_62
+/**
+ * Min level supported by the codec
+ */
+
+#define MIN_LEVEL  IHEVC_LEVEL_10
+
+
+/**
+ * Maximum number of slice headers that are held in memory simultaneously.
+ * For a single core implementation, one slice header is enough.
+ * For multi-core, the parsing thread needs to ensure that a slice header is
+ * kept until the last CB in that slice is decoded; it has to wait until the
+ * last CB of a slice is consumed before overwriting the slice header.
+ * MAX_SLICE_HDR_CNT is assumed to be a power of 2
+ */
+
+#define LOG2_MAX_SLICE_HDR_CNT 8
+#define MAX_SLICE_HDR_CNT (1 << LOG2_MAX_SLICE_HDR_CNT)
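+
+/*
+ * Since MAX_SLICE_HDR_CNT is a power of 2, slice header indices wrap with a
+ * mask instead of a modulo, e.g. as done while parsing:
+ *
+ *     idx = cur_slice_idx & (MAX_SLICE_HDR_CNT - 1);
+ */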
+
+/* Number of NOP instructions to wait before yielding in process thread */
+#define PROC_NOP_CNT (8 * 128)
+
+
+/** Max QP delta that can be signalled */
+#define TU_MAX_QP_DELTA_ABS     5
+
+/** Max QP delta context increment that can be used for CABAC context */
+#define CTXT_MAX_QP_DELTA_ABS   1
+
+/*
+ * Flag indicating whether to perform in-loop filtering (ILF) and padding at
+ * frame level or at CTB level
+ */
+#define FRAME_ILF_PAD 0
+
+#define MAX_NUM_CTBS_IN_FRAME (MAX_WD * MAX_HT / MIN_CTB_SIZE / MIN_CTB_SIZE)
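+/* For example, assuming MIN_CTB_SIZE == 16, this evaluates to 8448 * 4320 / 256 = 142560 */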
+
+/* Maximum slice segments allowed per frame in Level 6.2 */
+#define MAX_SLICE_SEGMENTS_IN_FRAME 600
+
+#ifdef GPU_BUILD
+/**
+ * Buffer allocated for ps_tu is re-used after RESET_TU_BUF_NCTB
+ * Set this to MAX_NUM_CTBS_IN_FRAME to disable reuse
+ * If built for GPU, always set to maximum.
+ */
+#define RESET_TU_BUF_NCTB MAX_NUM_CTBS_IN_FRAME
+#else
+/**
+ * Buffer allocated for ps_tu is re-used after RESET_TU_BUF_NCTB
+ * Set this to MAX_NUM_CTBS_IN_FRAME to disable reuse
+ */
+#define RESET_TU_BUF_NCTB MAX_NUM_CTBS_IN_FRAME
+#endif
+/**
+ * Flag whether to shift the CTB for SAO
+ */
+#define SAO_PROCESS_SHIFT_CTB 1
+
+/**
+ * Minimum bitstream buffer size
+ */
+#define MIN_BITSBUF_SIZE (1024 * 1024)
+/**
+ *****************************************************************************
+ * Macro to compute total size required to hold one set of scaling matrices
+ *****************************************************************************
+ */
+#define SCALING_MAT_SIZE(m_scaling_mat_size)                                 \
+{                                                                            \
+    m_scaling_mat_size = 6 * TRANS_SIZE_4 * TRANS_SIZE_4;                    \
+    m_scaling_mat_size += 6 * TRANS_SIZE_8 * TRANS_SIZE_8;                   \
+    m_scaling_mat_size += 6 * TRANS_SIZE_16 * TRANS_SIZE_16;                 \
+    m_scaling_mat_size += 2 * TRANS_SIZE_32 * TRANS_SIZE_32;                 \
+}
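+
+/*
+ * Usage sketch (assuming TRANS_SIZE_N == N):
+ *
+ *     WORD32 m_size;
+ *     SCALING_MAT_SIZE(m_size);
+ *     m_size is now 6*4*4 + 6*8*8 + 6*16*16 + 2*32*32 = 4064 coefficients
+ */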
+
+/** If num_cores is greater than MV_PRED_NUM_CORES_THRESHOLD, then MV prediction and
+    boundary strength computation are done on the process side instead of the parse side */
+#define MV_PRED_NUM_CORES_THRESHOLD 4
+
+/**
+ ***************************************************************************
+ * Enum to hold the various mem records being requested
+ ****************************************************************************
+ */
+enum
+{
+    /**
+     * Codec Object at API level
+     */
+    MEM_REC_IV_OBJ,
+
+    /**
+     * Codec context
+     */
+    MEM_REC_CODEC,
+
+    /**
+     * Bitstream buffer which holds emulation prevention removed bytes
+     */
+    MEM_REC_BITSBUF,
+
+    /**
+     * Buffer to hold TU structures and coeff data
+     */
+    MEM_REC_TU_DATA,
+
+    /**
+     * Motion vector bank
+     */
+    MEM_REC_MVBANK,
+
+    /**
+     * Holds mem records passed to the codec.
+     */
+    MEM_REC_BACKUP,
+
+    /**
+     * Holds VPS
+     */
+    MEM_REC_VPS,
+
+    /**
+     * Holds SPS
+     */
+    MEM_REC_SPS,
+
+    /**
+     * Holds PPS
+     */
+    MEM_REC_PPS,
+
+    /**
+     * Holds Slice Headers
+     */
+    MEM_REC_SLICE_HDR,
+
+    /**
+     * Holds tile information such as start position, widths and heights
+     */
+    MEM_REC_TILE,
+
+    /**
+     * Holds entry point offsets for tiles and entropy sync points
+     */
+    MEM_REC_ENTRY_OFST,
+
+    /**
+     * Holds scaling matrices
+     */
+    MEM_REC_SCALING_MAT,
+
+    /**
+     * Holds one row skip_flag at 8x8 level used during parsing
+     */
+    MEM_REC_PARSE_SKIP_FLAG,
+
+    /**
+     * Holds one row ctb_tree_depth at 8x8 level used during parsing
+     */
+    MEM_REC_PARSE_CT_DEPTH,
+
+    /**
+     * Holds one row luma intra pred mode at 8x8 level used during parsing
+     */
+    MEM_REC_PARSE_INTRA_PRED_MODE,
+
+    /**
+     * Holds intra flag at 8x8 level for entire frame
+     * This is kept at frame level so that processing thread also can use this
+     * data during intra prediction and compute BS
+     */
+    MEM_REC_INTRA_FLAG,
+
+    /**
+     * Holds transquant bypass flag at 8x8 level for entire frame
+     * This is kept at frame level so that processing thread also can use this
+     */
+    MEM_REC_TRANSQUANT_BYPASS_FLAG,
+
+    /**
+     * Holds thread handles
+     */
+    MEM_REC_THREAD_HANDLE,
+
+    /**
+     * Holds memory for Process JOB Queue
+     */
+    MEM_REC_PROC_JOBQ,
+
+    /**
+     * Contains a status map indicating parse status on a per-CTB basis
+     */
+    MEM_REC_PARSE_MAP,
+
+    /**
+     * Contains a status map indicating processing status on a per-CTB basis
+     */
+    MEM_REC_PROC_MAP,
+
+    /**
+     * Holds display buffer manager context
+     */
+    MEM_REC_DISP_MGR,
+
+    /**
+     * Holds dpb manager context
+     */
+    MEM_REC_DPB_MGR,
+
+    /**
+     * Holds top and left neighbors' pu_idx array w.r.t picture level pu array
+     */
+    MEM_REC_PIC_PU_IDX_NEIGHBOR,
+
+    /**
+     * Holds intermediate buffers needed during processing stage
+     * Memory for process contexts is allocated in this memtab
+     */
+    MEM_REC_PROC_SCRATCH,
+
+    /**
+     * Holds intermediate buffers needed during SAO processing
+     */
+    MEM_REC_SAO_SCRATCH,
+
+    /**
+     * Holds buffers for vert_bs, horz_bs and QP (all frame level)
+     */
+    MEM_REC_BS_QP,
+
+    /**
+     * Contains a slice map indicating the slice index for each CTB
+     */
+    MEM_REC_TILE_IDX,
+
+    /**
+     * Holds buffers for array of SAO structures
+     */
+    MEM_REC_SAO,
+
+#ifdef GPU_BUILD
+    /**
+     * Holds buffer GPU context
+     */
+    MEM_REC_GPU,
+#endif
+    /**
+     * Holds picture buffer manager context and array of pic_buf_ts
+     * Also holds reference picture buffers in non-shared mode
+     */
+    MEM_REC_REF_PIC,
+
+
+
+    /**
+     * Place holder to compute number of memory records.
+     */
+    MEM_REC_CNT
+    /* Do not add anything below */
+};
+
+
+
+#define DISABLE_DEBLOCK_INTERVAL 8
+#define DISABLE_SAO_INTERVAL 8
+
+/**
+ ****************************************************************************
+ * Disable deblock levels
+ * Level 0 enables deblocking completely and level 4 disables completely
+ * Other levels are intermediate values to control deblocking level
+ ****************************************************************************
+ */
+enum
+{
+    /**
+     * Enable deblocking completely
+     */
+    DISABLE_DEBLK_LEVEL_0,
+    /**
+     * Disable only within CTB edges - Not supported currently
+     */
+    DISABLE_DEBLK_LEVEL_1,
+
+    /**
+     * Enable deblocking once in DISABLE_DEBLOCK_INTERVAL number of pictures
+     * and for I slices
+     */
+    DISABLE_DEBLK_LEVEL_2,
+
+    /**
+     * Enable deblocking only for I slices
+     */
+    DISABLE_DEBLK_LEVEL_3,
+
+    /**
+     * Disable deblocking completely
+     */
+    DISABLE_DEBLK_LEVEL_4
+};
+
+enum
+{
+    /**
+     * Enable SAO completely
+     */
+    DISABLE_SAO_LEVEL_0,
+    /**
+     * Disable only within CTB edges - Not supported currently
+     */
+    DISABLE_SAO_LEVEL_1,
+
+    /**
+     * Enable SAO once in DISABLE_SAO_INTERVAL number of pictures
+     * and for I slices
+     */
+    DISABLE_SAO_LEVEL_2,
+
+    /**
+     * Enable SAO only for I slices
+     */
+    DISABLE_SAO_LEVEL_3,
+
+    /**
+     * Disable SAO completely
+     */
+    DISABLE_SAO_LEVEL_4
+};
+
+/**
+ ****************************************************************************
+ * Number of buffers for I/O based on format
+ ****************************************************************************
+ */
+#define MIN_IN_BUFS             1
+#define MIN_OUT_BUFS_420        3
+#define MIN_OUT_BUFS_422ILE     1
+#define MIN_OUT_BUFS_RGB565     1
+#define MIN_OUT_BUFS_RGBA8888   1
+#define MIN_OUT_BUFS_420SP      2
+
+/**
+ ****************************************************************************
+ * Definitions related to MV pred mv merge
+ ****************************************************************************
+ */
+#define MAX_NUM_MERGE_CAND 5
+
+#define MAX_NUM_MV_NBR 5
+
+#define MAX_MVP_LIST_CAND 2
+#define MAX_MVP_LIST_CAND_MEM  (MAX_MVP_LIST_CAND + 1)
+
+
+
+#endif /*_IHEVCD_DEFS_H_*/
diff --git a/decoder/ihevcd_error.h b/decoder/ihevcd_error.h
new file mode 100644
index 0000000..7d2b255
--- /dev/null
+++ b/decoder/ihevcd_error.h
@@ -0,0 +1,127 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_error.h
+*
+* @brief
+*  Definitions related to error handling
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_ERROR_H_
+#define _IHEVCD_ERROR_H_
+
+/**
+ * Enumerations for error codes used in the codec.
+ * Not all these are expected to be returned to the application.
+ * Only select few will be exported
+ */
+typedef enum
+{
+    /**
+     * VPS id more than MAX_VPS_CNT
+     */
+    IHEVCD_UNSUPPORTED_VPS_ID  = IVD_DUMMY_ELEMENT_FOR_CODEC_EXTENSIONS + 0x300,
+    /**
+     * SPS id more than MAX_SPS_CNT
+     */
+
+    IHEVCD_UNSUPPORTED_SPS_ID,
+    /**
+     * PPS id more than MAX_PPS_CNT
+     */
+
+    IHEVCD_UNSUPPORTED_PPS_ID,
+
+    /**
+     * Invalid parameter while decoding
+     */
+    IHEVCD_INVALID_PARAMETER,
+
+    /**
+     * Invalid header
+     */
+    IHEVCD_INVALID_HEADER,
+
+    /**
+     * Insufficient memory allocated for MV Bank
+     */
+    IHEVCD_INSUFFICIENT_MEM_MVBANK,
+
+    /**
+     * Insufficient memory allocated for Picture Buffer
+     */
+    IHEVCD_INSUFFICIENT_MEM_PICBUF,
+
+    /**
+     * Buffer manager error
+     */
+    IHEVCD_BUF_MGR_ERROR,
+
+    /**
+     * No free MV Bank buffer available to store current pic
+     */
+    IHEVCD_NO_FREE_MVBANK,
+
+    /**
+     * No free picture buffer available to store current pic
+     */
+    IHEVCD_NO_FREE_PICBUF,
+    /**
+     * Reached slice header in header mode
+     */
+    IHEVCD_SLICE_IN_HEADER_MODE,
+
+    /**
+     * Ignore current slice and continue
+     */
+    IHEVCD_IGNORE_SLICE,
+
+    /**
+     * Reference Picture not found
+     */
+    IHEVCD_REF_PIC_NOT_FOUND,
+
+    /**
+     * Bit depth is greater than 8
+     */
+    IHEVCD_UNSUPPORTED_BIT_DEPTH,
+
+    /**
+     * Limit on the number of frames decoded
+     */
+    IHEVCD_NUM_FRAMES_LIMIT_REACHED,
+
+    /**
+     * VUI parameters not found
+     */
+    IHEVCD_VUI_PARAMS_NOT_FOUND,
+
+}IHEVCD_ERROR_T;
+#endif /* _IHEVCD_ERROR_H_ */
diff --git a/decoder/ihevcd_fmt_conv.c b/decoder/ihevcd_fmt_conv.c
new file mode 100644
index 0000000..df62355
--- /dev/null
+++ b/decoder/ihevcd_fmt_conv.c
@@ -0,0 +1,909 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_fmt_conv.c
+*
+* @brief
+*  Contains functions for format conversion or frame copy of output buffer
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_profile.h"
+
+/**
+*******************************************************************************
+*
+* @brief Function used for converting a 420SP buffer to RGB565
+*
+* @par   Description
+* Function used for converting a 420SP buffer to RGB565
+*
+* @param[in] pu1_y_src
+*   Input Y pointer
+*
+* @param[in] pu1_uv_src
+*   Input UV pointer (UV is interleaved either in UV or VU format)
+*
+* @param[out] pu2_rgb_dst
+*   Output RGB565 pointer
+*
+* @param[in] wd
+*   Width
+*
+* @param[in] ht
+*   Height
+*
+* @param[in] src_y_strd
+*   Input Y Stride
+*
+* @param[in] src_uv_strd
+*   Input UV stride
+*
+* @param[in] dst_strd
+*   Output stride
+*
+* @param[in] is_u_first
+*   Flag to indicate if U is the first byte in the input chroma part
+*
+* @returns None
+*
+* @remarks In case there is a need to perform a partial frame copy, it can be
+* done by passing appropriate source and destination pointers and appropriate
+* values for wd and ht
+*
+*******************************************************************************
+*/
+void ihevcd_fmt_conv_420sp_to_rgb565(UWORD8 *pu1_y_src,
+                                     UWORD8 *pu1_uv_src,
+                                     UWORD16 *pu2_rgb_dst,
+                                     WORD32 wd,
+                                     WORD32 ht,
+                                     WORD32 src_y_strd,
+                                     WORD32 src_uv_strd,
+                                     WORD32 dst_strd,
+                                     WORD32 is_u_first)
+{
+
+
+    WORD16  i2_r, i2_g, i2_b;
+    UWORD32  u4_r, u4_g, u4_b;
+    WORD16  i2_i, i2_j;
+    UWORD8  *pu1_y_src_nxt;
+    UWORD16 *pu2_rgb_dst_NextRow;
+
+    UWORD8 *pu1_u_src, *pu1_v_src;
+
+    if(is_u_first)
+    {
+        pu1_u_src = (UWORD8 *)pu1_uv_src;
+        pu1_v_src = (UWORD8 *)pu1_uv_src + 1;
+    }
+    else
+    {
+        pu1_u_src = (UWORD8 *)pu1_uv_src + 1;
+        pu1_v_src = (UWORD8 *)pu1_uv_src;
+    }
+
+    pu1_y_src_nxt   = pu1_y_src + src_y_strd;
+    pu2_rgb_dst_NextRow = pu2_rgb_dst + dst_strd;
+
+    for(i2_i = 0; i2_i < (ht >> 1); i2_i++)
+    {
+        for(i2_j = (wd >> 1); i2_j > 0; i2_j--)
+        {
+            i2_b = ((*pu1_u_src - 128) * COEFF4 >> 13);
+            i2_g = ((*pu1_u_src - 128) * COEFF2 + (*pu1_v_src - 128) * COEFF3) >> 13;
+            i2_r = ((*pu1_v_src - 128) * COEFF1) >> 13;
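+            /*
+             * Q13 fixed-point YUV->RGB (presumably BT.601): COEFF1..COEFF4
+             * approximate 1.402, -0.344, -0.714 and 1.772 scaled by 2^13,
+             * giving e.g. R = CLIP_U8(Y + (((V - 128) * COEFF1) >> 13)).
+             * Each pixel is then packed as 5-6-5:
+             * ((R >> 3) << 11) | ((G >> 2) << 5) | (B >> 3)
+             */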
+
+            pu1_u_src += 2;
+            pu1_v_src += 2;
+            /* pixel 0 */
+            /* B */
+            u4_b = CLIP_U8(*pu1_y_src + i2_b);
+            u4_b >>= 3;
+            /* G */
+            u4_g = CLIP_U8(*pu1_y_src + i2_g);
+            u4_g >>= 2;
+            /* R */
+            u4_r = CLIP_U8(*pu1_y_src + i2_r);
+            u4_r >>= 3;
+
+            pu1_y_src++;
+            *pu2_rgb_dst++ = ((u4_r << 11) | (u4_g << 5) | u4_b);
+
+            /* pixel 1 */
+            /* B */
+            u4_b = CLIP_U8(*pu1_y_src + i2_b);
+            u4_b >>= 3;
+            /* G */
+            u4_g = CLIP_U8(*pu1_y_src + i2_g);
+            u4_g >>= 2;
+            /* R */
+            u4_r = CLIP_U8(*pu1_y_src + i2_r);
+            u4_r >>= 3;
+
+            pu1_y_src++;
+            *pu2_rgb_dst++ = ((u4_r << 11) | (u4_g << 5) | u4_b);
+
+            /* pixel 2 */
+            /* B */
+            u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b);
+            u4_b >>= 3;
+            /* G */
+            u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g);
+            u4_g >>= 2;
+            /* R */
+            u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r);
+            u4_r >>= 3;
+
+            pu1_y_src_nxt++;
+            *pu2_rgb_dst_NextRow++ = ((u4_r << 11) | (u4_g << 5) | u4_b);
+
+            /* pixel 3 */
+            /* B */
+            u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b);
+            u4_b >>= 3;
+            /* G */
+            u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g);
+            u4_g >>= 2;
+            /* R */
+            u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r);
+            u4_r >>= 3;
+
+            pu1_y_src_nxt++;
+            *pu2_rgb_dst_NextRow++ = ((u4_r << 11) | (u4_g << 5) | u4_b);
+
+        }
+
+        pu1_u_src = pu1_u_src + src_uv_strd - wd;
+        pu1_v_src = pu1_v_src + src_uv_strd - wd;
+
+        pu1_y_src = pu1_y_src + (src_y_strd << 1) - wd;
+        pu1_y_src_nxt = pu1_y_src_nxt + (src_y_strd << 1) - wd;
+
+        pu2_rgb_dst = pu2_rgb_dst_NextRow - wd + dst_strd;
+        pu2_rgb_dst_NextRow = pu2_rgb_dst_NextRow + (dst_strd << 1) - wd;
+    }
+
+
+}
+
+void ihevcd_fmt_conv_420sp_to_rgba8888(UWORD8 *pu1_y_src,
+                                       UWORD8 *pu1_uv_src,
+                                       UWORD32 *pu4_rgba_dst,
+                                       WORD32 wd,
+                                       WORD32 ht,
+                                       WORD32 src_y_strd,
+                                       WORD32 src_uv_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 is_u_first)
+{
+
+
+    WORD16  i2_r, i2_g, i2_b;
+    UWORD32  u4_r, u4_g, u4_b;
+    WORD16  i2_i, i2_j;
+    UWORD8  *pu1_y_src_nxt;
+    UWORD32 *pu4_rgba_dst_NextRow;
+
+    UWORD8 *pu1_u_src, *pu1_v_src;
+
+    if(is_u_first)
+    {
+        pu1_u_src = (UWORD8 *)pu1_uv_src;
+        pu1_v_src = (UWORD8 *)pu1_uv_src + 1;
+    }
+    else
+    {
+        pu1_u_src = (UWORD8 *)pu1_uv_src + 1;
+        pu1_v_src = (UWORD8 *)pu1_uv_src;
+    }
+
+    pu1_y_src_nxt   = pu1_y_src + src_y_strd;
+    pu4_rgba_dst_NextRow = pu4_rgba_dst + dst_strd;
+
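+    /* Pixels below are packed as (R << 16) | (G << 8) | B, with the top
+     * (alpha) byte left at zero */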
+    for(i2_i = 0; i2_i < (ht >> 1); i2_i++)
+    {
+        for(i2_j = (wd >> 1); i2_j > 0; i2_j--)
+        {
+            i2_b = ((*pu1_u_src - 128) * COEFF4 >> 13);
+            i2_g = ((*pu1_u_src - 128) * COEFF2 + (*pu1_v_src - 128) * COEFF3) >> 13;
+            i2_r = ((*pu1_v_src - 128) * COEFF1) >> 13;
+
+            pu1_u_src += 2;
+            pu1_v_src += 2;
+            /* pixel 0 */
+            /* B */
+            u4_b = CLIP_U8(*pu1_y_src + i2_b);
+            /* G */
+            u4_g = CLIP_U8(*pu1_y_src + i2_g);
+            /* R */
+            u4_r = CLIP_U8(*pu1_y_src + i2_r);
+
+            pu1_y_src++;
+            *pu4_rgba_dst++ = ((u4_r << 16) | (u4_g << 8) | (u4_b << 0));
+
+            /* pixel 1 */
+            /* B */
+            u4_b = CLIP_U8(*pu1_y_src + i2_b);
+            /* G */
+            u4_g = CLIP_U8(*pu1_y_src + i2_g);
+            /* R */
+            u4_r = CLIP_U8(*pu1_y_src + i2_r);
+
+            pu1_y_src++;
+            *pu4_rgba_dst++ = ((u4_r << 16) | (u4_g << 8) | (u4_b << 0));
+
+            /* pixel 2 */
+            /* B */
+            u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b);
+            /* G */
+            u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g);
+            /* R */
+            u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r);
+
+            pu1_y_src_nxt++;
+            *pu4_rgba_dst_NextRow++ = ((u4_r << 16) | (u4_g << 8) | (u4_b << 0));
+
+            /* pixel 3 */
+            /* B */
+            u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b);
+            /* G */
+            u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g);
+            /* R */
+            u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r);
+
+            pu1_y_src_nxt++;
+            *pu4_rgba_dst_NextRow++ = ((u4_r << 16) | (u4_g << 8) | (u4_b << 0));
+
+        }
+
+        pu1_u_src = pu1_u_src + src_uv_strd - wd;
+        pu1_v_src = pu1_v_src + src_uv_strd - wd;
+
+        pu1_y_src = pu1_y_src + (src_y_strd << 1) - wd;
+        pu1_y_src_nxt = pu1_y_src_nxt + (src_y_strd << 1) - wd;
+
+        pu4_rgba_dst = pu4_rgba_dst_NextRow - wd + dst_strd;
+        pu4_rgba_dst_NextRow = pu4_rgba_dst_NextRow + (dst_strd << 1) - wd;
+    }
+
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief Function used for copying a 420SP buffer
+*
+* @par   Description
+* Function used for copying a 420SP buffer
+*
+* @param[in] pu1_y_src
+*   Input Y pointer
+*
+* @param[in] pu1_uv_src
+*   Input UV pointer (UV is interleaved either in UV or VU format)
+*
+* @param[out] pu1_y_dst
+*   Output Y pointer
+*
+* @param[out] pu1_uv_dst
+*   Output UV pointer (UV is interleaved in the same format as that of input)
+*
+* @param[in] wd
+*   Width
+*
+* @param[in] ht
+*   Height
+*
+* @param[in] src_y_strd
+*   Input Y Stride
+*
+* @param[in] src_uv_strd
+*   Input UV stride
+*
+* @param[in] dst_y_strd
+*   Output Y stride
+*
+* @param[in] dst_uv_strd
+*   Output UV stride
+*
+* @returns None
+*
+* @remarks In case there is a need to perform a partial frame copy, it can be
+* done by passing appropriate source and destination pointers and appropriate
+* values for wd and ht
+*
+*******************************************************************************
+*/
+
+void ihevcd_fmt_conv_420sp_to_420sp(UWORD8 *pu1_y_src,
+                                    UWORD8 *pu1_uv_src,
+                                    UWORD8 *pu1_y_dst,
+                                    UWORD8 *pu1_uv_dst,
+                                    WORD32 wd,
+                                    WORD32 ht,
+                                    WORD32 src_y_strd,
+                                    WORD32 src_uv_strd,
+                                    WORD32 dst_y_strd,
+                                    WORD32 dst_uv_strd)
+{
+    UWORD8 *pu1_src, *pu1_dst;
+    WORD32 num_rows, num_cols, src_strd, dst_strd;
+    WORD32 i;
+
+    /* copy luma */
+    pu1_src = (UWORD8 *)pu1_y_src;
+    pu1_dst = (UWORD8 *)pu1_y_dst;
+
+    num_rows = ht;
+    num_cols = wd;
+
+    src_strd = src_y_strd;
+    dst_strd = dst_y_strd;
+
+    for(i = 0; i < num_rows; i++)
+    {
+        memcpy(pu1_dst, pu1_src, num_cols);
+        pu1_dst += dst_strd;
+        pu1_src += src_strd;
+    }
+
+    /* copy U and V */
+    pu1_src = (UWORD8 *)pu1_uv_src;
+    pu1_dst = (UWORD8 *)pu1_uv_dst;
+
+    num_rows = ht >> 1;
+    num_cols = wd;
+
+    src_strd = src_uv_strd;
+    dst_strd = dst_uv_strd;
+
+    for(i = 0; i < num_rows; i++)
+    {
+        memcpy(pu1_dst, pu1_src, num_cols);
+        pu1_dst += dst_strd;
+        pu1_src += src_strd;
+    }
+    return;
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief Function used for copying a 420SP buffer while swapping U and V
+*
+* @par   Description
+* Function used for copying a 420SP buffer while swapping U and V
+*
+* @param[in] pu1_y_src
+*   Input Y pointer
+*
+* @param[in] pu1_uv_src
+*   Input UV pointer (UV is interleaved either in UV or VU format)
+*
+* @param[out] pu1_y_dst
+*   Output Y pointer
+*
+* @param[out] pu1_uv_dst
+*   Output UV pointer (UV is interleaved in the same format as that of input)
+*
+* @param[in] wd
+*   Width
+*
+* @param[in] ht
+*   Height
+*
+* @param[in] src_y_strd
+*   Input Y Stride
+*
+* @param[in] src_uv_strd
+*   Input UV stride
+*
+* @param[in] dst_y_strd
+*   Output Y stride
+*
+* @param[in] dst_uv_strd
+*   Output UV stride
+*
+* @returns None
+*
+* @remarks In case there is a need to perform a partial frame copy, it can be
+* done by passing appropriate source and destination pointers and appropriate
+* values for wd and ht
+*
+*******************************************************************************
+*/
+void ihevcd_fmt_conv_420sp_to_420sp_swap_uv(UWORD8 *pu1_y_src,
+                                            UWORD8 *pu1_uv_src,
+                                            UWORD8 *pu1_y_dst,
+                                            UWORD8 *pu1_uv_dst,
+                                            WORD32 wd,
+                                            WORD32 ht,
+                                            WORD32 src_y_strd,
+                                            WORD32 src_uv_strd,
+                                            WORD32 dst_y_strd,
+                                            WORD32 dst_uv_strd)
+{
+    UWORD8 *pu1_src, *pu1_dst;
+    WORD32 num_rows, num_cols, src_strd, dst_strd;
+    WORD32 i;
+
+    /* copy luma */
+    pu1_src = (UWORD8 *)pu1_y_src;
+    pu1_dst = (UWORD8 *)pu1_y_dst;
+
+    num_rows = ht;
+    num_cols = wd;
+
+    src_strd = src_y_strd;
+    dst_strd = dst_y_strd;
+
+    for(i = 0; i < num_rows; i++)
+    {
+        memcpy(pu1_dst, pu1_src, num_cols);
+        pu1_dst += dst_strd;
+        pu1_src += src_strd;
+    }
+
+    /* copy U and V */
+    pu1_src = (UWORD8 *)pu1_uv_src;
+    pu1_dst = (UWORD8 *)pu1_uv_dst;
+
+    num_rows = ht >> 1;
+    num_cols = wd;
+
+    src_strd = src_uv_strd;
+    dst_strd = dst_uv_strd;
+
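+    /* Swap each interleaved chroma byte pair so that UV order becomes VU (or vice versa) */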
+    for(i = 0; i < num_rows; i++)
+    {
+        WORD32 j;
+        for(j = 0; j < num_cols; j += 2)
+        {
+            pu1_dst[j + 0] = pu1_src[j + 1];
+            pu1_dst[j + 1] = pu1_src[j + 0];
+        }
+        pu1_dst += dst_strd;
+        pu1_src += src_strd;
+    }
+    return;
+}
+/**
+*******************************************************************************
+*
+* @brief Function used for converting a 420SP buffer to 420P
+*
+* @par   Description
+* Function used for converting a 420SP buffer to 420P
+*
+* @param[in] pu1_y_src
+*   Input Y pointer
+*
+* @param[in] pu1_uv_src
+*   Input UV pointer (UV is interleaved either in UV or VU format)
+*
+* @param[out] pu1_y_dst
+*   Output Y pointer
+*
+* @param[out] pu1_u_dst
+*   Output U pointer
+*
+* @param[out] pu1_v_dst
+*   Output V pointer
+*
+* @param[in] wd
+*   Width
+*
+* @param[in] ht
+*   Height
+*
+* @param[in] src_y_strd
+*   Input Y Stride
+*
+* @param[in] src_uv_strd
+*   Input UV stride
+*
+* @param[in] dst_y_strd
+*   Output Y stride
+*
+* @param[in] dst_uv_strd
+*   Output UV stride
+*
+* @param[in] is_u_first
+*   Flag to indicate if U is the first byte in input chroma part
+*
+* @returns None
+*
+* @remarks In case there is a need to perform a partial frame copy, it can be
+* done by passing appropriate source and destination pointers and appropriate
+* values for wd and ht
+*
+*******************************************************************************
+*/
+
+
+void ihevcd_fmt_conv_420sp_to_420p(UWORD8 *pu1_y_src,
+                                   UWORD8 *pu1_uv_src,
+                                   UWORD8 *pu1_y_dst,
+                                   UWORD8 *pu1_u_dst,
+                                   UWORD8 *pu1_v_dst,
+                                   WORD32 wd,
+                                   WORD32 ht,
+                                   WORD32 src_y_strd,
+                                   WORD32 src_uv_strd,
+                                   WORD32 dst_y_strd,
+                                   WORD32 dst_uv_strd,
+                                   WORD32 is_u_first,
+                                   WORD32 disable_luma_copy)
+{
+    UWORD8 *pu1_src, *pu1_dst;
+    UWORD8 *pu1_u_src, *pu1_v_src;
+    WORD32 num_rows, num_cols, src_strd, dst_strd;
+    WORD32 i, j;
+
+    if(0 == disable_luma_copy)
+    {
+        /* copy luma */
+        pu1_src = (UWORD8 *)pu1_y_src;
+        pu1_dst = (UWORD8 *)pu1_y_dst;
+
+        num_rows = ht;
+        num_cols = wd;
+
+        src_strd = src_y_strd;
+        dst_strd = dst_y_strd;
+
+        for(i = 0; i < num_rows; i++)
+        {
+            memcpy(pu1_dst, pu1_src, num_cols);
+            pu1_dst += dst_strd;
+            pu1_src += src_strd;
+        }
+    }
+    /* de-interleave U and V and copy to destination */
+    if(is_u_first)
+    {
+        pu1_u_src = (UWORD8 *)pu1_uv_src;
+        pu1_v_src = (UWORD8 *)pu1_uv_src + 1;
+    }
+    else
+    {
+        pu1_u_src = (UWORD8 *)pu1_uv_src + 1;
+        pu1_v_src = (UWORD8 *)pu1_uv_src;
+    }
+
+
+    num_rows = ht >> 1;
+    num_cols = wd >> 1;
+
+    src_strd = src_uv_strd;
+    dst_strd = dst_uv_strd;
+
+    for(i = 0; i < num_rows; i++)
+    {
+        for(j = 0; j < num_cols; j++)
+        {
+            pu1_u_dst[j] = pu1_u_src[j * 2];
+            pu1_v_dst[j] = pu1_v_src[j * 2];
+        }
+
+        pu1_u_dst += dst_strd;
+        pu1_v_dst += dst_strd;
+        pu1_u_src += src_strd;
+        pu1_v_src += src_strd;
+    }
+    return;
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief Function used for format conversion or frame copy
+*
+* @par   Description
+* Function used for copying or converting a reference frame to the display
+* buffer in non-shared mode
+*
+* @param[out] pu1_y_dst
+*   Output Y pointer
+*
+* @param[out] pu1_u_dst
+*   Output U/UV pointer (UV is interleaved in the same format as that of input)
+*
+* @param[out] pu1_v_dst
+*   Output V pointer (used in the 420P output case)
+*
+* @param[in] cur_row
+*   Index of the first row to be converted/copied
+*
+* @param[in] num_rows
+*   Number of rows to be converted/copied
+*
+* @returns Error from IHEVCD_ERROR_T
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_fmt_conv(codec_t *ps_codec,
+                               process_ctxt_t *ps_proc,
+                               UWORD8 *pu1_y_dst,
+                               UWORD8 *pu1_u_dst,
+                               UWORD8 *pu1_v_dst,
+                               WORD32 cur_row,
+                               WORD32 num_rows)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    pic_buf_t *ps_disp_pic;
+    UWORD8 *pu1_y_src, *pu1_uv_src;
+    UWORD8 *pu1_y_dst_tmp, *pu1_uv_dst_tmp;
+    UWORD8 *pu1_u_dst_tmp, *pu1_v_dst_tmp;
+    UWORD16 *pu2_rgb_dst_tmp;
+    UWORD32 *pu4_rgb_dst_tmp;
+    WORD32 is_u_first;
+    UWORD8 *pu1_luma;
+    UWORD8 *pu1_chroma;
+    sps_t *ps_sps;
+    WORD32 disable_luma_copy;
+    WORD32 crop_unit_x, crop_unit_y;
+
+    if(0 == num_rows)
+        return ret;
+
+    /* In case processing is disabled, there is no need to format convert/copy */
+    PROFILE_DISABLE_FMT_CONV();
+    ps_sps = ps_proc->ps_sps;
+
+    crop_unit_x = 1;
+    crop_unit_y = 1;
+
+    if(CHROMA_FMT_IDC_YUV420 == ps_sps->i1_chroma_format_idc)
+    {
+        crop_unit_x = 2;
+        crop_unit_y = 2;
+    }
+
+    ps_disp_pic = ps_codec->ps_disp_buf;
+    pu1_luma = ps_disp_pic->pu1_luma;
+    pu1_chroma = ps_disp_pic->pu1_chroma;
+
+
+    /* Take care of cropping */
+    pu1_luma    += ps_codec->i4_strd * ps_sps->i2_pic_crop_top_offset * crop_unit_y + ps_sps->i2_pic_crop_left_offset * crop_unit_x;
+
+    /* Left offset is multiplied by 2 because buffer is UV interleaved */
+    pu1_chroma  += ps_codec->i4_strd * ps_sps->i2_pic_crop_top_offset + ps_sps->i2_pic_crop_left_offset * 2;
+
+
+    is_u_first = (IV_YUV_420SP_UV == ps_codec->e_ref_chroma_fmt) ? 1 : 0;
+
+    /* In case of 420P output, luma copy is disabled in shared mode */
+    disable_luma_copy = 0;
+    if(1 == ps_codec->i4_share_disp_buf)
+    {
+        disable_luma_copy = 1;
+    }
+
+
+
+    {
+        pu1_y_src   = pu1_luma + cur_row * ps_codec->i4_strd;
+        pu1_uv_src  = pu1_chroma + (cur_row / 2) * ps_codec->i4_strd;
+
+        pu2_rgb_dst_tmp  = (UWORD16 *)pu1_y_dst;
+        pu2_rgb_dst_tmp  += cur_row * ps_codec->i4_disp_strd;
+        pu4_rgb_dst_tmp  = (UWORD32 *)pu1_y_dst;
+        pu4_rgb_dst_tmp  += cur_row * ps_codec->i4_disp_strd;
+        pu1_y_dst_tmp  = pu1_y_dst  + cur_row * ps_codec->i4_disp_strd;
+        pu1_uv_dst_tmp = pu1_u_dst  + (cur_row / 2) * ps_codec->i4_disp_strd;
+        pu1_u_dst_tmp = pu1_u_dst  + (cur_row / 2) * ps_codec->i4_disp_strd / 2;
+        pu1_v_dst_tmp = pu1_v_dst  + (cur_row / 2) * ps_codec->i4_disp_strd / 2;
+
+        /* In a multi-threaded implementation, format conversion might be called
+         * before reconstruction is completed. If the frame being converted/copied
+         * is the same as the frame being reconstructed, check how many rows can
+         * be format converted, convert those rows, then check for the remaining
+         * rows, and so on.
+         */
+
+        if((0 == ps_codec->i4_flush_mode) && (ps_codec->i4_disp_buf_id == ps_proc->i4_cur_pic_buf_id) && (1 < ps_codec->i4_num_cores))
+        {
+            WORD32 idx;
+            UWORD8 *pu1_buf;
+            WORD32 status;
+            WORD32 last_row = cur_row + num_rows;
+            WORD32 last_ctb_y;
+            UWORD32 ctb_in_row;
+
+            while(1)
+            {
+                last_row = cur_row + MAX(num_rows, (1 << ps_sps->i1_log2_ctb_size)) +
+                                ps_sps->i2_pic_crop_top_offset * crop_unit_y;
+                last_ctb_y = (last_row >> ps_sps->i1_log2_ctb_size) - 1;
+                /* Since deblocking works with a shift of (-4, -4), wait till the next CTB row is processed */
+                last_ctb_y++;
+                /* In case of a conformance window, an extra wait of one row might be needed */
+                last_ctb_y++;
+                last_ctb_y = MIN(last_ctb_y, (ps_sps->i2_pic_ht_in_ctb - 1));
+
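+                /* Index of the first CTB of row last_ctb_y in the frame-level processing status map */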
+                idx = (last_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+
+                /* Check if the row below is completely processed before proceeding with format conversion */
+                status = 1;
+                for(ctb_in_row = 0; (WORD32)ctb_in_row < ps_sps->i2_pic_wd_in_ctb; ctb_in_row++)
+                {
+#ifdef GPU_BUILD
+                    /* TODO (GPU): later define this for the ARM-only version as well */
+                    pu1_buf = (ps_proc->pu1_proc_map + idx);
+#else
+                    pu1_buf = (ps_codec->pu1_proc_map + idx + ctb_in_row);
+#endif
+                    status &= *pu1_buf;
+                }
+
+                if(status)
+                {
+                    break;
+                }
+                else
+                {
+                    ithread_yield();
+                }
+            }
+        }
+
+
+        if((IV_YUV_420SP_UV == ps_codec->e_chroma_fmt) || (IV_YUV_420SP_VU == ps_codec->e_chroma_fmt))
+        {
+
+            ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr(pu1_y_src, pu1_uv_src,
+                                                                          pu1_y_dst_tmp, pu1_uv_dst_tmp,
+                                                                          ps_codec->i4_disp_wd,
+                                                                          num_rows,
+                                                                          ps_codec->i4_strd,
+                                                                          ps_codec->i4_strd,
+                                                                          ps_codec->i4_disp_strd,
+                                                                          ps_codec->i4_disp_strd);
+        }
+        else if(IV_YUV_420P == ps_codec->e_chroma_fmt)
+        {
+
+            if(0 == disable_luma_copy)
+            {
+                // copy luma
+                WORD32 i;
+                WORD32 num_cols = ps_codec->i4_disp_wd;
+
+                for(i = 0; i < num_rows; i++)
+                {
+                    memcpy(pu1_y_dst_tmp, pu1_y_src, num_cols);
+                    pu1_y_dst_tmp += ps_codec->i4_disp_strd;
+                    pu1_y_src += ps_codec->i4_strd;
+                }
+
+                disable_luma_copy = 1;
+            }
+
+            ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr(pu1_y_src, pu1_uv_src,
+                                                                         pu1_y_dst_tmp, pu1_u_dst_tmp, pu1_v_dst_tmp,
+                                                                         ps_codec->i4_disp_wd,
+                                                                         num_rows,
+                                                                         ps_codec->i4_strd,
+                                                                         ps_codec->i4_strd,
+                                                                         ps_codec->i4_disp_strd,
+                                                                         (ps_codec->i4_disp_strd / 2),
+                                                                         is_u_first,
+                                                                         disable_luma_copy);
+
+        }
+        else if(IV_RGB_565 == ps_codec->e_chroma_fmt)
+        {
+
+            ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr(pu1_y_src, pu1_uv_src,
+                                                                           pu2_rgb_dst_tmp,
+                                                                           ps_codec->i4_disp_wd,
+                                                                           num_rows,
+                                                                           ps_codec->i4_strd,
+                                                                           ps_codec->i4_strd,
+                                                                           ps_codec->i4_disp_strd,
+                                                                           is_u_first);
+
+        }
+        else if(IV_RGBA_8888 == ps_codec->e_chroma_fmt)
+        {
+            ASSERT(is_u_first == 1);
+
+            ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr(pu1_y_src,
+                                                                             pu1_uv_src,
+                                                                             pu4_rgb_dst_tmp,
+                                                                             ps_codec->i4_disp_wd,
+                                                                             num_rows,
+                                                                             ps_codec->i4_strd,
+                                                                             ps_codec->i4_strd,
+                                                                             ps_codec->i4_disp_strd,
+                                                                             is_u_first);
+
+        }
+
+
+
+    }
+    return (ret);
+}
+
diff --git a/decoder/ihevcd_fmt_conv.h b/decoder/ihevcd_fmt_conv.h
new file mode 100644
index 0000000..e099218
--- /dev/null
+++ b/decoder/ihevcd_fmt_conv.h
@@ -0,0 +1,118 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_fmt_conv.h
+ *
+ * @brief
+ *  Function declarations and typedefs used for format conversion in the decoder
+ *
+ * @author
+ *  Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVCD_FMT_CONV_H_
+#define _IHEVCD_FMT_CONV_H_
+
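+/* Q13 fixed-point coefficients for YUV to RGB conversion (approximately
+ * 1.596, -0.391, -0.813 and 2.018 scaled by 8192, presumably BT.601 based),
+ * used along the lines of R = clip(Y + ((COEFF1 * (V - 128)) >> 13)) */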
+#define COEFF1          13073
+#define COEFF2          -3207
+#define COEFF3          -6664
+#define COEFF4          16530
+
+typedef void ihevcd_fmt_conv_420sp_to_rgba8888_ft(UWORD8 *pu1_y_src,
+                                                  UWORD8 *pu1_uv_src,
+                                                  UWORD32 *pu4_rgba_dst,
+                                                  WORD32 wd,
+                                                  WORD32 ht,
+                                                  WORD32 src_y_strd,
+                                                  WORD32 src_uv_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 is_u_first);
+
+typedef void ihevcd_fmt_conv_420sp_to_rgb565_ft(UWORD8 *pu1_y_src,
+                                                UWORD8 *pu1_uv_src,
+                                                UWORD16 *pu2_rgb_dst,
+                                                WORD32 wd,
+                                                WORD32 ht,
+                                                WORD32 src_y_strd,
+                                                WORD32 src_uv_strd,
+                                                WORD32 dst_strd,
+                                                WORD32 is_u_first);
+
+
+typedef void ihevcd_fmt_conv_420sp_to_420sp_ft(UWORD8 *pu1_y_src,
+                                               UWORD8 *pu1_uv_src,
+                                               UWORD8 *pu1_y_dst,
+                                               UWORD8 *pu1_uv_dst,
+                                               WORD32 wd,
+                                               WORD32 ht,
+                                               WORD32 src_y_strd,
+                                               WORD32 src_uv_strd,
+                                               WORD32 dst_y_strd,
+                                               WORD32 dst_uv_strd);
+typedef void ihevcd_fmt_conv_420sp_to_420p_ft(UWORD8 *pu1_y_src,
+                                              UWORD8 *pu1_uv_src,
+                                              UWORD8 *pu1_y_dst,
+                                              UWORD8 *pu1_u_dst,
+                                              UWORD8 *pu1_v_dst,
+                                              WORD32 wd,
+                                              WORD32 ht,
+                                              WORD32 src_y_strd,
+                                              WORD32 src_uv_strd,
+                                              WORD32 dst_y_strd,
+                                              WORD32 dst_uv_strd,
+                                              WORD32 is_u_first,
+                                              WORD32 disable_luma_copy);
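+
+/* Usage sketch (illustrative; the decoder invokes these through the
+ * func_selector_t structure in ihevcd_function_selector.h). A pointer of one
+ * of the above types is resolved once and then called per set of rows, e.g.:
+ *
+ *     ihevcd_fmt_conv_420sp_to_420sp_ft *pf_fmt_conv =
+ *                     ihevcd_fmt_conv_420sp_to_420sp;
+ *     pf_fmt_conv(pu1_y_src, pu1_uv_src, pu1_y_dst, pu1_uv_dst,
+ *                 wd, ht, src_y_strd, src_uv_strd, dst_y_strd, dst_uv_strd);
+ */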
+
+/* C function declarations */
+ihevcd_fmt_conv_420sp_to_rgba8888_ft ihevcd_fmt_conv_420sp_to_rgba8888;
+ihevcd_fmt_conv_420sp_to_rgb565_ft ihevcd_fmt_conv_420sp_to_rgb565;
+ihevcd_fmt_conv_420sp_to_420sp_ft ihevcd_fmt_conv_420sp_to_420sp;
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p;
+
+/* A9Q function declarations */
+ihevcd_fmt_conv_420sp_to_rgba8888_ft ihevcd_fmt_conv_420sp_to_rgba8888_a9q;
+ihevcd_fmt_conv_420sp_to_420sp_ft ihevcd_fmt_conv_420sp_to_420sp_a9q;
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p_a9q;
+
+/* A9A function declarations */
+ihevcd_fmt_conv_420sp_to_rgba8888_ft ihevcd_fmt_conv_420sp_to_rgba8888_a9a;
+ihevcd_fmt_conv_420sp_to_420sp_ft ihevcd_fmt_conv_420sp_to_420sp_a9a;
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p_a9a;
+
+/* SSSE3 function declarations */
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p_ssse3;
+
+/* SSE4 function declarations */
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p_sse42;
+
+/* armv8 function declarations */
+ihevcd_fmt_conv_420sp_to_rgba8888_ft ihevcd_fmt_conv_420sp_to_rgba8888_av8;
+ihevcd_fmt_conv_420sp_to_420sp_ft ihevcd_fmt_conv_420sp_to_420sp_av8;
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p_av8;
+
+#endif /* _IHEVCD_FMT_CONV_H_ */
diff --git a/decoder/ihevcd_func_types.h b/decoder/ihevcd_func_types.h
new file mode 100644
index 0000000..232b979
--- /dev/null
+++ b/decoder/ihevcd_func_types.h
@@ -0,0 +1,69 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_func_types.h
+*
+* @brief
+*  Defines different types of function implementations, e.g. C, Cortex-A8
+* intrinsics, Neon assembly etc
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVCD_FUNC_TYPES_H_
+#define _IHEVCD_FUNC_TYPES_H_
+
+
+/* C Model : No platform specific intrinsics or inline assemblies */
+#define    C            0
+
+/* Cortex Ax intrinsics */
+#define    CXAINTR      10
+
+/* Neon intrinsics */
+#define    NEONINTR     11
+
+/* X86 intrinsics */
+#define    X86INTR      12
+
+/* X64 intrinsics */
+#define    X64INTR      13
+
+/* Atom intrinsics */
+#define    ATOMINTR     14
+
+/* Cortex Ax assembly */
+#define    CXAASM       20
+
+/* Neon assembly */
+#define    NEONASM      21
+
+/* X86 assembly */
+#define    X86ASM       22
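+
+/* Illustrative sketch (the constant names above are real; the selection macro
+ * below is a hypothetical example, not part of this codebase): a build may
+ * pick a default implementation with
+ *
+ *     #if (DEFAULT_FN_IMPL == NEONASM)
+ *         // use Neon assembly variants
+ *     #endif
+ */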
+
+
+#endif /* _IHEVCD_FUNC_TYPES_H_ */
diff --git a/decoder/ihevcd_function_selector.h b/decoder/ihevcd_function_selector.h
new file mode 100644
index 0000000..e7d7eee
--- /dev/null
+++ b/decoder/ihevcd_function_selector.h
@@ -0,0 +1,189 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_function_selector.h
+ *
+ * @brief
+ *  Declarations for architecture-specific function pointer selection and
+ *  the function selector structure used in the decoder
+ *
+ * @author
+ *  Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVCD_FUNCTION_SELECTOR_H_
+#define _IHEVCD_FUNCTION_SELECTOR_H_
+
+#include "ihevc_deblk.h"
+#include "ihevc_itrans.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_chroma_intra_pred.h"
+#include "ihevc_recon.h"
+#include "ihevc_chroma_recon.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_inter_pred.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_padding.h"
+#include "ihevc_weighted_pred.h"
+#include "ihevc_sao.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_itrans_recon_dc.h"
+
+#define D_ARCH_NA                   1
+#define D_ARCH_ARM_NONEON           2
+#define D_ARCH_ARM_A9Q              3
+#define D_ARCH_ARM_A9A              4
+#define D_ARCH_ARM_A9               5
+#define D_ARCH_ARM_A7               6
+#define D_ARCH_ARM_A5               7
+#define D_ARCH_ARM_A15              8
+#define D_ARCH_ARM_NEONINTR         9
+#define D_ARCH_ARMV8_GENERIC        10
+#define D_ARCH_X86_GENERIC          11
+#define D_ARCH_X86_SSSE3            12
+#define D_ARCH_X86_SSE42            13
+#define D_ARCH_X86_AVX2             14
+#define D_ARCH_MIPS_GENERIC         15
+#define D_ARCH_MIPS_32              16
+
+void ihevcd_init_arch(void *pv_codec);
+
+void ihevcd_init_function_ptr(void *pv_codec);
+
+void ihevcd_init_function_ptr_generic(void *pv_codec);
+void ihevcd_init_function_ptr_ssse3(void *pv_codec);
+void ihevcd_init_function_ptr_sse42(void *pv_codec);
+
+#ifndef DISABLE_AVX2
+void ihevcd_init_function_ptr_avx2(void *pv_codec);
+#endif
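+
+/* Illustrative sketch of how the D_ARCH_* values are meant to drive function
+ * pointer setup (the codec structure field name below is an assumption, not
+ * confirmed by this header):
+ *
+ *     void ihevcd_init_function_ptr(void *pv_codec)
+ *     {
+ *         codec_t *ps_codec = (codec_t *)pv_codec;
+ *         switch(ps_codec->e_processor_arch)
+ *         {
+ *             case D_ARCH_X86_SSE42:
+ *                 ihevcd_init_function_ptr_sse42(pv_codec);
+ *                 break;
+ *             default:
+ *                 ihevcd_init_function_ptr_generic(pv_codec);
+ *                 break;
+ *         }
+ *     }
+ */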
+
+typedef struct
+{
+    ihevc_deblk_chroma_horz_ft *ihevc_deblk_chroma_horz_fptr;
+    ihevc_deblk_chroma_vert_ft *ihevc_deblk_chroma_vert_fptr;
+    ihevc_deblk_luma_vert_ft *ihevc_deblk_luma_vert_fptr;
+    ihevc_deblk_luma_horz_ft *ihevc_deblk_luma_horz_fptr;
+
+    ihevc_inter_pred_ft *ihevc_inter_pred_chroma_copy_fptr;
+    ihevc_inter_pred_w16out_ft *ihevc_inter_pred_chroma_copy_w16out_fptr;
+    ihevc_inter_pred_ft *ihevc_inter_pred_chroma_horz_fptr;
+    ihevc_inter_pred_w16out_ft *ihevc_inter_pred_chroma_horz_w16out_fptr;
+    ihevc_inter_pred_ft *ihevc_inter_pred_chroma_vert_fptr;
+    ihevc_inter_pred_w16inp_ft *ihevc_inter_pred_chroma_vert_w16inp_fptr;
+    ihevc_inter_pred_w16inp_w16out_ft *ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr;
+    ihevc_inter_pred_w16out_ft *ihevc_inter_pred_chroma_vert_w16out_fptr;
+    ihevc_inter_pred_ft *ihevc_inter_pred_luma_horz_fptr;
+    ihevc_inter_pred_ft *ihevc_inter_pred_luma_vert_fptr;
+    ihevc_inter_pred_w16out_ft *ihevc_inter_pred_luma_vert_w16out_fptr;
+    ihevc_inter_pred_w16inp_ft *ihevc_inter_pred_luma_vert_w16inp_fptr;
+    ihevc_inter_pred_ft *ihevc_inter_pred_luma_copy_fptr;
+    ihevc_inter_pred_w16out_ft *ihevc_inter_pred_luma_copy_w16out_fptr;
+    ihevc_inter_pred_w16out_ft *ihevc_inter_pred_luma_horz_w16out_fptr;
+    ihevc_inter_pred_w16inp_w16out_ft *ihevc_inter_pred_luma_vert_w16inp_w16out_fptr;
+
+    ihevc_intra_pred_chroma_ref_substitution_ft *ihevc_intra_pred_chroma_ref_substitution_fptr;
+    ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
+    ihevc_intra_pred_luma_ref_subst_all_avlble_ft *ihevc_intra_pred_luma_ref_subst_all_avlble_fptr;
+    ihevc_intra_pred_ref_filtering_ft *ihevc_intra_pred_ref_filtering_fptr;
+    ihevc_intra_pred_chroma_dc_ft *ihevc_intra_pred_chroma_dc_fptr;
+    ihevc_intra_pred_chroma_horz_ft *ihevc_intra_pred_chroma_horz_fptr;
+    ihevc_intra_pred_chroma_mode2_ft *ihevc_intra_pred_chroma_mode2_fptr;
+    ihevc_intra_pred_chroma_mode_18_34_ft *ihevc_intra_pred_chroma_mode_18_34_fptr;
+    ihevc_intra_pred_chroma_mode_27_to_33_ft *ihevc_intra_pred_chroma_mode_27_to_33_fptr;
+    ihevc_intra_pred_chroma_mode_3_to_9_ft *ihevc_intra_pred_chroma_mode_3_to_9_fptr;
+    ihevc_intra_pred_chroma_planar_ft *ihevc_intra_pred_chroma_planar_fptr;
+    ihevc_intra_pred_chroma_ver_ft *ihevc_intra_pred_chroma_ver_fptr;
+    ihevc_intra_pred_chroma_mode_11_to_17_ft *ihevc_intra_pred_chroma_mode_11_to_17_fptr;
+    ihevc_intra_pred_chroma_mode_19_to_25_ft *ihevc_intra_pred_chroma_mode_19_to_25_fptr;
+    ihevc_intra_pred_luma_mode_11_to_17_ft *ihevc_intra_pred_luma_mode_11_to_17_fptr;
+    ihevc_intra_pred_luma_mode_19_to_25_ft *ihevc_intra_pred_luma_mode_19_to_25_fptr;
+    ihevc_intra_pred_luma_dc_ft *ihevc_intra_pred_luma_dc_fptr;
+    ihevc_intra_pred_luma_horz_ft *ihevc_intra_pred_luma_horz_fptr;
+    ihevc_intra_pred_luma_mode2_ft *ihevc_intra_pred_luma_mode2_fptr;
+    ihevc_intra_pred_luma_mode_18_34_ft *ihevc_intra_pred_luma_mode_18_34_fptr;
+    ihevc_intra_pred_luma_mode_27_to_33_ft *ihevc_intra_pred_luma_mode_27_to_33_fptr;
+    ihevc_intra_pred_luma_mode_3_to_9_ft *ihevc_intra_pred_luma_mode_3_to_9_fptr;
+    ihevc_intra_pred_luma_planar_ft *ihevc_intra_pred_luma_planar_fptr;
+    ihevc_intra_pred_luma_ver_ft *ihevc_intra_pred_luma_ver_fptr;
+    ihevc_itrans_4x4_ttype1_ft *ihevc_itrans_4x4_ttype1_fptr;
+    ihevc_itrans_4x4_ft *ihevc_itrans_4x4_fptr;
+    ihevc_itrans_8x8_ft *ihevc_itrans_8x8_fptr;
+    ihevc_itrans_16x16_ft *ihevc_itrans_16x16_fptr;
+    ihevc_itrans_32x32_ft *ihevc_itrans_32x32_fptr;
+    ihevc_itrans_recon_4x4_ttype1_ft *ihevc_itrans_recon_4x4_ttype1_fptr;
+    ihevc_itrans_recon_4x4_ft *ihevc_itrans_recon_4x4_fptr;
+    ihevc_itrans_recon_8x8_ft *ihevc_itrans_recon_8x8_fptr;
+    ihevc_itrans_recon_16x16_ft *ihevc_itrans_recon_16x16_fptr;
+    ihevc_itrans_recon_32x32_ft *ihevc_itrans_recon_32x32_fptr;
+    ihevc_chroma_itrans_recon_4x4_ft *ihevc_chroma_itrans_recon_4x4_fptr;
+    ihevc_chroma_itrans_recon_8x8_ft *ihevc_chroma_itrans_recon_8x8_fptr;
+    ihevc_chroma_itrans_recon_16x16_ft *ihevc_chroma_itrans_recon_16x16_fptr;
+    ihevc_recon_4x4_ttype1_ft *ihevc_recon_4x4_ttype1_fptr;
+    ihevc_recon_4x4_ft *ihevc_recon_4x4_fptr;
+    ihevc_recon_8x8_ft *ihevc_recon_8x8_fptr;
+    ihevc_recon_16x16_ft *ihevc_recon_16x16_fptr;
+    ihevc_recon_32x32_ft *ihevc_recon_32x32_fptr;
+    ihevc_chroma_recon_4x4_ft *ihevc_chroma_recon_4x4_fptr;
+    ihevc_chroma_recon_8x8_ft *ihevc_chroma_recon_8x8_fptr;
+    ihevc_chroma_recon_16x16_ft *ihevc_chroma_recon_16x16_fptr;
+    ihevc_memcpy_mul_8_ft *ihevc_memcpy_mul_8_fptr;
+    ihevc_memcpy_ft *ihevc_memcpy_fptr;
+    ihevc_memset_mul_8_ft *ihevc_memset_mul_8_fptr;
+    ihevc_memset_ft *ihevc_memset_fptr;
+    ihevc_memset_16bit_mul_8_ft *ihevc_memset_16bit_mul_8_fptr;
+    ihevc_memset_16bit_ft *ihevc_memset_16bit_fptr;
+    ihevc_pad_left_luma_ft *ihevc_pad_left_luma_fptr;
+    ihevc_pad_left_chroma_ft *ihevc_pad_left_chroma_fptr;
+    ihevc_pad_right_luma_ft *ihevc_pad_right_luma_fptr;
+    ihevc_pad_right_chroma_ft *ihevc_pad_right_chroma_fptr;
+    ihevc_weighted_pred_bi_ft *ihevc_weighted_pred_bi_fptr;
+    ihevc_weighted_pred_bi_default_ft *ihevc_weighted_pred_bi_default_fptr;
+    ihevc_weighted_pred_uni_ft *ihevc_weighted_pred_uni_fptr;
+    ihevc_weighted_pred_chroma_bi_ft *ihevc_weighted_pred_chroma_bi_fptr;
+    ihevc_weighted_pred_chroma_bi_default_ft *ihevc_weighted_pred_chroma_bi_default_fptr;
+    ihevc_weighted_pred_chroma_uni_ft *ihevc_weighted_pred_chroma_uni_fptr;
+    ihevc_sao_band_offset_luma_ft *ihevc_sao_band_offset_luma_fptr;
+    ihevc_sao_band_offset_chroma_ft *ihevc_sao_band_offset_chroma_fptr;
+    ihevc_sao_edge_offset_class0_ft *ihevc_sao_edge_offset_class0_fptr;
+    ihevc_sao_edge_offset_class0_chroma_ft *ihevc_sao_edge_offset_class0_chroma_fptr;
+    ihevc_sao_edge_offset_class1_ft *ihevc_sao_edge_offset_class1_fptr;
+    ihevc_sao_edge_offset_class1_chroma_ft *ihevc_sao_edge_offset_class1_chroma_fptr;
+    ihevc_sao_edge_offset_class2_ft *ihevc_sao_edge_offset_class2_fptr;
+    ihevc_sao_edge_offset_class2_chroma_ft *ihevc_sao_edge_offset_class2_chroma_fptr;
+    ihevc_sao_edge_offset_class3_ft *ihevc_sao_edge_offset_class3_fptr;
+    ihevc_sao_edge_offset_class3_chroma_ft *ihevc_sao_edge_offset_class3_chroma_fptr;
+    ihevcd_fmt_conv_420sp_to_rgba8888_ft *ihevcd_fmt_conv_420sp_to_rgba8888_fptr;
+    ihevcd_fmt_conv_420sp_to_rgb565_ft *ihevcd_fmt_conv_420sp_to_rgb565_fptr;
+    ihevcd_fmt_conv_420sp_to_420sp_ft *ihevcd_fmt_conv_420sp_to_420sp_fptr;
+    ihevcd_fmt_conv_420sp_to_420p_ft *ihevcd_fmt_conv_420sp_to_420p_fptr;
+    ihevcd_itrans_recon_dc_luma_ft *ihevcd_itrans_recon_dc_luma_fptr;
+    ihevcd_itrans_recon_dc_chroma_ft *ihevcd_itrans_recon_dc_chroma_fptr;
+}func_selector_t;
+
+#endif /* _IHEVCD_FUNCTION_SELECTOR_H_ */
diff --git a/decoder/ihevcd_get_mv.c b/decoder/ihevcd_get_mv.c
new file mode 100644
index 0000000..e0d89c7
--- /dev/null
+++ b/decoder/ihevcd_get_mv.c
@@ -0,0 +1,593 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_get_mv.c
+ *
+ * @brief
+ *  Contains functions to compute motion vectors
+ *
+ * @author
+ *  Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_get_mv_ctb()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_mv_merge.h"
+#include "ihevcd_mv_pred.h"
+#include "ihevcd_profile.h"
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function computes and stores the MVs of all the PUs in a CTB
+ *
+ * @par Description:
+ * The MVs of a PU are stored in the PU structure. MV computation can be through merge or MV prediction
+ *
+ * @param[in] ps_proc
+ * processor context
+ *
+ * @param[in] pi4_ctb_top_pu_idx
+ * Pointer to ctb top PU indices
+ *
+ * @param[in] pi4_ctb_left_pu_idx
+ * Pointer to ctb left PU indices
+ *
+ * @param[in] pi4_ctb_top_left_pu_idx
+ * Pointer to ctb top left PU indices
+ *
+ * @returns
+ * Number of PUs per CTB
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+WORD32 ihevcd_get_mv_ctb(mv_ctxt_t *ps_mv_ctxt,
+                         UWORD32 *pu4_ctb_top_pu_idx,
+                         UWORD32 *pu4_ctb_left_pu_idx,
+                         UWORD32 *pu4_ctb_top_left_pu_idx)
+{
+
+    WORD32 i;
+    sps_t *ps_sps;
+    pps_t *ps_pps;
+    pu_t *ps_pu;
+    tile_t *ps_tile;
+    UWORD8 *pu1_pic_pu_map_ctb;
+    WORD32 num_minpu_in_ctb;
+    WORD32 ctb_start_pu_idx;
+    UWORD32 *pu4_top_pu_idx, *pu4_left_pu_idx, *pu4_top_left_pu_idx;
+    WORD32 pu_x_in_4x4, pu_y_in_4x4;
+    WORD32 pu_x_in_4x4_single_mcl, pu_y_in_4x4_single_mcl;
+    pu_mv_t s_pred_mv;
+    WORD32 ctb_size, ctb_size_in_min_pu;
+    WORD32 num_pu_per_ctb, pu_wd, pu_ht, pu_cnt;
+    WORD32  pu_wd_single_mcl, pu_ht_single_mcl;
+    UWORD32 au4_nbr_avail[MAX_CTB_SIZE / MIN_PU_SIZE
+                    + 2 /* Top nbr + bot nbr */];
+    UWORD32 *pu4_nbr_pu_idx/* (Left + ctb_size + right ) * (top + ctb_size + bottom) */;
+    WORD32 top_avail_bits;
+    UWORD8 u1_lb_avail, u1_l_avail, u1_t_avail, u1_tr_avail, u1_tl_avail;
+    WORD32 nbr_pu_idx_strd;
+    WORD32 cb_size;
+    WORD32 single_mcl_flag;
+
+    PROFILE_DISABLE_MV_PREDICTION();
+    ps_sps = ps_mv_ctxt->ps_sps;
+    ps_pps = ps_mv_ctxt->ps_pps;
+    ps_pu = ps_mv_ctxt->ps_pu;
+    ps_tile = ps_mv_ctxt->ps_tile;
+
+    pu4_nbr_pu_idx = ps_mv_ctxt->pu4_pic_pu_idx_map;
+
+    ctb_size = (1 << ps_sps->i1_log2_ctb_size);
+
+    ctb_size_in_min_pu = (ctb_size / MIN_PU_SIZE);
+
+    num_minpu_in_ctb = ctb_size_in_min_pu * ctb_size_in_min_pu;
+    pu1_pic_pu_map_ctb = ps_mv_ctxt->pu1_pic_pu_map + (ps_mv_ctxt->i4_ctb_x + ps_mv_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb) * num_minpu_in_ctb;
+
+    num_pu_per_ctb = ps_mv_ctxt->i4_ctb_pu_cnt;
+    ctb_start_pu_idx = ps_mv_ctxt->i4_ctb_start_pu_idx;
+    nbr_pu_idx_strd = MAX_CTB_SIZE / MIN_PU_SIZE + 2;
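+
+    /* The neighbor PU index map spans (left + ctb_size + right) columns by
+     * (top + ctb_size + bottom) rows; with MAX_CTB_SIZE = 64 and
+     * MIN_PU_SIZE = 4 the stride works out to 18. Row 0 holds the top
+     * neighbors and column 0 the left neighbors. */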
+
+    {
+        /* Updating the initial availability map */
+        WORD32 i;
+        UWORD32 u4_left_ctb_avail, u4_top_lt_ctb_avail, u4_top_rt_ctb_avail,
+                        u4_top_ctb_avail;
+
+        u4_left_ctb_avail = ps_mv_ctxt->u1_left_ctb_avail;
+        u4_top_lt_ctb_avail = ps_mv_ctxt->u1_top_lt_ctb_avail;
+        u4_top_ctb_avail = ps_mv_ctxt->u1_top_ctb_avail;
+        u4_top_rt_ctb_avail = ps_mv_ctxt->u1_top_rt_ctb_avail;
+
+        /* Initializing the availability array */
+        memset(au4_nbr_avail, 0,
+               (MAX_CTB_SIZE / MIN_PU_SIZE + 2) * sizeof(UWORD32));
+        /* Initializing the availability array with CTB level availability flags */
+        {
+            WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples
+                            - (ps_mv_ctxt->i4_ctb_y << ps_sps->i1_log2_ctb_size);
+            WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
+            for(i = 0; i < ctb_size_left / MIN_PU_SIZE; i++)
+            {
+                au4_nbr_avail[i + 1] = (u4_left_ctb_avail << 31);
+            }
+        }
+        au4_nbr_avail[0] |= ((u4_top_rt_ctb_avail << 31)
+                        >> (1 + ctb_size_in_min_pu)); /* bit at position (1 + ctb_size/4) from the MSB */
+
+        au4_nbr_avail[0] |= (u4_top_lt_ctb_avail << 31);
+        {
+            WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples
+                            - (ps_mv_ctxt->i4_ctb_x << ps_sps->i1_log2_ctb_size);
+            WORD32 ctb_size_top = MIN(ctb_size, cols_remaining);
+            WORD32 shift = (31 - (ctb_size / MIN_TU_SIZE));
+
+            /* ctb_size_top gives number of valid pixels remaining in the current row */
+            /* Since we need pattern of 1's starting from the MSB, an additional shift */
+            /* is needed */
+            shift += ((ctb_size - ctb_size_top) / MIN_TU_SIZE);
+
+            top_avail_bits = ((1 << (ctb_size_top / MIN_PU_SIZE)) - 1) << shift;
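+
+            /* Worked example (illustrative): for a full 64x64 CTB,
+             * ctb_size_top = 64, shift = 31 - 16 = 15 and
+             * top_avail_bits = 0xFFFF << 15 = 0x7FFF8000, i.e. bits 2 to 17
+             * counting from the MSB are set */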
+        }
+
+        au4_nbr_avail[0] |= ((u4_top_ctb_avail == 1) ? top_avail_bits : 0x0);
+        /* From the 2nd MSB to bit (1 + ctb_size/4), set 1 if the top CTB is available, else 0 */
+
+    }
+
+    {
+        /* In case of a tile boundary, the left and top arrays must change */
+        /*Left*/
+        /* If start of tile row*/
+        if(((ps_tile->u1_pos_x) == (ps_mv_ctxt->i4_ctb_x)) && (ps_mv_ctxt->i4_ctb_x != 0))
+        {
+            WORD32 index_pic_map;
+            WORD32 ctb_pu_idx;
+            UWORD8 *pu1_pic_pu_map;
+
+            /* Goto the left ctb which belongs to another tile */
+            index_pic_map = ((ps_mv_ctxt->i4_ctb_x - 1) + ps_mv_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+            ctb_pu_idx = ps_mv_ctxt->pu4_pic_pu_idx[index_pic_map];
+            index_pic_map *= num_minpu_in_ctb;
+
+            /*Replicate the PUs of the last column of the left ctb*/
+            pu1_pic_pu_map = ps_mv_ctxt->pu1_pic_pu_map + index_pic_map + ctb_size_in_min_pu - 1;
+            for(i = 0; i < ctb_size_in_min_pu; i++)
+            {
+                /* Left neighbors change*/
+                pu4_ctb_left_pu_idx[i] = ctb_pu_idx + (WORD32)*pu1_pic_pu_map;
+                pu1_pic_pu_map = pu1_pic_pu_map + ctb_size_in_min_pu;
+            }
+
+
+            index_pic_map = ((ps_mv_ctxt->i4_ctb_x - 1) + (ps_mv_ctxt->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb);
+            ctb_pu_idx = ps_mv_ctxt->pu4_pic_pu_idx[index_pic_map];
+            index_pic_map *= num_minpu_in_ctb;
+            index_pic_map += (num_minpu_in_ctb - 1);
+            pu4_ctb_top_left_pu_idx[0] = ctb_pu_idx + pu1_pic_pu_map[index_pic_map];
+        }
+        /*Top*/
+        /* If start of tile column*/
+        if(((ps_tile->u1_pos_y) == (ps_mv_ctxt->i4_ctb_y)) && (ps_mv_ctxt->i4_ctb_y != 0))
+        {
+            WORD32 index_pic_map;
+            WORD32 ctb_pu_idx;
+            UWORD8 *pu1_pic_pu_map;
+
+            /* Goto the top ctb which belongs to another tile */
+            index_pic_map =  (ps_mv_ctxt->i4_ctb_x) + ((ps_mv_ctxt->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb);
+            ctb_pu_idx = ps_mv_ctxt->pu4_pic_pu_idx[index_pic_map];
+            index_pic_map *= num_minpu_in_ctb;
+
+            /*Replicate the PUs of the last row of the top ctb*/
+            pu1_pic_pu_map = ps_mv_ctxt->pu1_pic_pu_map + index_pic_map + (ctb_size_in_min_pu * (ctb_size_in_min_pu - 1));
+            for(i = 0; i < ctb_size_in_min_pu; i++)
+            {
+                /* Top neighbors change*/
+                pu4_ctb_top_pu_idx[i] = ctb_pu_idx + (WORD32)*pu1_pic_pu_map;
+                pu1_pic_pu_map++;
+            }
+        }
+
+        /* Updating the initial neighbor pu idx map */
+        /* Initializing the availability array with CTB level availability flags */
+        /* 16x16 array for holding pu info of the ctb, wrt the frame pu count*/
+        for(i = 0; i < ctb_size_in_min_pu; i++)
+        {
+            /* Left */
+            pu4_nbr_pu_idx[(i + 1) * nbr_pu_idx_strd] = pu4_ctb_left_pu_idx[i];
+            /* Top */
+            pu4_nbr_pu_idx[i + 1] = pu4_ctb_top_pu_idx[i];
+        }
+        /* Top right */
+        pu4_nbr_pu_idx[1 + ctb_size_in_min_pu] = pu4_ctb_top_pu_idx[ctb_size_in_min_pu];
+
+        /* Top left */
+        pu4_nbr_pu_idx[0] = pu4_ctb_top_left_pu_idx[0];
+
+    }
+
+    /* CTB level MV pred */
+    for(pu_cnt = 0; pu_cnt < num_pu_per_ctb; pu_cnt++, ps_pu++)
+    {
+        pu_ht = (ps_pu->b4_ht + 1) << 2;
+        pu_wd = (ps_pu->b4_wd + 1) << 2;
+
+        pu_ht_single_mcl = pu_ht;
+        pu_wd_single_mcl = pu_wd;
+
+        pu_x_in_4x4 = ps_pu->b4_pos_x;
+        pu_y_in_4x4 = ps_pu->b4_pos_y;
+
+        pu_x_in_4x4_single_mcl = pu_x_in_4x4;
+        pu_y_in_4x4_single_mcl = pu_y_in_4x4;
+
+        /*******************************************/
+        /* Neighbor location: Graphical indication */
+        /*                                         */
+        /*          B2 _____________B1 B0          */
+        /*            |               |            */
+        /*            |               |            */
+        /*            |               |            */
+        /*            |      PU     ht|            */
+        /*            |               |            */
+        /*            |               |            */
+        /*          A1|______wd_______|            */
+        /*          A0                             */
+        /*                                         */
+        /*******************************************/
+        /* Below code is for merge mode, where if single_mcl_flag == 1,
+         * all the prediction units of the current coding unit share a
+         * single merge candidate list, which is identical to the
+         * merge candidate list of the 2Nx2N prediction unit.
+         */
+        single_mcl_flag = 0;
+        if(1 == ps_pu->b1_merge_flag)
+        {
+            cb_size = MAX(pu_wd_single_mcl, pu_ht_single_mcl);
+            cb_size = MAX(cb_size,
+                          (1 << ps_sps->i1_log2_min_coding_block_size));
+            if((ps_pps->i1_log2_parallel_merge_level > 2) && cb_size == 8 && (pu_wd_single_mcl != pu_ht_single_mcl))
+            {
+                single_mcl_flag = 1;
+                if((PART_Nx2N == ps_pu->b3_part_mode) && (1 == ps_pu->b2_part_idx))
+                {
+                    pu_x_in_4x4_single_mcl = pu_x_in_4x4_single_mcl - 1;
+                }
+                else if((PART_2NxN == ps_pu->b3_part_mode) && (1 == ps_pu->b2_part_idx))
+                {
+                    pu_y_in_4x4_single_mcl = pu_y_in_4x4_single_mcl - 1;
+                }
+                pu_ht_single_mcl = 8;
+                pu_wd_single_mcl = 8;
+            }
+        }
+        pu4_top_pu_idx = &pu4_nbr_pu_idx[(1 + pu_x_in_4x4_single_mcl)
+                        + (1 + pu_y_in_4x4_single_mcl - 1) * nbr_pu_idx_strd];
+        pu4_top_left_pu_idx = pu4_top_pu_idx - 1;
+        pu4_left_pu_idx = pu4_top_pu_idx - 1 + nbr_pu_idx_strd;
+
+        /* Get neighbor availability */
+        {
+            u1_lb_avail = (au4_nbr_avail[1 + pu_y_in_4x4_single_mcl + pu_ht_single_mcl / MIN_PU_SIZE]
+                            >> (31 - (1 + pu_x_in_4x4_single_mcl - 1))) & 1;
+            u1_l_avail = (au4_nbr_avail[1 + pu_y_in_4x4_single_mcl]
+                            >> (31 - (1 + pu_x_in_4x4_single_mcl - 1))) & 1;
+            u1_t_avail = (au4_nbr_avail[1 + pu_y_in_4x4_single_mcl - 1]
+                            >> (31 - (1 + pu_x_in_4x4_single_mcl))) & 1;
+            u1_tr_avail = (au4_nbr_avail[1 + pu_y_in_4x4_single_mcl - 1]
+                            >> (31 - (1 + pu_x_in_4x4_single_mcl + pu_wd_single_mcl / MIN_PU_SIZE)))
+                            & 1;
+            u1_tl_avail = (au4_nbr_avail[1 + pu_y_in_4x4_single_mcl - 1]
+                            >> (31 - (1 + pu_x_in_4x4_single_mcl - 1))) & 1;
+        }
+        if(ps_pu->b1_intra_flag == 0)
+        {
+            if(ps_pu->b1_merge_flag == 0)
+            {
+                WORD32 pred_flag_l0, pred_flag_l1;
+                WORD32 tmp_x, tmp_y, mvd_x, mvd_y, mvp_x, mvp_y;
+                WORD32 two_pow_16, two_pow_15;
+
+                ihevcd_mv_pred(ps_mv_ctxt, pu4_top_pu_idx, pu4_left_pu_idx,
+                               pu4_top_left_pu_idx, nbr_pu_idx_strd,
+                               ps_pu, u1_lb_avail, u1_l_avail,
+                               u1_tr_avail, u1_t_avail, u1_tl_avail,
+                               &s_pred_mv);
+
+                pred_flag_l0 = (ps_pu->b2_pred_mode != PRED_L1);
+                pred_flag_l1 = (ps_pu->b2_pred_mode != PRED_L0);
+
+                two_pow_16 = (1 << 16);
+                two_pow_15 = (1 << 15);
+
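+                /* The additions below implement 16-bit signed wrap-around,
+                 * e.g. mvp_x = 32760, mvd_x = 16 yields tmp_x = -32760 */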
+                /* L0 MV */
+                if(pred_flag_l0)
+                {
+                    mvp_x = s_pred_mv.s_l0_mv.i2_mvx;
+                    mvp_y = s_pred_mv.s_l0_mv.i2_mvy;
+                    mvd_x = ps_pu->mv.s_l0_mv.i2_mvx;
+                    mvd_y = ps_pu->mv.s_l0_mv.i2_mvy;
+
+                    tmp_x = (mvp_x + mvd_x + two_pow_16) & (two_pow_16 - 1);
+                    tmp_x = tmp_x >= two_pow_15 ?
+                                    (tmp_x - two_pow_16) : tmp_x;
+                    ps_pu->mv.s_l0_mv.i2_mvx = tmp_x;
+                    tmp_y = (mvp_y + mvd_y + two_pow_16) & (two_pow_16 - 1);
+                    tmp_y = tmp_y >= two_pow_15 ?
+                                    (tmp_y - two_pow_16) : tmp_y;
+                    ps_pu->mv.s_l0_mv.i2_mvy = tmp_y;
+                }
+                /* L1 MV */
+                if(pred_flag_l1)
+                {
+                    mvp_x = s_pred_mv.s_l1_mv.i2_mvx;
+                    mvp_y = s_pred_mv.s_l1_mv.i2_mvy;
+                    mvd_x = ps_pu->mv.s_l1_mv.i2_mvx;
+                    mvd_y = ps_pu->mv.s_l1_mv.i2_mvy;
+
+                    tmp_x = (mvp_x + mvd_x + two_pow_16) & (two_pow_16 - 1);
+                    tmp_x = tmp_x >= two_pow_15 ?
+                                    (tmp_x - two_pow_16) : tmp_x;
+                    ps_pu->mv.s_l1_mv.i2_mvx = tmp_x;
+                    tmp_y = (mvp_y + mvd_y + two_pow_16) & (two_pow_16 - 1);
+                    tmp_y = tmp_y >= two_pow_15 ?
+                                    (tmp_y - two_pow_16) : tmp_y;
+                    ps_pu->mv.s_l1_mv.i2_mvy = tmp_y;
+                }
+            }
+            else
+            {
+                WORD32 part_mode;
+                WORD32 part_idx;
+                /* part_mode could also be derived from pu_wd/pu_ht and the
+                 * coding block size, but it is read directly from the PU here */
+                part_mode = ps_pu->b3_part_mode;
+                //TODO: Get part_idx
+                part_idx = ps_pu->b2_part_idx;
+
+                ihevcd_mv_merge(ps_mv_ctxt, pu4_top_pu_idx, pu4_left_pu_idx,
+                                nbr_pu_idx_strd, ps_pu, part_mode,
+                                part_idx, pu_wd_single_mcl, pu_ht_single_mcl,
+                                pu_x_in_4x4_single_mcl << 2, pu_y_in_4x4_single_mcl << 2,
+                                single_mcl_flag, u1_lb_avail, u1_l_avail, u1_tr_avail,
+                                u1_t_avail, u1_tl_avail);
+
+                if(PRED_BI == ps_pu->b2_pred_mode)
+                {
+                    if(((ps_pu->b3_part_mode == PART_2NxN) && (pu_wd == 8))
+                                    || ((ps_pu->b3_part_mode == PART_Nx2N)
+                                                    && (pu_ht == 8)))
+                    {
+                        ps_pu->b2_pred_mode = PRED_L0;
+                    }
+                }
+            }
+#if DEBUG_PRINT_MV
+            printf("\n-----------------------");
+            printf("\n CTB X = %d, Y = %d",
+                   ps_mv_ctxt->i4_ctb_x, ps_mv_ctxt->i4_ctb_y);
+            printf("\n pu_x = %d, pu_y = %d",
+                   (pu_x_in_4x4 * 4), (pu_y_in_4x4 * 4));
+            printf("\n pu_wd = %d, pu_ht = %d", pu_wd, pu_ht);
+            if(ps_pu->b2_pred_mode == PRED_L0)
+                printf("\n Pred = 0,Ref_idx = %d, MV l0 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.s_l0_mv.i2_mvx,
+                       ps_pu->mv.s_l0_mv.i2_mvy);
+            else if(ps_pu->b2_pred_mode == PRED_L1)
+                printf("\n Pred = 1,Ref_idx = %d,  MV l1 = %4d %4d", ps_pu->mv.i1_l1_ref_idx, ps_pu->mv.s_l1_mv.i2_mvx,
+                       ps_pu->mv.s_l1_mv.i2_mvy);
+            else
+                printf("\n Pred = 2,Ref_idx = %d,Ref_idx = %d, MV l0 = %4d %4d, MV l1 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.i1_l1_ref_idx,
+                       ps_pu->mv.s_l0_mv.i2_mvx, ps_pu->mv.s_l0_mv.i2_mvy,
+                       ps_pu->mv.s_l1_mv.i2_mvx, ps_pu->mv.s_l1_mv.i2_mvy);
+
+#endif
+        }
+
+        {
+            slice_header_t *ps_slice_hdr;
+            pic_buf_t *ps_pic_buf_l0, *ps_pic_buf_l1;
+            ps_slice_hdr = ps_mv_ctxt->ps_slice_hdr;
+            ps_pic_buf_l0 = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list0[ps_pu->mv.i1_l0_ref_idx].pv_pic_buf));
+            ps_pic_buf_l1 = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list1[ps_pu->mv.i1_l1_ref_idx].pv_pic_buf));
+            ps_pu->mv.i1_l0_ref_pic_buf_id = ps_pic_buf_l0->u1_buf_id;
+            if(BSLICE == ps_slice_hdr->i1_slice_type)
+            {
+                ps_pu->mv.i1_l1_ref_pic_buf_id = ps_pic_buf_l1->u1_buf_id;
+            }
+        }
+
+        /* Neighbor availability inside CTB */
+        /* 1 bit per 4x4. Indicates whether that 4x4 block has been reconstructed (available) */
+        /* Used for neighbor availability in intra pred */
+        {
+            WORD32 trans_in_min_tu;
+            UWORD32 cur_tu_in_bits;
+            UWORD32 cur_tu_avail_flag;
+
+            trans_in_min_tu = pu_wd / MIN_PU_SIZE;
+            cur_tu_in_bits = (1 << trans_in_min_tu) - 1;
+            cur_tu_in_bits = cur_tu_in_bits << (32 - trans_in_min_tu);
+
+            cur_tu_avail_flag = cur_tu_in_bits >> (pu_x_in_4x4 + 1);
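+            /* Worked example (illustrative): for a 16-wide PU at pu_x_in_4x4 = 0,
+             * trans_in_min_tu = 4, cur_tu_in_bits = 0xF0000000 and
+             * cur_tu_avail_flag = 0x78000000; the one-bit offset accounts for
+             * the left neighbor column of the availability map */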
+
+            for(i = 0; i < pu_ht / MIN_PU_SIZE; i++)
+                au4_nbr_avail[1 + pu_y_in_4x4 + i] |= cur_tu_avail_flag;
+        }
+
+        /* Neighbor PU idx update inside CTB */
+        /* 1byte per 4x4. Indicates the PU idx that 4x4 block belongs to */
+
+        {
+            WORD32 row, col;
+            UWORD32 cur_pu_idx;
+            WORD32 offset;
+            cur_pu_idx = ctb_start_pu_idx + pu_cnt;
+
+            offset = (1 + pu_x_in_4x4 + 0) + (1 + pu_y_in_4x4 + 0) * nbr_pu_idx_strd;
+
+            for(row = 0; row < pu_ht / MIN_PU_SIZE; row++)
+            {
+                for(col = 0; col < pu_wd / MIN_PU_SIZE; col++)
+                {
+                    pu4_nbr_pu_idx[offset + col] = cur_pu_idx;
+                }
+                offset += nbr_pu_idx_strd;
+            }
+        }
+
+    }
+
+    /* Updating Top and Left pointers */
+    {
+        WORD32 offset_top, offset_left;
+
+        offset_left = ctb_size_in_min_pu + (0 + 1) * nbr_pu_idx_strd;
+        offset_top = ctb_size_in_min_pu * nbr_pu_idx_strd + 0 + 1;
+
+        /* Top Left */
+        /* saving top left before updating top ptr, as updating top ptr will overwrite the top left for the next ctb */
+        pu4_ctb_top_left_pu_idx[0] = pu4_ctb_top_pu_idx[ctb_size_in_min_pu - 1];
+
+        for(i = 0; i < ctb_size_in_min_pu; i++)
+        {
+            /* Left */
+            /* Last column of au4_nbr_pu_idx */
+            pu4_ctb_left_pu_idx[i] = pu4_nbr_pu_idx[offset_left];
+            /* Top */
+            /* Last row of au4_nbr_pu_idx */
+            pu4_ctb_top_pu_idx[i] = pu4_nbr_pu_idx[offset_top];
+
+            offset_left += nbr_pu_idx_strd;
+            offset_top += 1;
+        }
+    }
+
+    /* Updating the CTB level PU idx (Used for collocated MV pred)*/
+    {
+        WORD32 ctb_row, ctb_col, index_pic_map, index_nbr_map;
+        WORD32 first_pu_of_ctb;
+        first_pu_of_ctb = pu4_nbr_pu_idx[1 + nbr_pu_idx_strd];
+
+        index_pic_map = 0 * ctb_size_in_min_pu + 0;
+        index_nbr_map = (0 + 1) * nbr_pu_idx_strd + (0 + 1);
+
+        for(ctb_row = 0; ctb_row < ctb_size_in_min_pu; ctb_row++)
+        {
+            for(ctb_col = 0; ctb_col < ctb_size_in_min_pu; ctb_col++)
+            {
+                pu1_pic_pu_map_ctb[index_pic_map + ctb_col] = pu4_nbr_pu_idx[index_nbr_map + ctb_col]
+                                - first_pu_of_ctb;
+            }
+            index_pic_map += ctb_size_in_min_pu;
+            index_nbr_map += nbr_pu_idx_strd;
+        }
+    }
+    return num_pu_per_ctb;
+}
diff --git a/decoder/ihevcd_get_mv.h b/decoder/ihevcd_get_mv.h
new file mode 100644
index 0000000..fd5e86b
--- /dev/null
+++ b/decoder/ihevcd_get_mv.h
@@ -0,0 +1,46 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_get_mv.h
+*
+* @brief
+*  Declarations for computing motion vectors of the PUs in a CTB
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVCD_GET_MV_H_
+#define IHEVCD_GET_MV_H_
+
+WORD32 ihevcd_get_mv_ctb(mv_ctxt_t *ps_mv_ctxt,
+                         UWORD32 *pu4_ctb_top_pu_idx,
+                         UWORD32 *pu4_ctb_left_pu_idx,
+                         UWORD32 *pu4_ctb_top_left_pu_idx);
+
+
+#endif /* IHEVCD_GET_MV_H_ */
diff --git a/decoder/ihevcd_ilf_padding.c b/decoder/ihevcd_ilf_padding.c
new file mode 100644
index 0000000..9db82e5
--- /dev/null
+++ b/decoder/ihevcd_ilf_padding.c
@@ -0,0 +1,214 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_ilf_padding.c
+*
+* @brief
+*  Does frame level loop filtering (deblocking and SAO) and padding
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*   - ihevcd_ilf_pad_frame()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+
+#include "ihevc_deblk.h"
+#include "ihevc_deblk_tables.h"
+#include "ihevcd_profile.h"
+#include "ihevcd_deblk.h"
+#include "ihevcd_sao.h"
+#include "ihevc_padding.h"
+
+void ihevcd_ilf_pad_frame(deblk_ctxt_t *ps_deblk_ctxt, sao_ctxt_t *ps_sao_ctxt)
+{
+    sps_t *ps_sps;
+    slice_header_t *ps_slice_hdr;
+    codec_t *ps_codec;
+    WORD32 i4_ctb_x, i4_ctb_y;
+    WORD32 ctb_size;
+
+    ps_sps = ps_deblk_ctxt->ps_sps;
+    ps_slice_hdr = ps_deblk_ctxt->ps_slice_hdr;
+    ps_codec = ps_deblk_ctxt->ps_codec;
+    ctb_size = (1 << ps_sps->i1_log2_ctb_size);
+
+    for(i4_ctb_y = 0; i4_ctb_y < ps_sps->i2_pic_ht_in_ctb; i4_ctb_y++)
+    {
+        for(i4_ctb_x = 0; i4_ctb_x < ps_sps->i2_pic_wd_in_ctb; i4_ctb_x++)
+        {
+            WORD32 i4_is_last_ctb_x = 0;
+            WORD32 i4_is_last_ctb_y = 0;
+
+            /* TODO:
+             * Slice header also has to be updated
+             */
+            ps_deblk_ctxt->i4_ctb_x = i4_ctb_x;
+            ps_deblk_ctxt->i4_ctb_y = i4_ctb_y;
+
+            ps_sao_ctxt->i4_ctb_x = i4_ctb_x;
+            ps_sao_ctxt->i4_ctb_y = i4_ctb_y;
+
+            if((0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag) &&
+               (0 == ps_codec->i4_disable_deblk_pic))
+            {
+                ihevcd_deblk_ctb(ps_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+
+                /* If the last CTB in the row was a complete CTB, then deblocking has to be called for the remaining pixels,
+                 * since deblocking is applied on a shifted CTB structure
+                 */
+                if(i4_ctb_x == ps_sps->i2_pic_wd_in_ctb - 1)
+                {
+                    WORD32 last_x_pos;
+                    i4_is_last_ctb_x = 1;
+                    i4_is_last_ctb_y = 0;
+
+
+                    last_x_pos = (ps_sps->i2_pic_wd_in_ctb << ps_sps->i1_log2_ctb_size);
+                    if(last_x_pos  ==  ps_sps->i2_pic_width_in_luma_samples)
+                    {
+                        ihevcd_deblk_ctb(ps_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+                    }
+                }
+
+
+                /* If the last CTB in the column was a complete CTB, then deblocking has to be called for the remaining pixels,
+                 * since deblocking is applied on a shifted CTB structure
+                 */
+                if(i4_ctb_y == ps_sps->i2_pic_ht_in_ctb - 1)
+                {
+                    WORD32 last_y_pos;
+                    i4_is_last_ctb_y = 1;
+                    i4_is_last_ctb_x = 0;
+
+                    last_y_pos = (ps_sps->i2_pic_ht_in_ctb << ps_sps->i1_log2_ctb_size);
+                    if(last_y_pos == ps_sps->i2_pic_height_in_luma_samples)
+                    {
+                        ihevcd_deblk_ctb(ps_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+                    }
+                }
+            }
+
+            if(ps_slice_hdr->i1_slice_sao_luma_flag || ps_slice_hdr->i1_slice_sao_chroma_flag)
+            {
+                ihevcd_sao_ctb(ps_sao_ctxt);
+            }
+
+            /* Call padding if required */
+            {
+                UWORD8 *pu1_cur_ctb_luma = ps_deblk_ctxt->pu1_cur_pic_luma
+                                + (i4_ctb_x * ctb_size
+                                                + i4_ctb_y * ctb_size
+                                                                * ps_codec->i4_strd);
+                UWORD8 *pu1_cur_ctb_chroma = ps_deblk_ctxt->pu1_cur_pic_chroma
+                                + i4_ctb_x * ctb_size
+                                + (i4_ctb_y * ctb_size * ps_codec->i4_strd / 2);
+
+                if(0 == i4_ctb_x)
+                {
+                    WORD32 pad_ht_luma;
+                    WORD32 pad_ht_chroma;
+
+                    pad_ht_luma = ctb_size;
+                    pad_ht_luma += (ps_sps->i2_pic_ht_in_ctb - 1) == i4_ctb_y ? 8 : 0;
+                    pad_ht_chroma = ctb_size / 2;
+                    pad_ht_chroma += (ps_sps->i2_pic_ht_in_ctb - 1) == i4_ctb_y ? 8 : 0;
+                    /* Pad left after 1st CTB is processed */
+                    ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(pu1_cur_ctb_luma - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_LEFT);
+                    ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(pu1_cur_ctb_chroma - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_LEFT);
+                }
+                else if((ps_sps->i2_pic_wd_in_ctb - 1) == i4_ctb_x)
+                {
+                    WORD32 pad_ht_luma;
+                    WORD32 pad_ht_chroma;
+                    WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (i4_ctb_x << ps_sps->i1_log2_ctb_size);
+
+                    pad_ht_luma = ctb_size;
+                    pad_ht_luma += (ps_sps->i2_pic_ht_in_ctb - 1) == i4_ctb_y ? 8 : 0;
+                    pad_ht_chroma = ctb_size / 2;
+                    pad_ht_chroma += (ps_sps->i2_pic_ht_in_ctb - 1) == i4_ctb_y ? 8 : 0;
+                    /* Pad right after last CTB in the current row is processed */
+                    ps_codec->s_func_selector.ihevc_pad_right_luma_fptr(pu1_cur_ctb_luma + cols_remaining - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_RIGHT);
+                    ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr(pu1_cur_ctb_chroma + cols_remaining - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_RIGHT);
+
+
+                    if((ps_sps->i2_pic_ht_in_ctb - 1) == i4_ctb_y)
+                    {
+                        UWORD8 *pu1_buf;
+                        /* Since SAO is shifted by 8x8, chroma padding cannot be done till the second row is processed */
+                        /* Hence top padding is moved to the end of the frame; moving it to the second row also causes problems when there is only one row */
+                        /* Pad top after padding left and right for the current rows, after processing the 1st CTB row */
+                        ihevc_pad_top(ps_deblk_ctxt->pu1_cur_pic_luma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP);
+                        ihevc_pad_top(ps_deblk_ctxt->pu1_cur_pic_chroma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP / 2);
+
+                        pu1_buf = ps_deblk_ctxt->pu1_cur_pic_luma + ps_codec->i4_strd * ps_sps->i2_pic_height_in_luma_samples - PAD_LEFT;
+                        /* Pad bottom after padding left and right for the current rows */
+                        ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT);
+
+                        pu1_buf = ps_deblk_ctxt->pu1_cur_pic_chroma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples / 2) - PAD_LEFT;
+                        ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT / 2);
+                    }
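+                    /* Net effect (illustrative): once the bottom-right CTB is
+                     * reached, the frame is padded on all four sides: left and
+                     * right per CTB row as above, then top and bottom over the
+                     * full padded width (pic width + PAD_WD), with the vertical
+                     * extents halved for the interleaved 4:2:0 chroma plane. */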
+                }
+            }
+
+
+        }
+    }
+
+}
diff --git a/decoder/ihevcd_ilf_padding.h b/decoder/ihevcd_ilf_padding.h
new file mode 100644
index 0000000..88c9732
--- /dev/null
+++ b/decoder/ihevcd_ilf_padding.h
@@ -0,0 +1,45 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_ilf_padding.h
+*
+* @brief
+*  Does frame level loop filtering (deblocking and SAO) and padding
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+*   - ihevcd_ilf_pad_frame()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVCD_ILF_PADDING_H_
+#define IHEVCD_ILF_PADDING_H_
+
+void ihevcd_ilf_pad_frame(deblk_ctxt_t *ps_deblk_ctxt, sao_ctxt_t *ps_sao_ctxt);
+
+
+#endif /* IHEVCD_ILF_PADDING_H_ */
+
diff --git a/decoder/ihevcd_inter_pred.c b/decoder/ihevcd_inter_pred.c
new file mode 100644
index 0000000..cef3bee
--- /dev/null
+++ b/decoder/ihevcd_inter_pred.c
@@ -0,0 +1,676 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_inter_pred.c
+ *
+ * @brief
+ *  Calculates the prediction samples for a given CTB
+ *
+ * @author
+ *  Srinivas T
+ *
+ * @par List of Functions:
+ *   - ihevcd_inter_pred_ctb()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_weighted_pred.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+
+#include "ihevc_inter_pred.h"
+#include "ihevcd_profile.h"
+
+WORD8 luma_filter[4][NTAPS_LUMA] =
+{
+    { 0, 0, 0, 64, 0, 0, 0, 0 },
+    { -1, 4, -10, 58, 17, -5, 1, 0 },
+    { -1, 4, -11, 40, 40, -11, 4, -1 },
+    { 0, 1, -5, 17, 58, -10, 4, -1 } };
+
+/* The filter uses only the first four elements in each array */
+WORD8 chroma_filter[8][NTAPS_LUMA] =
+{
+    { 0, 64, 0, 0, 0, 0, 0, 0 },
+    { -2, 58, 10, -2, 0, 0, 0, 0 },
+    { -4, 54, 16, -2, 0, 0, 0, 0 },
+    { -6, 46, 28, -4, 0, 0, 0, 0 },
+    { -4, 36, 36, -4, 0, 0, 0, 0 },
+    { -4, 28, 46, -6, 0, 0, 0, 0 },
+    { -2, 16, 54, -4, 0, 0, 0, 0 },
+    { -2, 10, 58, -2, 0, 0, 0, 0 } };
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Inter prediction CTB level function
+*
+* @par Description:
+*  For a given CTB, inter prediction followed by weighted prediction is
+*  done for all the PUs present in the CTB
+*
+* @param[in] ps_ctb
+*  Pointer to the CTB context
+*
+* @returns
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+void ihevcd_inter_pred_ctb(process_ctxt_t *ps_proc)
+{
+    UWORD8 *ref_pic_luma_l0, *ref_pic_chroma_l0;
+    UWORD8 *ref_pic_luma_l1, *ref_pic_chroma_l1;
+
+    UWORD8 *ref_pic_l0 = NULL, *ref_pic_l1 = NULL;
+
+    slice_header_t *ps_slice_hdr;
+    sps_t *ps_sps;
+    pps_t *ps_pps;
+    pu_t *ps_pu;
+    codec_t *ps_codec;
+    WORD32 pu_indx;
+    WORD32 pu_x, pu_y;
+    WORD32 pu_wd, pu_ht;
+    WORD32 i4_pu_cnt;
+    WORD32 cur_ctb_idx;
+
+    WORD32 clr_indx;
+    WORD32 ntaps;
+
+
+
+    WORD32 ai2_xint[2] = { 0, 0 }, ai2_yint[2] = { 0, 0 };
+    WORD32 ai2_xfrac[2] = { 0, 0 }, ai2_yfrac[2] = { 0, 0 };
+
+    WORD32 weighted_pred, bi_pred;
+
+    WORD32 ref_strd;
+    UWORD8 *pu1_dst_luma, *pu1_dst_chroma;
+
+    UWORD8 *pu1_dst;
+
+    WORD16 *pi2_tmp1, *pi2_tmp2;
+
+    WORD32 luma_weight_l0, luma_weight_l1;
+    WORD32 chroma_weight_l0_cb, chroma_weight_l1_cb, chroma_weight_l0_cr, chroma_weight_l1_cr;
+    WORD32 luma_offset_l0, luma_offset_l1;
+    WORD32 chroma_offset_l0_cb, chroma_offset_l1_cb, chroma_offset_l0_cr, chroma_offset_l1_cr;
+    WORD32 shift, lvl_shift1, lvl_shift2;
+
+    pf_inter_pred func_ptr1, func_ptr2, func_ptr3, func_ptr4;
+    WORD32 func_indx1, func_indx2, func_indx3, func_indx4;
+    void *func_src;
+    void *func_dst;
+    WORD32 func_src_strd;
+    WORD32 func_dst_strd;
+    WORD8 *func_coeff;
+    WORD32 func_wd;
+    WORD32 func_ht;
+    WORD32 next_ctb_idx;
+    WORD8(*coeff)[8];
+    WORD32  chroma_yuv420sp_vu;
+
+    PROFILE_DISABLE_INTER_PRED();
+    ps_codec = ps_proc->ps_codec;
+    ps_slice_hdr = ps_proc->ps_slice_hdr;
+    ps_pps = ps_proc->ps_pps;
+    ps_sps = ps_proc->ps_sps;
+    cur_ctb_idx = ps_proc->i4_ctb_x
+                    + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+    /*
+     * In case of tiles, the next ctb belonging to the same tile must be used to get the PU index
+     */
+
+    next_ctb_idx = ps_proc->i4_next_pu_ctb_cnt;
+    i4_pu_cnt = ps_proc->pu4_pic_pu_idx[next_ctb_idx] - ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+
+    ps_pu = ps_proc->ps_pu;
+    ref_strd = ps_codec->i4_strd;
+    pi2_tmp1 = ps_proc->pi2_inter_pred_tmp_buf1;
+    pi2_tmp2 = ps_proc->pi2_inter_pred_tmp_buf2;
+    pu1_dst_luma = ps_proc->pu1_cur_pic_luma;
+    pu1_dst_chroma = ps_proc->pu1_cur_pic_chroma;
+
+    chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
+
+    ASSERT(PSLICE == ps_slice_hdr->i1_slice_type || BSLICE == ps_slice_hdr->i1_slice_type);
+
+    ref_pic_luma_l0 = NULL;
+    ref_pic_chroma_l0 = NULL;
+
+    luma_weight_l0 = 0;
+    chroma_weight_l0_cb = 0;
+    chroma_weight_l0_cr = 0;
+
+    luma_offset_l0 = 0;
+    chroma_offset_l0_cb = 0;
+    chroma_offset_l0_cr = 0;
+
+    ref_pic_luma_l1 = NULL;
+    ref_pic_chroma_l1 = NULL;
+
+    luma_weight_l1 = 0;
+    chroma_weight_l1_cb = 0;
+    chroma_weight_l1_cr = 0;
+
+    luma_offset_l1 = 0;
+    chroma_offset_l1_cb = 0;
+    chroma_offset_l1_cr = 0;
+
+    for(pu_indx = 0; pu_indx < i4_pu_cnt; pu_indx++, ps_pu++)
+    {
+        /* If the PU is intra then proceed to the next */
+        if(1 == ps_pu->b1_intra_flag)
+            continue;
+        pu_x = (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size) + (ps_pu->b4_pos_x << 2);
+        pu_y = (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size) + (ps_pu->b4_pos_y << 2);
+
+        pu_wd = (ps_pu->b4_wd + 1) << 2;
+        pu_ht = (ps_pu->b4_ht + 1) << 2;
+
+        weighted_pred = (ps_slice_hdr->i1_slice_type == PSLICE) ? ps_pps->i1_weighted_pred_flag :
+                        ps_pps->i1_weighted_bipred_flag;
+        bi_pred = (ps_pu->b2_pred_mode == PRED_BI);
+
+#ifdef GPU_BUILD
+        if(ps_proc->u4_gpu_inter_flag == 1)
+        {
+            /* Only 16x16 PUs have been implemented on the OpenCL device */
+            if((pu_wd % 16 == 0) && (pu_ht % 16 == 0) && (weighted_pred == 0))
+            {
+                //printf("Skipping Inter\n");
+                continue;
+            }
+        }
+#endif
+        if(ps_pu->b2_pred_mode != PRED_L1)
+        {
+            pic_buf_t *ps_pic_buf_l0;
+
+            ps_pic_buf_l0 = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list0[ps_pu->mv.i1_l0_ref_idx].pv_pic_buf));
+
+            ref_pic_luma_l0 = ps_pic_buf_l0->pu1_luma;
+            ref_pic_chroma_l0 = ps_pic_buf_l0->pu1_chroma;
+
+            luma_weight_l0 = ps_slice_hdr->s_wt_ofst.i2_luma_weight_l0[ps_pu->mv.i1_l0_ref_idx];
+            chroma_weight_l0_cb = ps_slice_hdr->s_wt_ofst.i2_chroma_weight_l0_cb[ps_pu->mv.i1_l0_ref_idx];
+            chroma_weight_l0_cr = ps_slice_hdr->s_wt_ofst.i2_chroma_weight_l0_cr[ps_pu->mv.i1_l0_ref_idx];
+
+            luma_offset_l0 = ps_slice_hdr->s_wt_ofst.i2_luma_offset_l0[ps_pu->mv.i1_l0_ref_idx];
+            chroma_offset_l0_cb = ps_slice_hdr->s_wt_ofst.i2_chroma_offset_l0_cb[ps_pu->mv.i1_l0_ref_idx];
+            chroma_offset_l0_cr = ps_slice_hdr->s_wt_ofst.i2_chroma_offset_l0_cr[ps_pu->mv.i1_l0_ref_idx];
+        }
+
+        if(ps_pu->b2_pred_mode != PRED_L0)
+        {
+            pic_buf_t *ps_pic_buf_l1;
+            ps_pic_buf_l1 = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list1[ps_pu->mv.i1_l1_ref_idx].pv_pic_buf));
+            ref_pic_luma_l1 = ps_pic_buf_l1->pu1_luma;
+            ref_pic_chroma_l1 = ps_pic_buf_l1->pu1_chroma;
+
+            luma_weight_l1 = ps_slice_hdr->s_wt_ofst.i2_luma_weight_l1[ps_pu->mv.i1_l1_ref_idx];
+            chroma_weight_l1_cb = ps_slice_hdr->s_wt_ofst.i2_chroma_weight_l1_cb[ps_pu->mv.i1_l1_ref_idx];
+            chroma_weight_l1_cr = ps_slice_hdr->s_wt_ofst.i2_chroma_weight_l1_cr[ps_pu->mv.i1_l1_ref_idx];
+
+            luma_offset_l1 = ps_slice_hdr->s_wt_ofst.i2_luma_offset_l1[ps_pu->mv.i1_l1_ref_idx];
+            chroma_offset_l1_cb = ps_slice_hdr->s_wt_ofst.i2_chroma_offset_l1_cb[ps_pu->mv.i1_l1_ref_idx];
+            chroma_offset_l1_cr = ps_slice_hdr->s_wt_ofst.i2_chroma_offset_l1_cr[ps_pu->mv.i1_l1_ref_idx];
+        }
+
+        /*luma and chroma components*/
+        for(clr_indx = 0; clr_indx < 2; clr_indx++)
+        {
+            PROFILE_DISABLE_INTER_PRED_LUMA(clr_indx);
+            PROFILE_DISABLE_INTER_PRED_CHROMA(clr_indx);
+
+            if(clr_indx == 0)
+            {
+                WORD32 mv;
+                if(ps_pu->b2_pred_mode != PRED_L1)
+                {
+                    mv = CLIP3(ps_pu->mv.s_l0_mv.i2_mvx, (-((MAX_CTB_SIZE + pu_x + 7) << 2)), ((ps_sps->i2_pic_width_in_luma_samples - pu_x + 7) << 2));
+                    ai2_xint[0] = pu_x + (mv >> 2);
+                    ai2_xfrac[0] = mv & 3;
+
+                    mv = CLIP3(ps_pu->mv.s_l0_mv.i2_mvy, (-((MAX_CTB_SIZE + pu_y + 7) << 2)), ((ps_sps->i2_pic_height_in_luma_samples - pu_y + 7) << 2));
+                    ai2_yint[0] = pu_y + (mv >> 2);
+                    ai2_yfrac[0] = mv & 3;
+
+                    ai2_xfrac[0] &= ps_codec->i4_mv_frac_mask;
+                    ai2_yfrac[0] &= ps_codec->i4_mv_frac_mask;
+
+
+                    ref_pic_l0 = ref_pic_luma_l0 + ai2_yint[0] * ref_strd
+                                    + ai2_xint[0];
+                }
+
+                if(ps_pu->b2_pred_mode != PRED_L0)
+                {
+
+                    mv = CLIP3(ps_pu->mv.s_l1_mv.i2_mvx, (-((MAX_CTB_SIZE + pu_x + 7) << 2)), ((ps_sps->i2_pic_width_in_luma_samples - pu_x + 7) << 2));
+                    ai2_xint[1] = pu_x + (mv >> 2);
+                    ai2_xfrac[1] = mv & 3;
+
+                    mv = CLIP3(ps_pu->mv.s_l1_mv.i2_mvy, (-((MAX_CTB_SIZE + pu_y + 7) << 2)), ((ps_sps->i2_pic_height_in_luma_samples - pu_y + 7) << 2));
+                    ai2_yint[1] = pu_y + (mv >> 2);
+                    ai2_yfrac[1] = mv & 3;
+
+                    ref_pic_l1 = ref_pic_luma_l1 + ai2_yint[1] * ref_strd
+                                    + ai2_xint[1];
+                    ai2_xfrac[1] &= ps_codec->i4_mv_frac_mask;
+                    ai2_yfrac[1] &= ps_codec->i4_mv_frac_mask;
+
+                }
+
+                pu1_dst = pu1_dst_luma + pu_y * ref_strd + pu_x;
+
+                ntaps = NTAPS_LUMA;
+                coeff = luma_filter;
+            }
+
+            else
+            {
+                WORD32 mv;
+                /* xint is upshifted by 1 because the chroma components are */
+                /* interleaved, which is not the assumption made by the standard */
+                if(ps_pu->b2_pred_mode != PRED_L1)
+                {
+                    mv = CLIP3(ps_pu->mv.s_l0_mv.i2_mvx, (-((MAX_CTB_SIZE + pu_x + 7) << 2)), ((ps_sps->i2_pic_width_in_luma_samples - pu_x + 7) << 2));
+                    ai2_xint[0] = (pu_x / 2 + (mv >> 3)) << 1;
+                    ai2_xfrac[0] = mv & 7;
+
+                    mv = CLIP3(ps_pu->mv.s_l0_mv.i2_mvy, (-((MAX_CTB_SIZE + pu_y + 7) << 2)), ((ps_sps->i2_pic_height_in_luma_samples - pu_y + 7) << 2));
+                    ai2_yint[0] = pu_y / 2 + (mv >> 3);
+                    ai2_yfrac[0] = mv & 7;
+
+                    ref_pic_l0 = ref_pic_chroma_l0 + ai2_yint[0] * ref_strd
+                                    + ai2_xint[0];
+
+                    ai2_xfrac[0] &= ps_codec->i4_mv_frac_mask;
+                    ai2_yfrac[0] &= ps_codec->i4_mv_frac_mask;
+
+                }
+
+                if(ps_pu->b2_pred_mode != PRED_L0)
+                {
+                    mv = CLIP3(ps_pu->mv.s_l1_mv.i2_mvx, (-((MAX_CTB_SIZE + pu_x + 7) << 2)), ((ps_sps->i2_pic_width_in_luma_samples - pu_x + 7) << 2));
+                    ai2_xint[1] = (pu_x / 2 + (mv >> 3)) << 1;
+                    ai2_xfrac[1] = mv & 7;
+
+                    mv = CLIP3(ps_pu->mv.s_l1_mv.i2_mvy, (-((MAX_CTB_SIZE + pu_y + 7) << 2)), ((ps_sps->i2_pic_height_in_luma_samples - pu_y + 7) << 2));
+                    ai2_yint[1] = pu_y / 2 + (mv >> 3);
+                    ai2_yfrac[1] = mv & 7;
+
+                    ref_pic_l1 = ref_pic_chroma_l1 + ai2_yint[1] * ref_strd
+                                    + ai2_xint[1];
+                    ai2_xfrac[1] &= ps_codec->i4_mv_frac_mask;
+                    ai2_yfrac[1] &= ps_codec->i4_mv_frac_mask;
+
+                }
+
+                pu1_dst = pu1_dst_chroma + pu_y * ref_strd / 2 + pu_x;
+
+                ntaps = NTAPS_CHROMA;
+                coeff = chroma_filter;
+            }
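+            /* Illustrative example: since luma MVs are in quarter-pel units,
+             * a luma MV component of 13 corresponds to 13/8 chroma pels, so
+             * mv >> 3 = 1 gives the integer chroma offset and mv & 7 = 5
+             * selects the 5/8-pel row of chroma_filter; the x offset is then
+             * doubled because the Cb/Cr samples are interleaved. */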
+
+            if(ps_pu->b2_pred_mode != PRED_L1)
+            {
+                func_indx1 = 4 * (weighted_pred || bi_pred) + 1 + 11 * clr_indx;
+                func_indx1 += ai2_xfrac[0] ? 2 : 0;
+                func_indx1 += ai2_yfrac[0] ? 1 : 0;
+
+                func_indx2 = (ai2_xfrac[0] && ai2_yfrac[0])
+                                * (9 + (weighted_pred || bi_pred)) + 11 * clr_indx;
+
+                func_ptr1 = ps_codec->apf_inter_pred[func_indx1];
+                func_ptr2 = ps_codec->apf_inter_pred[func_indx2];
+            }
+            else
+            {
+                func_ptr1 = NULL;
+                func_ptr2 = NULL;
+            }
+            if(ps_pu->b2_pred_mode != PRED_L0)
+            {
+                func_indx3 = 4 * (weighted_pred || bi_pred) + 1 + 11 * clr_indx;
+                func_indx3 += ai2_xfrac[1] ? 2 : 0;
+                func_indx3 += ai2_yfrac[1] ? 1 : 0;
+
+                func_indx4 = (ai2_xfrac[1] && ai2_yfrac[1])
+                                * (9 + (weighted_pred || bi_pred)) + 11 * clr_indx;
+
+                func_ptr3 = ps_codec->apf_inter_pred[func_indx3];
+                func_ptr4 = ps_codec->apf_inter_pred[func_indx4];
+            }
+            else
+            {
+                func_ptr3 = NULL;
+                func_ptr4 = NULL;
+            }
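+            /* Worked example of the indexing above (illustrative): for a luma
+             * PU (clr_indx = 0) with bi-prediction and both fractional
+             * components non-zero, func_indx1 = 4 * 1 + 1 + 2 + 1 = 8 and
+             * func_indx2 = 1 * (9 + 1) = 10, i.e. a horizontal filter into a
+             * temporary buffer followed by a vertical filter. */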
+
+            /*Function 1*/
+            if(func_ptr1 != NULL)
+            {
+                func_src_strd = ref_strd;
+                func_src = (ai2_xfrac[0] && ai2_yfrac[0]) ?
+                                ref_pic_l0 - (ntaps / 2 - 1) * func_src_strd :
+                                ref_pic_l0;
+                func_dst = (weighted_pred || bi_pred) ?
+                                (void *)pi2_tmp1 : (void *)pu1_dst;
+                if(ai2_xfrac[0] && ai2_yfrac[0])
+                {
+                    func_dst = pi2_tmp1;
+                }
+
+                func_dst_strd = (weighted_pred || bi_pred
+                                || (ai2_xfrac[0] && ai2_yfrac[0])) ?
+                                pu_wd : ref_strd;
+                func_coeff = ai2_xfrac[0] ?
+                                coeff[ai2_xfrac[0]] : coeff[ai2_yfrac[0]];
+                func_wd = pu_wd >> clr_indx;
+                func_ht = pu_ht >> clr_indx;
+                func_ht += (ai2_xfrac[0] && ai2_yfrac[0]) ? ntaps - 1 : 0;
+                func_ptr1(func_src, func_dst, func_src_strd, func_dst_strd,
+                          func_coeff, func_ht, func_wd);
+            }
+
+            /*Function 2*/
+            if(func_ptr2 != NULL)
+            {
+                func_src_strd = pu_wd;
+                func_src = pi2_tmp1 + (ntaps / 2 - 1) * func_src_strd;
+                func_dst = (weighted_pred || bi_pred) ?
+                                (void *)pi2_tmp1 : (void *)pu1_dst;
+
+                func_dst_strd = (weighted_pred || bi_pred) ?
+                                pu_wd : ref_strd;
+                func_coeff = coeff[ai2_yfrac[0]];
+                func_wd = pu_wd >> clr_indx;
+                func_ht = pu_ht >> clr_indx;
+                func_ptr2(func_src, func_dst, func_src_strd, func_dst_strd,
+                          func_coeff, func_ht, func_wd);
+            }
+
+            if(func_ptr3 != NULL)
+            {
+                func_src_strd = ref_strd;
+                func_src = (ai2_xfrac[1] && ai2_yfrac[1]) ?
+                                ref_pic_l1 - (ntaps / 2 - 1) * func_src_strd :
+                                ref_pic_l1;
+
+                func_dst = (weighted_pred || bi_pred) ?
+                                (void *)pi2_tmp2 : (void *)pu1_dst;
+                if(ai2_xfrac[1] && ai2_yfrac[1])
+                {
+                    func_dst = pi2_tmp2;
+                }
+                func_dst_strd = (weighted_pred || bi_pred
+                                || (ai2_xfrac[1] && ai2_yfrac[1])) ?
+                                pu_wd : ref_strd;
+                func_coeff = ai2_xfrac[1] ?
+                                coeff[ai2_xfrac[1]] : coeff[ai2_yfrac[1]];
+                func_wd = pu_wd >> clr_indx;
+                func_ht = pu_ht >> clr_indx;
+                func_ht += (ai2_xfrac[1] && ai2_yfrac[1]) ? ntaps - 1 : 0;
+                func_ptr3(func_src, func_dst, func_src_strd, func_dst_strd,
+                          func_coeff, func_ht, func_wd);
+
+            }
+
+            if(func_ptr4 != NULL)
+            {
+                func_src_strd = pu_wd;
+                func_src = pi2_tmp2 + (ntaps / 2 - 1) * func_src_strd;
+
+                func_dst = (weighted_pred || bi_pred) ?
+                                (void *)pi2_tmp2 : (void *)pu1_dst;
+                func_dst_strd = (weighted_pred || bi_pred) ?
+                                pu_wd : ref_strd;
+                func_coeff = coeff[ai2_yfrac[1]];
+                func_wd = pu_wd >> clr_indx;
+                func_ht = pu_ht >> clr_indx;
+                func_ptr4(func_src, func_dst, func_src_strd, func_dst_strd,
+                          func_coeff, func_ht, func_wd);
+
+            }
+
+            PROFILE_DISABLE_INTER_PRED_LUMA_AVERAGING(clr_indx);
+            PROFILE_DISABLE_INTER_PRED_CHROMA_AVERAGING(clr_indx);
+
+
+            if((weighted_pred != 0) && (bi_pred != 0))
+            {
+                lvl_shift1 = 0;
+                lvl_shift2 = 0;
+                if((0 == clr_indx) && (ai2_xfrac[0] && ai2_yfrac[0]))
+                    lvl_shift1 = (1 << 13);
+
+                if((0 == clr_indx) && (ai2_xfrac[1] && ai2_yfrac[1]))
+                    lvl_shift2 = (1 << 13);
+
+
+                if(0 == clr_indx)
+                {
+                    shift = ps_slice_hdr->s_wt_ofst.i1_luma_log2_weight_denom
+                                    + SHIFT_14_MINUS_BIT_DEPTH + 1;
+
+                    ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr(pi2_tmp1,
+                                                                          pi2_tmp2,
+                                                                          pu1_dst,
+                                                                          pu_wd,
+                                                                          pu_wd,
+                                                                          ref_strd,
+                                                                          luma_weight_l0,
+                                                                          luma_offset_l0,
+                                                                          luma_weight_l1,
+                                                                          luma_offset_l1,
+                                                                          shift,
+                                                                          lvl_shift1,
+                                                                          lvl_shift2,
+                                                                          pu_ht,
+                                                                          pu_wd);
+                }
+                else
+                {
+                    shift = ps_slice_hdr->s_wt_ofst.i1_chroma_log2_weight_denom
+                                    + SHIFT_14_MINUS_BIT_DEPTH + 1;
+
+                    if(chroma_yuv420sp_vu)
+                    {
+                        ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr(pi2_tmp1,
+                                                                                     pi2_tmp2,
+                                                                                     pu1_dst,
+                                                                                     pu_wd,
+                                                                                     pu_wd,
+                                                                                     ref_strd,
+                                                                                     chroma_weight_l0_cr,
+                                                                                     chroma_weight_l0_cb,
+                                                                                     chroma_offset_l0_cr,
+                                                                                     chroma_offset_l0_cb,
+                                                                                     chroma_weight_l1_cr,
+                                                                                     chroma_weight_l1_cb,
+                                                                                     chroma_offset_l1_cr,
+                                                                                     chroma_offset_l1_cb,
+                                                                                     shift,
+                                                                                     lvl_shift1,
+                                                                                     lvl_shift2,
+                                                                                     pu_ht >> 1,
+                                                                                     pu_wd >> 1);
+                    }
+                    else
+                    {
+                        ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr(pi2_tmp1,
+                                                                                     pi2_tmp2,
+                                                                                     pu1_dst,
+                                                                                     pu_wd,
+                                                                                     pu_wd,
+                                                                                     ref_strd,
+                                                                                     chroma_weight_l0_cb,
+                                                                                     chroma_weight_l0_cr,
+                                                                                     chroma_offset_l0_cb,
+                                                                                     chroma_offset_l0_cr,
+                                                                                     chroma_weight_l1_cb,
+                                                                                     chroma_weight_l1_cr,
+                                                                                     chroma_offset_l1_cb,
+                                                                                     chroma_offset_l1_cr,
+                                                                                     shift,
+                                                                                     lvl_shift1,
+                                                                                     lvl_shift2,
+                                                                                     pu_ht >> 1,
+                                                                                     pu_wd >> 1);
+                    }
+                }
+            }
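+            /* For reference (illustrative, following the HEVC explicit
+             * weighted bi-prediction equation for 8-bit): dst = CLIP3(0, 255,
+             * (p0 * w0 + p1 * w1 + ((o0 + o1 + 1) << (shift - 1))) >> shift),
+             * with shift = log2_weight_denom + SHIFT_14_MINUS_BIT_DEPTH + 1 as
+             * computed above; lvl_shift1/lvl_shift2 compensate the bias of the
+             * two-stage fractional filtering. */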
+
+            else if((weighted_pred != 0) && (bi_pred == 0))
+            {
+                lvl_shift1 = 0;
+                if(ps_pu->b2_pred_mode == PRED_L0)
+                {
+                    if((0 == clr_indx) && (ai2_xfrac[0] && ai2_yfrac[0]))
+                        lvl_shift1 = (1 << 13);
+                }
+                else
+                {
+                    if((0 == clr_indx) && (ai2_xfrac[1] && ai2_yfrac[1]))
+                        lvl_shift1 = (1 << 13);
+                }
+
+                if(0 == clr_indx)
+                {
+                    shift = ps_slice_hdr->s_wt_ofst.i1_luma_log2_weight_denom
+                                    + SHIFT_14_MINUS_BIT_DEPTH;
+
+                    ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr(ps_pu->b2_pred_mode == PRED_L0 ? pi2_tmp1 : pi2_tmp2,
+                                                                           pu1_dst,
+                                                                           pu_wd,
+                                                                           ref_strd,
+                                                                           ps_pu->b2_pred_mode == PRED_L0 ? luma_weight_l0 : luma_weight_l1,
+                                                                           ps_pu->b2_pred_mode == PRED_L0 ? luma_offset_l0 : luma_offset_l1,
+                                                                           shift,
+                                                                           lvl_shift1,
+                                                                           pu_ht,
+                                                                           pu_wd);
+                }
+                else
+                {
+                    shift = ps_slice_hdr->s_wt_ofst.i1_chroma_log2_weight_denom
+                                    + SHIFT_14_MINUS_BIT_DEPTH;
+
+                    if(chroma_yuv420sp_vu)
+                    {
+                        ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr(ps_pu->b2_pred_mode == PRED_L0 ? pi2_tmp1 : pi2_tmp2,
+                                                                                      pu1_dst,
+                                                                                      pu_wd,
+                                                                                      ref_strd,
+                                                                                      ps_pu->b2_pred_mode == PRED_L0 ? chroma_weight_l0_cr : chroma_weight_l1_cr,
+                                                                                      ps_pu->b2_pred_mode == PRED_L0 ? chroma_weight_l0_cb : chroma_weight_l1_cb,
+                                                                                      ps_pu->b2_pred_mode == PRED_L0 ? chroma_offset_l0_cr : chroma_offset_l1_cr,
+                                                                                      ps_pu->b2_pred_mode == PRED_L0 ? chroma_offset_l0_cb : chroma_offset_l1_cb,
+                                                                                      shift,
+                                                                                      lvl_shift1,
+                                                                                      pu_ht >> 1,
+                                                                                      pu_wd >> 1);
+                    }
+                    else
+                    {
+                        ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr(ps_pu->b2_pred_mode == PRED_L0 ? pi2_tmp1 : pi2_tmp2,
+                                                                                      pu1_dst,
+                                                                                      pu_wd,
+                                                                                      ref_strd,
+                                                                                      ps_pu->b2_pred_mode == PRED_L0 ? chroma_weight_l0_cb : chroma_weight_l1_cb,
+                                                                                      ps_pu->b2_pred_mode == PRED_L0 ? chroma_weight_l0_cr : chroma_weight_l1_cr,
+                                                                                      ps_pu->b2_pred_mode == PRED_L0 ? chroma_offset_l0_cb : chroma_offset_l1_cb,
+                                                                                      ps_pu->b2_pred_mode == PRED_L0 ? chroma_offset_l0_cr : chroma_offset_l1_cr,
+                                                                                      shift,
+                                                                                      lvl_shift1,
+                                                                                      pu_ht >> 1,
+                                                                                      pu_wd >> 1);
+                    }
+                }
+            }
+
+            else if((weighted_pred == 0) && (bi_pred != 0))
+            {
+                lvl_shift1 = 0;
+                lvl_shift2 = 0;
+                if((0 == clr_indx) && (ai2_xfrac[0] && ai2_yfrac[0]))
+                    lvl_shift1 = (1 << 13);
+
+                if((0 == clr_indx) && (ai2_xfrac[1] && ai2_yfrac[1]))
+                    lvl_shift2 = (1 << 13);
+
+                if(clr_indx != 0)
+                {
+                    pu_ht = (pu_ht >> 1);
+                }
+                ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr(pi2_tmp1,
+                                                                              pi2_tmp2,
+                                                                              pu1_dst,
+                                                                              pu_wd,
+                                                                              pu_wd,
+                                                                              ref_strd,
+                                                                              lvl_shift1,
+                                                                              lvl_shift2,
+                                                                              pu_ht,
+                                                                              pu_wd);
+
+            }
+        }
+    }
+}
diff --git a/decoder/ihevcd_inter_pred.h b/decoder/ihevcd_inter_pred.h
new file mode 100644
index 0000000..0c510d2
--- /dev/null
+++ b/decoder/ihevcd_inter_pred.h
@@ -0,0 +1,43 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_inter_pred.h
+*
+* @brief
+*  Inter prediction for a CTB
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVCD_INTER_PRED_H_
+#define IHEVCD_INTER_PRED_H_
+
+void ihevcd_inter_pred_ctb(process_ctxt_t *ps_proc);
+
+
+#endif /* IHEVCD_INTER_PRED_H_ */
diff --git a/decoder/ihevcd_intra_pred_mode_prediction.c b/decoder/ihevcd_intra_pred_mode_prediction.c
new file mode 100644
index 0000000..b5936c4
--- /dev/null
+++ b/decoder/ihevcd_intra_pred_mode_prediction.c
@@ -0,0 +1,323 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_intra_pred_mode_prediction.c
+ *
+ * @brief
+ *  Contains functions for intra pred mode prediction
+ *
+ * @author
+ *  Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_intra_pred_mode_prediction()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevcd_defs.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+
+#include "ihevcd_bitstream.h"
+
+
+/*****************************************************************************/
+/* Function Prototypes                                                       */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Availability check is not done inside the function                        */
+/* Whenever the top and left are not available, it is assumed that Intra DC  */
+/*                  mode will initialized in place of non available          */
+/*                  neighbors                                                */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief Computes intra prediction mode for a CU
+*
+* @par   Description
+* Computes intra prediction mode for a CU
+*
+* @param[in,out] ps_cu
+* Coding unit context
+*
+* @param[in] ps_parse
+* parse context
+*
+* @param[in] ps_codec
+* codec context
+*
+* @param[in] log2_cb_size
+* Log to the base 2 of the CB size
+*
+* @returns none
+*
+* @remarks
+* Availability check is moved to CTB level. If the neighbors are
+*  not available or if the prediction mode of a neighbor is not MODE_INTRA,
+*  INTRA_DC mode will be updated in top and left buffers.
+*******************************************************************************
+*/
+void ihevcd_intra_pred_mode_prediction(codec_t *ps_codec,
+                                       WORD32 log2_cb_size,
+                                       WORD32 x0,
+                                       WORD32 y0)
+{
+    WORD32 i, j, num_pred_blocks;
+    WORD32 available_l, available_t;
+    WORD32 cand_intra_pred_mode_l, cand_intra_pred_mode_t;
+    WORD32 cand_mode_list[3];
+    WORD32 cb_size, block_offset_in_min_pu;
+    UWORD8 *pu1_luma_intra_pred_mode_top;
+    UWORD8 *pu1_luma_intra_pred_mode_left;
+
+    parse_ctxt_t *ps_parse = &ps_codec->s_parse;
+    parse_cu_t *ps_cu = &ps_codec->s_parse.s_cu;
+    sps_t *ps_sps = ps_parse->ps_sps;
+
+
+    available_t = 1;
+    available_l = 1;
+    /* i4_pos_x and i4_pos_y are in minCu units (8x8), convert them to 4x4 units by multiplying by 2 */
+    pu1_luma_intra_pred_mode_top = ps_parse->pu1_luma_intra_pred_mode_top
+                    + (ps_cu->i4_pos_x * 2);
+
+    pu1_luma_intra_pred_mode_left = ps_parse->pu1_luma_intra_pred_mode_left
+                    + (ps_cu->i4_pos_y * 2);
+
+/*
+    if(0 == ps_cu->i4_pos_y)
+    {
+        memset(pu1_luma_intra_pred_mode_top, INTRA_DC, 16);
+    }
+
+    if(0 == ps_cu->i4_pos_x)
+    {
+        memset(pu1_luma_intra_pred_mode_left, INTRA_DC, 16);
+    }
+*/
+    if(ps_cu->i4_pos_y)
+    {
+        UWORD8 *pu1_pic_intra_flag = ps_codec->s_parse.pu1_pic_intra_flag;
+        WORD32 top_intra_flag;
+
+        WORD32 numbytes_row =  (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+        pu1_pic_intra_flag += ((y0 - 8) / 8) * numbytes_row;
+        pu1_pic_intra_flag += (x0 / 64);
+        top_intra_flag = *pu1_pic_intra_flag;
+        top_intra_flag &= (1 << ((x0 / 8) % 8));
+
+        if(0 == top_intra_flag)
+        {
+            available_t = 0;
+        }
+    }
+    else
+        available_t = 0;
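+    /* Illustrative: pu1_pic_intra_flag packs one flag per 8x8 block, eight
+     * blocks (64 luma samples) per byte, with one row of bytes per 8 luma
+     * lines; e.g. for x0 = 24, y0 = 16 the top neighbour's flag is bit
+     * (24 / 8) % 8 = 3 of byte 1 * numbytes_row + 0. */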
+
+
+    if((0 == ps_cu->i4_pos_x) && (((0 == ps_codec->s_parse.i4_ctb_slice_x) && (0 == ps_codec->s_parse.i4_ctb_slice_y)) ||
+                                  (0 == ps_codec->s_parse.i4_ctb_tile_x)))
+    {
+        available_l = 0;
+    }
+
+    if(available_l)
+    {
+        UWORD8 *pu1_pic_intra_flag = ps_codec->s_parse.pu1_pic_intra_flag;
+        WORD32 left_intra_flag;
+        WORD32 numbytes_row =  (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+        pu1_pic_intra_flag += (y0 / 8) * numbytes_row;
+        pu1_pic_intra_flag += ((x0 - 8) / 64);
+        left_intra_flag = *pu1_pic_intra_flag;
+        left_intra_flag &= (1 << (((x0 - 8) / 8) % 8));
+
+        if(0 == left_intra_flag)
+        {
+            available_l = 0;
+        }
+    }
+
+    cb_size = (1 << log2_cb_size);
+
+    block_offset_in_min_pu = (cb_size / 2) / MIN_PU_SIZE;
+
+    num_pred_blocks = (ps_cu->i4_part_mode == PART_NxN) ? 2 : 1;
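+    /* Illustrative (assuming MIN_PU_SIZE is 4): for a 16x16 CB coded as
+     * PART_NxN, block_offset_in_min_pu = (16 / 2) / 4 = 2 and
+     * num_pred_blocks = 2, so the four 8x8 prediction blocks are visited as
+     * a 2x2 grid with their modes stored at 4x4 granularity. */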
+
+    for(i = 0; i < num_pred_blocks; i++)
+    {
+        WORD32 available_l_tmp;
+        available_l_tmp = available_l;
+        for(j = 0; j < num_pred_blocks; j++)
+        {
+            /* Computing Candidate intra pred mode left */
+            {
+                WORD32 block_offset;
+
+                block_offset = i * block_offset_in_min_pu;
+                cand_intra_pred_mode_l = INTRA_DC;
+                if(available_l_tmp)
+                {
+                    cand_intra_pred_mode_l =
+                                    pu1_luma_intra_pred_mode_left[block_offset];
+                }
+
+            }
+
+            {
+                WORD32 block_offset;
+                block_offset = j * block_offset_in_min_pu;
+                cand_intra_pred_mode_t = INTRA_DC;
+                if(available_t)
+                {
+                    cand_intra_pred_mode_t =
+                                    pu1_luma_intra_pred_mode_top[block_offset];
+                }
+            }
+
+            /* Computing Candidate mode list */
+            if(cand_intra_pred_mode_l == cand_intra_pred_mode_t)
+            {
+                if(cand_intra_pred_mode_l < 2)
+                {
+                    cand_mode_list[0] = INTRA_PLANAR;
+                    cand_mode_list[1] = INTRA_DC;
+                    cand_mode_list[2] = INTRA_ANGULAR(26); /* angular 26 = Vertical */
+                }
+                else
+                {
+                    cand_mode_list[0] = cand_intra_pred_mode_l;
+                    cand_mode_list[1] = 2
+                                    + ((cand_intra_pred_mode_l + 29) % 32);
+                    cand_mode_list[2] = 2
+                                    + ((cand_intra_pred_mode_l - 2 + 1) % 32);
+                }
+            }
+            else
+            {
+                cand_mode_list[0] = cand_intra_pred_mode_l;
+                cand_mode_list[1] = cand_intra_pred_mode_t;
+
+                if((cand_intra_pred_mode_l != INTRA_PLANAR)
+                                && (cand_intra_pred_mode_t != INTRA_PLANAR))
+                {
+                    cand_mode_list[2] = INTRA_PLANAR;
+                }
+                else if((cand_intra_pred_mode_l != INTRA_DC)
+                                && (cand_intra_pred_mode_t != INTRA_DC))
+                {
+                    cand_mode_list[2] = INTRA_DC;
+                }
+                else
+                {
+                    cand_mode_list[2] = INTRA_ANGULAR(26);
+                }
+            }
+
+            /* Computing Intra pred mode */
+            if(ps_cu->ai4_prev_intra_luma_pred_flag[2 * i + j] == 1)
+            {
+                ps_cu->ai4_intra_luma_pred_mode[2 * i + j] =
+                                cand_mode_list[ps_cu->ai4_mpm_idx[2 * i + j]];
+            }
+            else
+            {
+                WORD32 intra_pred_mode;
+                /* Arranging cand_mode_list in increasing order */
+                if(cand_mode_list[0] > cand_mode_list[1])
+                {
+                    SWAP(cand_mode_list[0], cand_mode_list[1]);
+                }
+                if(cand_mode_list[0] > cand_mode_list[2])
+                {
+                    SWAP(cand_mode_list[0], cand_mode_list[2]);
+                }
+                if(cand_mode_list[1] > cand_mode_list[2])
+                {
+                    SWAP(cand_mode_list[1], cand_mode_list[2]);
+                }
+
+                intra_pred_mode = ps_cu->ai4_rem_intra_luma_pred_mode[2 * i + j];
+
+                if(intra_pred_mode >= cand_mode_list[0])
+                    intra_pred_mode++;
+
+                if(intra_pred_mode >= cand_mode_list[1])
+                    intra_pred_mode++;
+
+                if(intra_pred_mode >= cand_mode_list[2])
+                    intra_pred_mode++;
+
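+                /* Illustrative trace: with sorted cand_mode_list = {0, 1, 26}
+                 * and rem_intra_luma_pred_mode = 25, the three compares give
+                 * 25 -> 26 -> 27 -> 28, skipping past each MPM candidate. */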
+                ps_cu->ai4_intra_luma_pred_mode[2 * i + j] = intra_pred_mode;
+            }
+            /* Update Top and Left intra pred mode */
+            {
+                WORD32 intra_pred_mode;
+
+                intra_pred_mode = ps_cu->ai4_intra_luma_pred_mode[2 * i + j];
+
+                ps_codec->s_func_selector.ihevc_memset_fptr(pu1_luma_intra_pred_mode_left + i * block_offset_in_min_pu, intra_pred_mode, (cb_size / num_pred_blocks) / MIN_PU_SIZE);
+                ps_codec->s_func_selector.ihevc_memset_fptr(pu1_luma_intra_pred_mode_top + j * block_offset_in_min_pu, intra_pred_mode, (cb_size / num_pred_blocks) / MIN_PU_SIZE);
+
+            }
+            /* If partition is PART_NxN, then left is available for second column always */
+            available_l_tmp = 1;
+
+        }
+        /* If partition is PART_NxN, then top is available for bottom row always */
+        available_t = 1;
+    }
+
+    /* In case it is PART_2Nx2N partition, replicate intra pred mode in other three entries */
+    if(ps_cu->i4_part_mode == PART_2Nx2N)
+    {
+        ps_cu->ai4_intra_luma_pred_mode[1] = ps_cu->ai4_intra_luma_pred_mode[0];
+        ps_cu->ai4_intra_luma_pred_mode[2] = ps_cu->ai4_intra_luma_pred_mode[0];
+        ps_cu->ai4_intra_luma_pred_mode[3] = ps_cu->ai4_intra_luma_pred_mode[0];
+    }
+}
+
diff --git a/decoder/ihevcd_intra_pred_mode_prediction.h b/decoder/ihevcd_intra_pred_mode_prediction.h
new file mode 100644
index 0000000..683a7f4
--- /dev/null
+++ b/decoder/ihevcd_intra_pred_mode_prediction.h
@@ -0,0 +1,45 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_intra_pred_mode_prediction.h
+ *
+ * @brief
+ *  Contains functions for intra pred mode prediction
+ *
+ * @author
+ *  Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_intra_pred_mode_prediction()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVCD_INTRA_PRED_MODE_PREDICTION_H_
+#define _IHEVCD_INTRA_PRED_MODE_PREDICTION_H_
+void ihevcd_intra_pred_mode_prediction(codec_t *ps_codec,
+                                       WORD32 log2_cb_size,
+                                       WORD32 x0,
+                                       WORD32 y0);
+
+#endif /* _IHEVCD_INTRA_PRED_MODE_PREDICTION_H_ */
diff --git a/decoder/ihevcd_iquant_itrans_recon_ctb.c b/decoder/ihevcd_iquant_itrans_recon_ctb.c
new file mode 100644
index 0000000..1596660
--- /dev/null
+++ b/decoder/ihevcd_iquant_itrans_recon_ctb.c
@@ -0,0 +1,1273 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_iquant_itrans_recon_ctb.c
+ *
+ * @brief
+ *  Contains functions for inverse quantization, inverse transform and recon
+ *
+ * @author
+ *  Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_iquant_itrans_recon_ctb()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_bitstream.h"
+#include "ihevc_common_tables.h"
+
+/* Intra pred includes */
+#include "ihevc_intra_pred.h"
+
+/* Inverse transform common module includes */
+#include "ihevc_trans_tables.h"
+#include "ihevc_trans_macros.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_recon.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_chroma_recon.h"
+
+/* Decoder includes */
+#include "ihevcd_common_tables.h"
+#include "ihevcd_iquant_itrans_recon_ctb.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_profile.h"
+#include "ihevcd_statistics.h"
+#include "ihevcd_itrans_recon_dc.h"
+
+const UWORD32 gau4_ihevcd_4_bit_reverse[] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
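+/* Illustrative: each entry is the 4-bit reversal of its index, e.g.
+ * 1 (0001b) maps to 8 (1000b) and 3 (0011b) maps to 12 (1100b). */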
+
+
+/* Globals */
+WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES] =
+  { IP_FUNC_MODE_0, /* Mode 0 */
+    IP_FUNC_MODE_1, /* Mode 1 */
+    IP_FUNC_MODE_2, /* Mode 2 */
+    IP_FUNC_MODE_3TO9, /* Mode 3 */
+    IP_FUNC_MODE_3TO9, /* Mode 4 */
+    IP_FUNC_MODE_3TO9, /* Mode 5 */
+    IP_FUNC_MODE_3TO9, /* Mode 6 */
+    IP_FUNC_MODE_3TO9, /* Mode 7 */
+    IP_FUNC_MODE_3TO9, /* Mode 8 */
+    IP_FUNC_MODE_3TO9, /* Mode 9 */
+    IP_FUNC_MODE_10, /* Mode 10 */
+    IP_FUNC_MODE_11TO17, /* Mode 11 */
+    IP_FUNC_MODE_11TO17, /* Mode 12 */
+    IP_FUNC_MODE_11TO17, /* Mode 13 */
+    IP_FUNC_MODE_11TO17, /* Mode 14 */
+    IP_FUNC_MODE_11TO17, /* Mode 15 */
+    IP_FUNC_MODE_11TO17, /* Mode 16 */
+    IP_FUNC_MODE_11TO17, /* Mode 17 */
+    IP_FUNC_MODE_18_34, /* Mode 18 */
+    IP_FUNC_MODE_19TO25, /* Mode 19 */
+    IP_FUNC_MODE_19TO25, /* Mode 20 */
+    IP_FUNC_MODE_19TO25, /* Mode 21 */
+    IP_FUNC_MODE_19TO25, /* Mode 22 */
+    IP_FUNC_MODE_19TO25, /* Mode 23 */
+    IP_FUNC_MODE_19TO25, /* Mode 24 */
+    IP_FUNC_MODE_19TO25, /* Mode 25 */
+    IP_FUNC_MODE_26, /* Mode 26 */
+    IP_FUNC_MODE_27TO33, /* Mode 27 */
+    IP_FUNC_MODE_27TO33, /* Mode 28 */
+    IP_FUNC_MODE_27TO33, /* Mode 29 */
+    IP_FUNC_MODE_27TO33, /* Mode 30 */
+    IP_FUNC_MODE_27TO33, /* Mode 31 */
+    IP_FUNC_MODE_27TO33, /* Mode 32 */
+    IP_FUNC_MODE_27TO33, /* Mode 33 */
+    IP_FUNC_MODE_18_34, /* Mode 34 */
+};
+
+
+const WORD16 *g_ai2_ihevc_trans_tables[] =
+  { &g_ai2_ihevc_trans_dst_4[0][0],
+    &g_ai2_ihevc_trans_4[0][0],
+    &g_ai2_ihevc_trans_8[0][0],
+    &g_ai2_ihevc_trans_16[0][0],
+    &g_ai2_ihevc_trans_32[0][0]
+};
+
+
+/*****************************************************************************/
+/* Function Prototypes                                                       */
+/*****************************************************************************/
+/* Unpacks the coefficients of one TU; returns the updated pu1_tu_coeff_data pointer */
+UWORD8* ihevcd_unpack_coeffs(WORD16 *pi2_tu_coeff,
+                             WORD32 log2_trans_size,
+                             UWORD8 *pu1_tu_coeff_data,
+                             WORD16 *pi2_dequant_matrix,
+                             WORD32 qp_rem,
+                             WORD32 qp_div,
+                             TRANSFORM_TYPE e_trans_type,
+                             WORD32 trans_quant_bypass,
+                             UWORD32 *pu4_zero_cols,
+                             UWORD32 *pu4_zero_rows,
+                             UWORD32 *pu4_coeff_type,
+                             WORD16 *pi2_coeff_value)
+{
+    /* Generating coeffs from coeff-map */
+    WORD32 i;
+    WORD16 *pi2_sblk_ptr;
+    WORD32 subblk_pos_x, subblk_pos_y;
+    WORD32 sblk_scan_idx, coeff_raster_idx;
+    WORD32 sblk_non_zero_coeff_idx;
+    tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
+    UWORD8 u1_num_coded_sblks, u1_scan_type;
+    UWORD8 *pu1_new_tu_coeff_data;
+    WORD32 trans_size;
+    WORD32 xs, ys;
+    WORD32 trans_skip;
+    WORD16 iquant_out;
+    WORD32 shift_iq;
+    {
+        WORD32 bit_depth;
+
+        bit_depth = 8 + 0;
+        shift_iq = bit_depth + log2_trans_size - 5;
+    }
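+    /* Illustrative: for 8-bit content and a 16x16 TU (log2_trans_size = 4),
+     * shift_iq = 8 + 4 - 5 = 7, matching the HEVC inverse-scaling shift of
+     * bitDepth + log2(nTbS) - 5. */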
+    trans_size = (1 << log2_trans_size);
+
+    /* First byte points to number of coded blocks */
+    u1_num_coded_sblks = *pu1_tu_coeff_data++;
+
+    /* Next byte points to scan type */
+    u1_scan_type = *pu1_tu_coeff_data++;
+    /* 0th bit has trans_skip */
+    trans_skip = u1_scan_type & 1;
+    u1_scan_type >>= 1;
+
+    pi2_sblk_ptr = pi2_tu_coeff;
+
+    /* Initially all columns are assumed to be zero */
+    *pu4_zero_cols = 0xFFFFFFFF;
+    /* Initially all rows are assumed to be zero */
+    *pu4_zero_rows = 0xFFFFFFFF;
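+    /* One bit per column/row of the TU: a bit is cleared below as soon as a
+     * non-zero coefficient lands in that column/row, so that later stages
+     * can skip all-zero lines in the inverse transform. */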
+
+    ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)(pu1_tu_coeff_data);
+
+    if(trans_skip)
+        memset(pi2_tu_coeff, 0, trans_size * trans_size * sizeof(WORD16));
+
+    STATS_INIT_SBLK_AND_COEFF_POS();
+
+    /* DC only case */
+    if((e_trans_type != DST_4x4) && (1 == u1_num_coded_sblks)
+                    && (0 == ps_tu_sblk_coeff_data->u2_subblk_pos)
+                    && (1 == ps_tu_sblk_coeff_data->u2_sig_coeff_map))
+    {
+        *pu4_coeff_type = 1;
+
+        if(!trans_quant_bypass)
+        {
+            if(4 == trans_size)
+            {
+                IQUANT_4x4(iquant_out,
+                           ps_tu_sblk_coeff_data->ai2_level[0],
+                           pi2_dequant_matrix[0]
+                                           * g_ihevc_iquant_scales[qp_rem],
+                           shift_iq, qp_div);
+            }
+            else
+            {
+                IQUANT(iquant_out, ps_tu_sblk_coeff_data->ai2_level[0],
+                       pi2_dequant_matrix[0] * g_ihevc_iquant_scales[qp_rem],
+                       shift_iq, qp_div);
+            }
+            if(trans_skip)
+                iquant_out = (iquant_out + 16) >> 5;
+        }
+        else
+        {
+            /* Set the first column to zero */
+            for(i = 0; i < trans_size; i++)
+                *(pi2_tu_coeff + i * trans_size) = 0;
+
+            iquant_out = ps_tu_sblk_coeff_data->ai2_level[0];
+        }
+        *pi2_coeff_value = iquant_out;
+        *pi2_tu_coeff = iquant_out;
+        *pu4_zero_cols &= ~0x1;
+        *pu4_zero_rows &= ~0x1;
+        ps_tu_sblk_coeff_data =
+                        (void *)&ps_tu_sblk_coeff_data->ai2_level[1];
+
+        STATS_UPDATE_COEFF_COUNT();
+        STATS_LAST_SBLK_POS_UPDATE(e_trans_type, (trans_skip || trans_quant_bypass),  0, 0);
+        STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, (trans_quant_bypass || trans_skip));
+        return ((UWORD8 *)ps_tu_sblk_coeff_data);
+    }
+    else
+    {
+        *pu4_coeff_type = 0;
+        /* In case of trans skip, memset has already happened */
+        if(!trans_skip)
+            memset(pi2_tu_coeff, 0, trans_size * trans_size * sizeof(WORD16));
+    }
+
+    for(i = 0; i < u1_num_coded_sblks; i++)
+    {
+        UWORD32 u4_sig_coeff_map;
+        subblk_pos_x = ps_tu_sblk_coeff_data->u2_subblk_pos & 0x00FF;
+        subblk_pos_y = (ps_tu_sblk_coeff_data->u2_subblk_pos & 0xFF00) >> 8;
+
+        STATS_LAST_SBLK_POS_UPDATE(e_trans_type, (trans_skip || trans_quant_bypass), subblk_pos_x, subblk_pos_y);
+
+        subblk_pos_x = subblk_pos_x * MIN_TU_SIZE;
+        subblk_pos_y = subblk_pos_y * MIN_TU_SIZE;
+
+        pi2_sblk_ptr = pi2_tu_coeff + subblk_pos_y * trans_size
+                        + subblk_pos_x;
+
+        //*pu4_zero_cols &= ~(0xF << subblk_pos_x);
+
+        sblk_non_zero_coeff_idx = 0;
+        u4_sig_coeff_map = ps_tu_sblk_coeff_data->u2_sig_coeff_map;
+        //for(sblk_scan_idx = (31 - CLZ(u4_sig_coeff_map)); sblk_scan_idx >= 0; sblk_scan_idx--)
+        sblk_scan_idx = 31;
+        do
+        {
+            WORD32 clz = CLZ(u4_sig_coeff_map);
+
+            sblk_scan_idx -= clz;
+            /* When clz is 31, u4_sig_coeff_map << (clz + 1) would shift a 32-bit value by 32 bits, which is undefined behaviour */
+            /* Hence either use SHL, which handles this per platform, or shift in two stages as done here */
+            u4_sig_coeff_map = u4_sig_coeff_map << clz;
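+            /* e.g. if u4_sig_coeff_map == 0x1 (only the DC position coded), clz  */
+            /* is 31; the remaining 1-bit shift is applied at the bottom of the   */
+            /* loop so the total shift never reaches the operand width.           */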
+            /* Copying coeffs and storing in reverse order */
+            {
+                STATS_UPDATE_COEFF_COUNT();
+                coeff_raster_idx =
+                                gau1_ihevc_invscan4x4[u1_scan_type][sblk_scan_idx];
+
+                xs = coeff_raster_idx & 0x3;
+                ys = coeff_raster_idx >> 2;
+
+                if(!trans_quant_bypass)
+                {
+                    if(4 == trans_size)
+                    {
+                        IQUANT_4x4(iquant_out,
+                                   ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx],
+                                   pi2_dequant_matrix[(subblk_pos_x + xs)
+                                                   + (subblk_pos_y + ys)
+                                                   * trans_size]
+                                   * g_ihevc_iquant_scales[qp_rem],
+                                   shift_iq, qp_div);
+                        sblk_non_zero_coeff_idx++;
+                    }
+                    else
+                    {
+                        IQUANT(iquant_out,
+                               ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx],
+                               pi2_dequant_matrix[(subblk_pos_x + xs)
+                                               + (subblk_pos_y + ys)
+                                               * trans_size]
+                               * g_ihevc_iquant_scales[qp_rem],
+                               shift_iq, qp_div);
+                        sblk_non_zero_coeff_idx++;
+                    }
+
+                    if(trans_skip)
+                        iquant_out = (iquant_out + 16) >> 5;
+                }
+                else
+                {
+                    iquant_out = ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx++];
+                }
+                *pu4_zero_cols &= ~(0x1 << (subblk_pos_x + xs));
+                *pu4_zero_rows &= ~(0x1 << (subblk_pos_y + ys));
+                *(pi2_sblk_ptr + xs + ys * trans_size) = iquant_out;
+            }
+            sblk_scan_idx--;
+            u4_sig_coeff_map <<= 1;
+
+        }while(u4_sig_coeff_map);
+        /* Updating the sblk pointer */
+        ps_tu_sblk_coeff_data =
+                        (void *)&ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx];
+    }
+
+    STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, (trans_quant_bypass || trans_skip));
+
+    pu1_new_tu_coeff_data = (UWORD8 *)ps_tu_sblk_coeff_data;
+
+    return pu1_new_tu_coeff_data;
+}
+
+WORD32 ihevcd_get_intra_nbr_flag(process_ctxt_t *ps_proc,
+                                 tu_t *ps_tu,
+                                 UWORD32 *pu4_intra_nbr_avail,
+                                 WORD16 i2_pic_width_in_luma_samples,
+                                 UWORD8 i1_constrained_intra_pred_flag,
+                                 WORD32 trans_size,
+                                 WORD32 ctb_size)
+{
+    sps_t *ps_sps;
+    UWORD8 u1_bot_lt_avail, u1_left_avail, u1_top_avail, u1_top_rt_avail,
+                    u1_top_lt_avail;
+    WORD32 x_cur, y_cur, x_nbr, y_nbr;
+    UWORD8 *pu1_nbr_intra_flag;
+    UWORD8 *pu1_pic_intra_flag;
+    UWORD8 top_right, top, top_left, left, bot_left;
+    WORD32 intra_pos;
+    WORD32 num_8_blks, num_8_blks_in_bits;
+    WORD32 numbytes_row = (i2_pic_width_in_luma_samples + 63) / 64;
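+    /* pu1_pic_intra_flag holds 1 bit per 8x8 luma block, so one byte covers 64  */
+    /* luma pixels horizontally and a picture row of flags takes                 */
+    /* (width + 63) / 64 bytes.                                                  */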
+    WORD32 cur_x, cur_y;
+    WORD32 i;
+    WORD32 nbr_flags;
+
+    ps_sps = ps_proc->ps_sps;
+    cur_x = ps_tu->b4_pos_x;
+    cur_y = ps_tu->b4_pos_y;
+
+    u1_bot_lt_avail = (pu4_intra_nbr_avail[1 + cur_y + trans_size / MIN_TU_SIZE]
+                    >> (31 - (1 + cur_x - 1))) & 1;
+    u1_left_avail = (pu4_intra_nbr_avail[1 + cur_y] >> (31 - (1 + cur_x - 1)))
+                    & 1;
+    u1_top_avail = (pu4_intra_nbr_avail[1 + cur_y - 1] >> (31 - (1 + cur_x)))
+                    & 1;
+    u1_top_rt_avail = (pu4_intra_nbr_avail[1 + cur_y - 1]
+                    >> (31 - (1 + cur_x + trans_size / MIN_TU_SIZE))) & 1;
+    u1_top_lt_avail = (pu4_intra_nbr_avail[1 + cur_y - 1]
+                    >> (31 - (1 + cur_x - 1))) & 1;
+
+#if DEBUG_PRINT_IQ_IT_RECON
+    printf(" Before constrained intra pred. BL:%d,L:%d,T:%d,TR:%d,TL:%d\n", u1_bot_lt_avail, u1_left_avail, u1_top_avail, u1_top_rt_avail, u1_top_lt_avail);
+#endif
+    x_cur = ps_proc->i4_ctb_x * ctb_size + cur_x * MIN_TU_SIZE;
+    y_cur = ps_proc->i4_ctb_y * ctb_size + cur_y * MIN_TU_SIZE;
+
+    pu1_pic_intra_flag = ps_proc->pu1_pic_intra_flag;
+
+    /* WORD32 nbr_flags as below  MSB --> LSB */
+    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
+     *       1         4         4     4         4
+     */
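+    /* e.g. with every neighbour available the packed value is 0x1FFFF (all 17   */
+    /* flag bits set), which is the fast path checked before reference           */
+    /* substitution further below.                                               */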
+    bot_left = 0;
+    left = 0;
+    top_right = 0;
+    top = 0;
+    top_left = 0;
+
+    num_8_blks = trans_size > 4 ? trans_size / 8 : 1;
+    num_8_blks_in_bits = ((1 << num_8_blks) - 1);
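+    /* Illustration: for trans_size = 16, num_8_blks = 2 and num_8_blks_in_bits  */
+    /* = 0x3, i.e. one availability bit per 8x8 neighbour block along the edge.  */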
+
+    if(i1_constrained_intra_pred_flag)
+    {
+        /* TODO: constrained intra pred not tested */
+        if(u1_bot_lt_avail)
+        {
+            x_nbr = x_cur - 1;
+            y_nbr = y_cur + trans_size;
+
+            pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
+                            + x_nbr / 64;
+            intra_pos = ((x_nbr / 8) % 8);
+            for(i = 0; i < num_8_blks; i++)
+            {
+                bot_left |= ((*(pu1_nbr_intra_flag + i * numbytes_row)
+                                >> intra_pos) & 1) << i;
+            }
+            bot_left &= num_8_blks_in_bits;
+        }
+        if(u1_left_avail)
+        {
+            x_nbr = x_cur - 1;
+            y_nbr = y_cur;
+
+            pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
+                            + x_nbr / 64;
+            intra_pos = ((x_nbr / 8) % 8);
+
+            for(i = 0; i < num_8_blks; i++)
+            {
+                left |= ((*(pu1_nbr_intra_flag + i * numbytes_row) >> intra_pos)
+                                & 1) << i;
+            }
+            left &= num_8_blks_in_bits;
+        }
+        if(u1_top_avail)
+        {
+            x_nbr = x_cur;
+            y_nbr = y_cur - 1;
+
+            pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
+                            + x_nbr / 64;
+            intra_pos = ((x_nbr / 8) % 8);
+
+            top = (*pu1_nbr_intra_flag >> intra_pos);
+            top &= num_8_blks_in_bits;
+            /*
+             for(i=0;i<num_8_blks;i++)
+             {
+             top |= ( (*pu1_nbr_intra_flag >> (intra_pos+i)) & 1) << i;
+             }
+             */
+        }
+        if(u1_top_rt_avail)
+        {
+            x_nbr = x_cur + trans_size;
+            y_nbr = y_cur - 1;
+
+            pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
+                            + x_nbr / 64;
+            intra_pos = ((x_nbr / 8) % 8);
+
+            top_right = (*pu1_nbr_intra_flag >> intra_pos);
+            top_right &= num_8_blks_in_bits;
+            /*
+             for(i=0;i<num_8_blks;i++)
+             {
+             top_right |= ( (*pu1_nbr_intra_flag >> (intra_pos+i)) & 1) << i;
+             }
+             */
+        }
+        if(u1_top_lt_avail)
+        {
+            x_nbr = x_cur - 1;
+            y_nbr = y_cur - 1;
+
+            pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
+                            + x_nbr / 64;
+            intra_pos = ((x_nbr / 8) % 8);
+
+            top_left = (*pu1_nbr_intra_flag >> intra_pos) & 1;
+        }
+    }
+    else
+    {
+        if(u1_top_avail)
+            top = 0xF;
+        if(u1_top_rt_avail)
+            top_right = 0xF;
+        if(u1_bot_lt_avail)
+            bot_left = 0xF;
+        if(u1_left_avail)
+            left = 0xF;
+        if(u1_top_lt_avail)
+            top_left = 0x1;
+    }
+
+    /* Handling incomplete CTBs */
+    {
+        WORD32 pu_size_limit = MIN(trans_size, 8);
+        WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples
+                        - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size)
+                        - (ps_tu->b4_pos_x * MIN_TU_SIZE)
+                        - (1 << (ps_tu->b3_size + 2));
+        /* ctb_size_top gives number of valid pixels remaining in the current row */
+        WORD32 ctb_size_top = MIN(ctb_size, cols_remaining);
+        WORD32 ctb_size_top_bits = (1 << (ctb_size_top / pu_size_limit)) - 1;
+
+        WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples
+                        - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size)
+                        - (ps_tu->b4_pos_y * MIN_TU_SIZE)
+                        - (1 << (ps_tu->b3_size + 2));
+        /* ctb_size_bot gives number of valid pixels remaining in the current column */
+        WORD32 ctb_size_bot = MIN(ctb_size, rows_remaining);
+        WORD32 ctb_size_bot_bits = (1 << (ctb_size_bot / pu_size_limit)) - 1;
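+        /* e.g. for a 32x32 TU with only 16 valid rows left below it in the      */
+        /* picture, ctb_size_bot = 16 and ctb_size_bot_bits = (1 << 2) - 1 = 0x3 */
+        /* (pu_size_limit is 8), pruning the out-of-picture bottom-left flags.   */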
+
+        top_right &= ctb_size_top_bits;
+        bot_left &= ctb_size_bot_bits;
+    }
+
+    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
+     *      1         4         4     4         4
+     */
+#if DEBUG_PRINT_IQ_IT_RECON
+    printf(" After constrained intra pred. BL:%d,L:%d,T:%d,TR:%d,TL:%d\n", bot_left, left, top, top_right, top_left);
+#endif
+
+    /*
+     nbr_flags = (top_left << 16) | (gau4_ihevcd_4_bit_reverse[top_right] << 12) | (gau4_ihevcd_4_bit_reverse[top] << 8) | (gau4_ihevcd_4_bit_reverse[left] << 4)
+     | gau4_ihevcd_4_bit_reverse[bot_left];
+     */
+    nbr_flags = (top_left << 16) | (top_right << 12) | (top << 8) | (gau4_ihevcd_4_bit_reverse[left] << 4)
+                    | gau4_ihevcd_4_bit_reverse[bot_left];
+
+#if DEBUG_PRINT_IQ_IT_RECON
+    printf("\n Luma nbr flags = %d", nbr_flags);
+#endif
+
+    return nbr_flags;
+
+}
+#if 0
+void ihevcd_itrans_recon_one_coeff(WORD16 *pi2_tmp,
+                                   UWORD8 *pu1_pred,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 pred_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 log2_trans_size,
+                                   TRANSFORM_TYPE trans_type,
+                                   WORD32 coeff_x,
+                                   WORD32 coeff_y,
+                                   WORD16 i2_coeff_value,
+                                   WORD32 is_luma)
+{
+    WORD32 x, y;
+    WORD32 row, col;
+    WORD32 add, shift;
+    WORD32 quant_out;
+    WORD32 trans_size;
+    WORD16 *pi2_trans_table;
+    WORD32 trans_table_idx;
+    WORD32 itrans_out;
+    WORD32 col_mult = (is_luma == 1) ? 1 : 2;
+
+    x = coeff_x;
+    y = coeff_y;
+    trans_size = (1 << log2_trans_size);
+
+    if(DST_4x4 == trans_type)
+    {
+        trans_table_idx = 0;
+    }
+    else
+    {
+        trans_table_idx = log2_trans_size - 2 + 1;
+    }
+    pi2_trans_table = (WORD16 *)g_ai2_ihevc_trans_tables[trans_table_idx];
+
+    quant_out = i2_coeff_value;
+
+    shift = IT_SHIFT_STAGE_1;
+    add = 1 << (shift - 1);
+    /* Multiply transform table values in the yth row with quant_out and store them in a temporary buffer */
+    for(col = 0; col < trans_size; col++)
+    {
+        pi2_tmp[col] = CLIP_S16(
+                        (quant_out * pi2_trans_table[y * trans_size + col] + add) >> shift);
+    }
+
+    shift = IT_SHIFT_STAGE_2;
+    add = 1 << (shift - 1);
+
+    /* Multiply transform table values in the xth row with each value in the temporary buffer */
+    for(row = 0; row < trans_size; row++)
+    {
+        for(col = 0; col < trans_size; col++)
+        {
+            itrans_out = CLIP_S16(
+                            (pi2_tmp[row] * pi2_trans_table[x * trans_size + col] + add)
+                                            >> shift);
+            pu1_dst[row * dst_strd + col * col_mult] = CLIP_U8((pu1_pred[row * pred_strd + col * col_mult] + itrans_out));
+        }
+    }
+}
+#endif
+
+WORD32 ihevcd_iquant_itrans_recon_ctb(process_ctxt_t *ps_proc)
+{
+    WORD16 *pi2_scaling_mat;
+    UWORD8 *pu1_y_dst_ctb;
+    UWORD8 *pu1_uv_dst_ctb;
+    WORD32 ctb_size;
+    codec_t *ps_codec;
+    slice_header_t *ps_slice_hdr;
+    tu_t *ps_tu;
+    WORD16 *pi2_ctb_coeff;
+    WORD32 tu_cnt;
+    WORD16 *pi2_tu_coeff;
+    WORD16 *pi2_tmp;
+    WORD32 pic_strd;
+    WORD32 luma_nbr_flags;
+    WORD32 chroma_nbr_flags = 0;
+    UWORD8 u1_luma_pred_mode_first_tu = 0;
+    /* Pointers for generating 2d coeffs from coeff-map */
+    UWORD8 *pu1_tu_coeff_data;
+    /* Neighbour availability map for the CTB */
+    /* The MSB of each entry holds the neighbouring-CTB flag (left/top-left/bottom-left) */
+    /* The 1st TU starts at the 2nd bit from the MSB of the 2nd array entry, followed by one bit per min TU in the CTB */
+    UWORD32 au4_intra_nbr_avail[MAX_CTB_SIZE / MIN_TU_SIZE
+                    + 2 /* Top nbr + bot nbr */];
+    UWORD32 top_avail_bits;
+    sps_t *ps_sps;
+    pps_t *ps_pps;
+    WORD32 intra_flag;
+    UWORD8 *pu1_pic_intra_flag;
+    /*************************************************************************/
+    /* Contains scaling matrix offsets in the following order in a 1D buffer */
+    /* Intra 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
+    /* Inter 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
+    /* Intra 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
+    /* Inter 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
+    /* Intra 16x16 Y, 16x16 U, 16x16 V                                       */
+    /* Inter 16x16 Y, 16x16 U, 16x16 V                                       */
+    /* Intra 32x32 Y                                                         */
+    /* Inter 32x32 Y                                                         */
+    /*************************************************************************/
+    WORD32 scaling_mat_offset[] =
+      { 0, 16, 32, 48, 64, 80, 96, 160, 224, 288, 352, 416, 480, 736, 992,
+        1248, 1504, 1760, 2016, 3040 };
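+    /* Example of the layout above: entry 6 (Intra 8x8 Y) is offset 96; an 8x8   */
+    /* matrix holds 64 coefficients, so the next entry (Intra 8x8 U) sits at     */
+    /* 96 + 64 = 160.                                                            */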
+
+    PROFILE_DISABLE_IQ_IT_RECON_INTRA_PRED();
+
+    ps_sps = ps_proc->ps_sps;
+    ps_pps = ps_proc->ps_pps;
+    ps_slice_hdr = ps_proc->ps_slice_hdr;
+    ps_codec = ps_proc->ps_codec;
+
+    pu1_y_dst_ctb = ps_proc->pu1_cur_ctb_luma;
+    pu1_uv_dst_ctb = ps_proc->pu1_cur_ctb_chroma;
+
+    pi2_ctb_coeff = ps_proc->pi2_invscan_out;
+
+    ctb_size = (1 << ps_sps->i1_log2_ctb_size);
+    pu1_tu_coeff_data = (UWORD8 *)ps_proc->pv_tu_coeff_data;
+
+    pic_strd = ps_codec->i4_strd;
+
+    pi2_tmp = ps_proc->pi2_itrans_intrmd_buf;
+
+    pi2_tu_coeff = pi2_ctb_coeff;
+
+    ps_tu = ps_proc->ps_tu;
+
+    if((1 == ps_sps->i1_scaling_list_enable_flag) && (1 == ps_pps->i1_pps_scaling_list_data_present_flag))
+    {
+        pi2_scaling_mat = ps_pps->pi2_scaling_mat;
+    }
+    else
+    {
+        pi2_scaling_mat = ps_sps->pi2_scaling_mat;
+    }
+
+    {
+        /* Updating the initial availability map */
+        WORD32 i;
+        UWORD8 u1_left_ctb_avail, u1_top_lt_ctb_avail, u1_top_rt_ctb_avail,
+                        u1_top_ctb_avail;
+
+        u1_left_ctb_avail = ps_proc->u1_left_ctb_avail;
+        u1_top_lt_ctb_avail = ps_proc->u1_top_lt_ctb_avail;
+        u1_top_ctb_avail = ps_proc->u1_top_ctb_avail;
+        u1_top_rt_ctb_avail = ps_proc->u1_top_rt_ctb_avail;
+
+        /* Initializing the availability array */
+        memset(au4_intra_nbr_avail, 0,
+               (MAX_CTB_SIZE / MIN_TU_SIZE + 2) * sizeof(UWORD32));
+        /* Initializing the availability array with CTB level availability flags */
+        {
+            WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size);
+            WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
+            for(i = 0; i < ctb_size_left / MIN_TU_SIZE; i++)
+            {
+                au4_intra_nbr_avail[i + 1] = ((UWORD32)u1_left_ctb_avail << 31);
+            }
+        }
+        au4_intra_nbr_avail[0] |= (((UWORD32)u1_top_rt_ctb_avail << 31)
+                        >> (1 + ctb_size / MIN_TU_SIZE)); /* bit at position (1 + ctb_size/MIN_TU_SIZE) from the MSB */
+
+        au4_intra_nbr_avail[0] |= ((UWORD32)u1_top_lt_ctb_avail << 31);
+
+        {
+            WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size);
+            WORD32 ctb_size_top = MIN(ctb_size, cols_remaining);
+            WORD32 shift = (31 - (ctb_size / MIN_TU_SIZE));
+
+            /* ctb_size_top gives number of valid pixels remaining in the current row */
+            /* Since we need pattern of 1's starting from the MSB, an additional shift */
+            /* is needed */
+            shift += ((ctb_size - ctb_size_top) / MIN_TU_SIZE);
+
+            top_avail_bits = ((1 << (ctb_size_top / MIN_TU_SIZE)) - 1)
+                            << shift;
+        }
+        au4_intra_nbr_avail[0] |= (
+                        (u1_top_ctb_avail == 1) ? top_avail_bits : 0x0);
+        /* Bits 2 through (1 + ctb_size/MIN_TU_SIZE) from the MSB are set to 1 if the top CTB is available, else 0 */
+
+    }
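+    /* Illustration of au4_intra_nbr_avail[0] for a 64x64 CTB                    */
+    /* (ctb_size/MIN_TU_SIZE = 16): bit 31 = top-left CTB flag, bits 30..15 =    */
+    /* per-min-TU top availability, bit 14 = top-right CTB flag.                 */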
+
+    /* Applying Inverse transform on all the TU's in CTB */
+    for(tu_cnt = 0; tu_cnt < ps_proc->i4_ctb_tu_cnt; tu_cnt++, ps_tu++)
+    {
+        WORD32 transform_skip_flag = 0;
+        WORD32 transform_skip_flag_v = 0;
+        WORD32 num_comp, c_idx, func_idx;
+        WORD32 src_strd, pred_strd, dst_strd;
+        WORD32 qp_div = 0, qp_rem = 0;
+        WORD32 qp_div_v = 0, qp_rem_v = 0;
+        UWORD32 zero_cols = 0, zero_cols_v = 0;
+        UWORD32 zero_rows = 0, zero_rows_v = 0;
+        UWORD32 coeff_type = 0, coeff_type_v = 0;
+        WORD16 i2_coeff_value, i2_coeff_value_v;
+        WORD32 trans_size = 0;
+        TRANSFORM_TYPE e_trans_type;
+        WORD32 log2_y_trans_size_minus_2, log2_uv_trans_size_minus_2;
+        WORD32 log2_trans_size;
+        WORD32 chroma_qp_idx;
+        WORD16 *pi2_src = NULL, *pi2_src_v = NULL;
+        UWORD8 *pu1_pred = NULL, *pu1_pred_v = NULL;
+        UWORD8 *pu1_dst = NULL, *pu1_dst_v = NULL;
+        WORD16 *pi2_dequant_matrix = NULL, *pi2_dequant_matrix_v = NULL;
+        WORD32 tu_x, tu_y;
+        WORD32 tu_y_offset, tu_uv_offset;
+        WORD8 i1_chroma_pic_qp_offset, i1_chroma_slice_qp_offset;
+        UWORD8 u1_cbf = 0, u1_cbf_v = 0, u1_luma_pred_mode, u1_chroma_pred_mode;
+        WORD32 luma_nbr_flags_4x4[4];
+        WORD32 offset;
+        WORD32 pcm_flag;
+        WORD32  chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
+        /* If the chroma format is 420SP_VU, the pred and dst   */
+        /* pointers are offset by +1 so that they point to U    */
+        WORD32 chroma_yuv420sp_vu_u_offset = 1 * chroma_yuv420sp_vu;
+        /* If the chroma format is 420SP_VU, the pred and dst   */
+        /* pointers get the +1 U offset and then move back by 2 */
+        /* so that they point to V                              */
+        WORD32 chroma_yuv420sp_vu_v_offset = -2 * chroma_yuv420sp_vu;
+
+        tu_x = ps_tu->b4_pos_x * 4; /* Converting minTU unit to pixel unit */
+        tu_y = ps_tu->b4_pos_y * 4; /* Converting minTU unit to pixel unit */
+        {
+            WORD32 tu_abs_x = (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size) + (tu_x);
+            WORD32 tu_abs_y = (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size) + (tu_y);
+
+            WORD32 numbytes_row =  (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+
+            pu1_pic_intra_flag = ps_proc->pu1_pic_intra_flag;
+            pu1_pic_intra_flag += (tu_abs_y >> 3) * numbytes_row;
+            pu1_pic_intra_flag += (tu_abs_x >> 6);
+
+            intra_flag = *pu1_pic_intra_flag;
+            intra_flag &= (1 << ((tu_abs_x >> 3) % 8));
+        }
+
+#if DEBUG_PRINT_IQ_IT_RECON
+        printf("\n tu_x = %d", tu_x);
+        printf("\n tu_y = %d", tu_y);
+#endif
+
+        u1_luma_pred_mode = ps_tu->b6_luma_intra_mode;
+        u1_chroma_pred_mode = ps_tu->b3_chroma_intra_mode_idx;
+
+        if(u1_chroma_pred_mode != 7)
+            num_comp = 2; /* Y and UV */
+        else
+            num_comp = 1; /* Y */
+
+
+        pcm_flag = 0;
+
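+        /* A PCM-coded CU is identified by intra_flag together with a luma pred  */
+        /* mode of INTRA_PRED_NONE; its luma and interleaved chroma samples are  */
+        /* copied straight from the coeff data buffer below, with no transform   */
+        /* or prediction.                                                        */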
+        if((intra_flag) && (u1_luma_pred_mode == INTRA_PRED_NONE))
+        {
+            UWORD8 *pu1_buf;
+            UWORD8 *pu1_y_dst = pu1_y_dst_ctb;
+            UWORD8 *pu1_uv_dst = pu1_uv_dst_ctb;
+            WORD32 i, j;
+            tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
+            WORD32 cb_size = 1 << (ps_tu->b3_size + 2);
+
+            /* trans_size is used to update availability after reconstruction */
+            trans_size = cb_size;
+
+            pcm_flag = 1;
+
+            tu_y_offset = tu_x + tu_y * pic_strd;
+            pu1_y_dst += tu_x + tu_y * pic_strd;
+            pu1_uv_dst += tu_x + (tu_y >> 1) * pic_strd;
+
+            /* The first byte holds the number of coded sub-blocks */
+            pu1_tu_coeff_data++;
+
+            /* The next byte holds the scan type */
+            pu1_tu_coeff_data++;
+
+            ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)pu1_tu_coeff_data;
+
+            pu1_buf = (UWORD8 *)&ps_tu_sblk_coeff_data->ai2_level[0];
+            {
+
+                for(i = 0; i < cb_size; i++)
+                {
+                    //pu1_y_dst[i * pic_strd + j] = *pu1_buf++;
+                    memcpy(&pu1_y_dst[i * pic_strd], pu1_buf, cb_size);
+                    pu1_buf += cb_size;
+                }
+
+                pu1_uv_dst = pu1_uv_dst + chroma_yuv420sp_vu_u_offset;
+
+                /* U */
+                for(i = 0; i < cb_size / 2; i++)
+                {
+                    for(j = 0; j < cb_size / 2; j++)
+                    {
+                        pu1_uv_dst[i * pic_strd + 2 * j] = *pu1_buf++;
+                    }
+                }
+
+                pu1_uv_dst = pu1_uv_dst + 1 + chroma_yuv420sp_vu_v_offset;
+
+                /* V */
+                for(i = 0; i < cb_size / 2; i++)
+                {
+                    for(j = 0; j < cb_size / 2; j++)
+                    {
+                        pu1_uv_dst[i * pic_strd + 2 * j] = *pu1_buf++;
+                    }
+                }
+            }
+
+            pu1_tu_coeff_data = pu1_buf;
+
+        }
+
+        for(c_idx = 0; c_idx < num_comp; c_idx++)
+        {
+            if(0 == pcm_flag)
+            {
+                /* Initializing variables */
+                pred_strd = pic_strd;
+                dst_strd = pic_strd;
+
+                if(c_idx == 0) /* Y */
+                {
+                    log2_y_trans_size_minus_2 = ps_tu->b3_size;
+                    trans_size = 1 << (log2_y_trans_size_minus_2 + 2);
+                    log2_trans_size = log2_y_trans_size_minus_2 + 2;
+
+                    tu_y_offset = tu_x + tu_y * pic_strd;
+
+                    pi2_src = pi2_tu_coeff;
+                    pu1_pred = pu1_y_dst_ctb + tu_y_offset;
+                    pu1_dst = pu1_y_dst_ctb + tu_y_offset;
+
+                    /* Calculating scaling matrix offset */
+                    offset = log2_y_trans_size_minus_2 * 6
+                                    + (!intra_flag)
+                                    * ((log2_y_trans_size_minus_2
+                                                    == 3) ? 1 : 3)
+                                    + c_idx;
+                    pi2_dequant_matrix = pi2_scaling_mat
+                                    + scaling_mat_offset[offset];
+
+                    src_strd = trans_size;
+
+                    /* The 4x4 luma transform in intra mode is the DST */
+                    if(log2_y_trans_size_minus_2 == 0 && intra_flag)
+                    {
+                        func_idx = log2_y_trans_size_minus_2;
+                        e_trans_type = DST_4x4;
+                    }
+                    else
+                    {
+                        func_idx = log2_y_trans_size_minus_2 + 1;
+                        e_trans_type = (TRANSFORM_TYPE)(log2_y_trans_size_minus_2 + 1);
+                    }
+
+                    qp_div = ps_tu->b7_qp / 6;
+                    qp_rem = ps_tu->b7_qp % 6;
+
+                    u1_cbf = ps_tu->b1_y_cbf;
+
+                    transform_skip_flag = pu1_tu_coeff_data[1] & 1;
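+                    /* Bit 0 of the scan-type byte (the 2nd byte of the TU coeff */
+                    /* data) carries transform skip; it is peeked here since     */
+                    /* ihevcd_unpack_coeffs() consumes those header bytes.       */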
+                    /* Unpacking coeffs */
+                    if(1 == u1_cbf)
+                    {
+                        pu1_tu_coeff_data = ihevcd_unpack_coeffs(
+                                        pi2_src, log2_y_trans_size_minus_2 + 2,
+                                        pu1_tu_coeff_data, pi2_dequant_matrix,
+                                        qp_rem, qp_div, e_trans_type,
+                                        ps_tu->b1_transquant_bypass, &zero_cols,
+                                        &zero_rows, &coeff_type,
+                                        &i2_coeff_value);
+                    }
+
+#if DEBUG_PRINT_IQ_IT_RECON
+                    printf("\nLuma Coeff \n");
+                    print_coeff(pi2_src, trans_size);
+#endif
+                }
+                else /* UV interleaved */
+                {
+                    /* Chroma: if the transform size is 4x4, keep 4x4; else transform (trans_size/2 x trans_size/2) */
+                    if(ps_tu->b3_size == 0)
+                    {
+                        /* A chroma 4x4 block accompanies the 4th luma 4x4 block; for this case the chroma position is (luma_pos_x - 4, luma_pos_y - 4) */
+                        log2_uv_trans_size_minus_2 = ps_tu->b3_size;
+                        tu_uv_offset = (tu_x - 4) + ((tu_y - 4) / 2) * pic_strd;
+                    }
+                    else
+                    {
+                        log2_uv_trans_size_minus_2 = ps_tu->b3_size - 1;
+                        tu_uv_offset = tu_x + (tu_y >> 1) * pic_strd;
+                    }
+                    trans_size = 1 << (log2_uv_trans_size_minus_2 + 2);
+                    log2_trans_size = log2_uv_trans_size_minus_2 + 2;
+
+                    pi2_src = pi2_tu_coeff;
+                    pi2_src_v = pi2_tu_coeff + trans_size * trans_size;
+                    pu1_pred = pu1_uv_dst_ctb + tu_uv_offset + chroma_yuv420sp_vu_u_offset; /* Pointing to start byte of U*/
+                    pu1_pred_v = pu1_pred + 1 + chroma_yuv420sp_vu_v_offset; /* Pointing to start byte of V*/
+                    pu1_dst = pu1_uv_dst_ctb + tu_uv_offset + chroma_yuv420sp_vu_u_offset; /* Pointing to start byte of U*/
+                    pu1_dst_v = pu1_dst + 1 + chroma_yuv420sp_vu_v_offset; /* Pointing to start byte of V*/
+
+                    /*TODO: Add support for choosing different tables for U and V,
+                     * change this to a single array to handle flat/default/custom, intra/inter, luma/chroma and various sizes
+                     */
+                    /* Calculating scaling matrix offset */
+                    /* ((log2_uv_trans_size_minus_2 == 3) ? 1:3) condition check is not needed, since
+                     * max uv trans size is 16x16
+                     */
+                    offset = log2_uv_trans_size_minus_2 * 6
+                                    + (!intra_flag) * 3 + c_idx;
+                    pi2_dequant_matrix = pi2_scaling_mat
+                                    + scaling_mat_offset[offset];
+                    pi2_dequant_matrix_v = pi2_scaling_mat
+                                    + scaling_mat_offset[offset + 1];
+
+                    src_strd = trans_size;
+
+                    func_idx = 1 + 4 + log2_uv_trans_size_minus_2; /* DST func + Y funcs + cur func index*/
+                    e_trans_type = (TRANSFORM_TYPE)(log2_uv_trans_size_minus_2 + 1);
+                    /* QP for U */
+                    i1_chroma_pic_qp_offset = ps_pps->i1_pic_cb_qp_offset;
+                    i1_chroma_slice_qp_offset = ps_slice_hdr->i1_slice_cb_qp_offset;
+                    u1_cbf = ps_tu->b1_cb_cbf;
+
+                    chroma_qp_idx = ps_tu->b7_qp + i1_chroma_pic_qp_offset
+                                    + i1_chroma_slice_qp_offset;
+                    chroma_qp_idx = CLIP3(chroma_qp_idx, 0, 57);
+                    qp_div = gai2_ihevcd_chroma_qp[chroma_qp_idx] / 6;
+                    qp_rem = gai2_ihevcd_chroma_qp[chroma_qp_idx] % 6;
+
+                    /* QP for V */
+                    i1_chroma_pic_qp_offset = ps_pps->i1_pic_cr_qp_offset;
+                    i1_chroma_slice_qp_offset = ps_slice_hdr->i1_slice_cr_qp_offset;
+                    u1_cbf_v = ps_tu->b1_cr_cbf;
+
+                    chroma_qp_idx = ps_tu->b7_qp + i1_chroma_pic_qp_offset
+                                    + i1_chroma_slice_qp_offset;
+                    chroma_qp_idx = CLIP3(chroma_qp_idx, 0, 57);
+                    qp_div_v = gai2_ihevcd_chroma_qp[chroma_qp_idx] / 6;
+                    qp_rem_v = gai2_ihevcd_chroma_qp[chroma_qp_idx] % 6;
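+                    /* For both Cb and Cr the QP index is clipped to [0, 57] and */
+                    /* mapped through gai2_ihevcd_chroma_qp before being split   */
+                    /* into divisor (qp/6) and remainder (qp%6) for dequant.     */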
+
+                    /* Unpacking coeffs */
+                    transform_skip_flag = pu1_tu_coeff_data[1] & 1;
+                    if(1 == u1_cbf)
+                    {
+                        pu1_tu_coeff_data = ihevcd_unpack_coeffs(
+                                        pi2_src, log2_uv_trans_size_minus_2 + 2,
+                                        pu1_tu_coeff_data, pi2_dequant_matrix,
+                                        qp_rem, qp_div, e_trans_type,
+                                        ps_tu->b1_transquant_bypass, &zero_cols,
+                                        &zero_rows, &coeff_type,
+                                        &i2_coeff_value);
+                    }
+#if DEBUG_PRINT_IQ_IT_RECON
+                    printf("\nChroma Coeff U \n");
+                    print_coeff(pi2_src, trans_size);
+#endif
+
+                    transform_skip_flag_v = pu1_tu_coeff_data[1] & 1;
+                    if(1 == u1_cbf_v)
+                    {
+                        pu1_tu_coeff_data = ihevcd_unpack_coeffs(
+                                        pi2_src_v, log2_uv_trans_size_minus_2 + 2,
+                                        pu1_tu_coeff_data, pi2_dequant_matrix_v,
+                                        qp_rem_v, qp_div_v, e_trans_type,
+                                        ps_tu->b1_transquant_bypass, &zero_cols_v,
+                                        &zero_rows_v, &coeff_type_v, &i2_coeff_value_v);
+                    }
+                }
+                /***************************************************************/
+                /******************  Intra Prediction **************************/
+                /***************************************************************/
+                if(intra_flag) /* Intra */
+                {
+                    UWORD8 au1_ref_sub_out[(MAX_TU_SIZE * 2 * 2) + 4];
+                    UWORD8 *pu1_top_left, *pu1_top, *pu1_left;
+                    WORD32 luma_pred_func_idx, chroma_pred_func_idx;
+
+                    /* Get the neighbour availability flags */
+                    /* Done for only Y */
+                    if(c_idx == 0)
+                    {
+                        /* Get neighbor availability for Y only */
+                        luma_nbr_flags = ihevcd_get_intra_nbr_flag(ps_proc,
+                                                                   ps_tu,
+                                                                   au4_intra_nbr_avail,
+                                                                   ps_sps->i2_pic_width_in_luma_samples,
+                                                                   ps_pps->i1_constrained_intra_pred_flag,
+                                                                   trans_size,
+                                                                   ctb_size);
+
+                        if(trans_size == 4)
+                            luma_nbr_flags_4x4[(ps_tu->b4_pos_x % 2) + (ps_tu->b4_pos_y % 2) * 2] = luma_nbr_flags;
+
+                        if((ps_tu->b4_pos_x % 2 == 0) && (ps_tu->b4_pos_y % 2 == 0))
+                        {
+                            chroma_nbr_flags = luma_nbr_flags;
+                        }
+
+                        /* Initializing nbr pointers */
+                        pu1_top = pu1_pred - pic_strd;
+                        pu1_left = pu1_pred - 1;
+                        pu1_top_left = pu1_pred - pic_strd - 1;
+
+                        /* call reference array substitution */
+                        if(luma_nbr_flags == 0x1ffff)
+                            ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr(
+                                            pu1_top_left,
+                                            pu1_top, pu1_left, pred_strd, trans_size, luma_nbr_flags, au1_ref_sub_out, 1);
+                        else
+                            ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr(
+                                            pu1_top_left,
+                                            pu1_top, pu1_left, pred_strd, trans_size, luma_nbr_flags, au1_ref_sub_out, 1);
+
+                        /* call reference filtering */
+                        ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr(
+                                        au1_ref_sub_out, trans_size,
+                                        au1_ref_sub_out,
+                                        u1_luma_pred_mode, ps_sps->i1_strong_intra_smoothing_enable_flag);
+
+                        /* use the look up to get the function idx */
+                        luma_pred_func_idx = g_i4_ip_funcs[u1_luma_pred_mode];
+
+                        /* call the intra prediction function */
+                        ps_codec->apf_intra_pred_luma[luma_pred_func_idx](au1_ref_sub_out, 1, pu1_pred, pred_strd, trans_size, u1_luma_pred_mode);
+#if DEBUG_PRINT_IQ_IT_RECON
+                        printf("\n Luma Pred mode = %d, qp = %d\n", u1_luma_pred_mode, qp_div * 6 + qp_rem);
+                        print_dst(pu1_pred, pred_strd, trans_size, 1);
+#endif
+                    }
+                    else
+                    {
+                        /* In case of yuv420sp_vu, prediction happens as usual.         */
+                        /* So point the pu1_pred pointer to original prediction pointer */
+                        UWORD8 *pu1_pred_orig = pu1_pred - chroma_yuv420sp_vu_u_offset;
+
+                        /*    Top-Left | Top-Right | Top | Left | Bottom-Left
+                         *      1         4         4     4         4
+                         *
+                         * Generating chroma_nbr_flags depending upon the transform size */
+                        if(ps_tu->b3_size == 0)
+                        {
+                            /* Take TL,T,L flags of First luma 4x4 block */
+                            chroma_nbr_flags = (luma_nbr_flags_4x4[0] & 0x10FF0);
+                            /* Take TR flags of Second luma 4x4 block */
+                            chroma_nbr_flags |= (luma_nbr_flags_4x4[1] & 0x0F000);
+                            /* Take BL flags of Third luma 4x4 block */
+                            chroma_nbr_flags |= (luma_nbr_flags_4x4[2] & 0x0000F);
+                        }
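+                        /* Cross-check with the layout above: 0x10FF0 keeps TL   */
+                        /* (bit 16), T (bits 11..8) and L (bits 7..4); 0x0F000   */
+                        /* keeps TR (bits 15..12); 0x0000F keeps BL (bits 3..0). */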
+
+#if DEBUG_PRINT_IQ_IT_RECON
+                        printf("\n Chroma nbr flags = %d", chroma_nbr_flags);
+#endif
+                        /* Initializing nbr pointers */
+                        pu1_top = pu1_pred_orig - pic_strd;
+                        pu1_left = pu1_pred_orig - 2;
+                        pu1_top_left = pu1_pred_orig - pic_strd - 2;
+
+                        /* Chroma pred  mode derivation from luma pred mode */
+                        {
+                            tu_t *ps_tu_tmp = ps_tu;
+                            while(!ps_tu_tmp->b1_first_tu_in_cu)
+                            {
+                                ps_tu_tmp--;
+                            }
+                            u1_luma_pred_mode_first_tu = ps_tu_tmp->b6_luma_intra_mode;
+                        }
+                        if(4 == u1_chroma_pred_mode)
+                            u1_chroma_pred_mode = u1_luma_pred_mode_first_tu;
+                        else
+                        {
+                            u1_chroma_pred_mode = gau1_intra_pred_chroma_modes[u1_chroma_pred_mode];
+
+                            if(u1_chroma_pred_mode ==
+                                                            u1_luma_pred_mode_first_tu)
+                            {
+                                u1_chroma_pred_mode = INTRA_ANGULAR(34);
+                            }
+                        }
+
+                        /* call the chroma reference array substitution */
+                        ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr(
+                                        pu1_top_left,
+                                        pu1_top, pu1_left, pic_strd, trans_size, chroma_nbr_flags, au1_ref_sub_out, 1);
+
+                        /* use the look up to get the function idx */
+                        chroma_pred_func_idx =
+                                        g_i4_ip_funcs[u1_chroma_pred_mode];
+
+                        /* call the intra prediction function */
+                        ps_codec->apf_intra_pred_chroma[chroma_pred_func_idx](au1_ref_sub_out, 1, pu1_pred_orig, pred_strd, trans_size, u1_chroma_pred_mode);
+#if DEBUG_PRINT_IQ_IT_RECON
+                        printf("\n Chroma U Pred mode = %d,qp = %d \n", u1_chroma_pred_mode, qp_div * 6 + qp_rem);
+                        print_dst(pu1_pred_orig, pred_strd, trans_size, 0);
+#endif
+                    }
+                }
+
+                /* Updating number of transform types */
+                STATS_UPDATE_ALL_TRANS(e_trans_type, c_idx);
+
+                /* IQ, IT and Recon for Y if c_idx == 0, and U if c_idx !=0 */
+                if(1 == u1_cbf)
+                {
+                    if(ps_tu->b1_transquant_bypass || transform_skip_flag)
+                    {
+                        /* Recon */
+                        ps_codec->apf_recon[func_idx](pi2_src, pu1_pred, pu1_dst,
+                                                      src_strd, pred_strd, dst_strd,
+                                                      zero_cols);
+                    }
+                    else
+                    {
+
+                        /* Updating the count of coded transform types (excluding transform skip and transquant bypass) */
+                        STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 0);
+
+                        /* iQuant , iTrans and Recon */
+                        if((0 == coeff_type))
+                        {
+                            ps_codec->apf_itrans_recon[func_idx](pi2_src, pi2_tmp,
+                                                                 pu1_pred, pu1_dst,
+                                                                 src_strd, pred_strd,
+                                                                 dst_strd, zero_cols,
+                                                                 zero_rows);
+                        }
+                        else /* DC only */
+                        {
+                            STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 1);
+                            ps_codec->apf_itrans_recon_dc[c_idx](pu1_pred, pu1_dst,
+                                                                 pred_strd, dst_strd,
+                                                                 log2_trans_size,
+                                                                 i2_coeff_value);
+                        }
+                    }
+                }
+#if DEBUG_PRINT_IQ_IT_RECON
+                printf("\n Recon data \n");
+                print_dst(pu1_dst, dst_strd, trans_size, !c_idx);
+#endif
+                /* IQ, IT and Recon for V */
+                if(c_idx != 0)
+                {
+#if DEBUG_PRINT_IQ_IT_RECON
+                    printf("\nChroma Coeff V \n");
+                    print_coeff(pi2_src_v, trans_size);
+                    printf("\n Chroma V Pred mode = %d,qp = %d \n",
+                           u1_chroma_pred_mode, qp_div_v * 6 + qp_rem_v);
+                    print_dst(pu1_pred + 1, dst_strd, trans_size, 0);
+#endif
+                    if(1 == u1_cbf_v)
+                    {
+                        if(ps_tu->b1_transquant_bypass || transform_skip_flag_v)
+                        {
+                            /* Recon */
+                            ps_codec->apf_recon[func_idx](pi2_src_v, pu1_pred_v,
+                                                          pu1_dst_v, src_strd,
+                                                          pred_strd, dst_strd,
+                                                          zero_cols_v);
+                        }
+                        else
+                        {
+                            /* Updating number of transform types */
+                            STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 0);
+
+                            /* iQuant , iTrans and Recon */
+                            if((0 == coeff_type_v))
+                            {
+                                ps_codec->apf_itrans_recon[func_idx](pi2_src_v,
+                                                                     pi2_tmp,
+                                                                     pu1_pred_v,
+                                                                     pu1_dst_v,
+                                                                     src_strd,
+                                                                     pred_strd,
+                                                                     dst_strd,
+                                                                     zero_cols_v,
+                                                                     zero_rows_v);
+                            }
+                            else  /* DC only */
+                            {
+                                STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 1);
+                                ps_codec->apf_itrans_recon_dc[c_idx](pu1_pred_v, pu1_dst_v,
+                                                                     pred_strd, dst_strd,
+                                                                     log2_trans_size,
+                                                                     i2_coeff_value_v);
+                            }
+                        }
+                    }
+#if DEBUG_PRINT_IQ_IT_RECON
+                    printf("\n Recon data \n");
+                    print_dst(pu1_dst + 1, dst_strd, trans_size, 0);
+#endif
+                }
+            }
+
+            /* Neighbor availability inside CTB */
+            /* 1 bit per 4x4: indicates whether that 4x4 block has been reconstructed (available) */
+            /* Used for neighbor availability in intra pred */
+            if(c_idx == 0)
+            {
+                WORD32 i;
+                WORD32 trans_in_min_tu;
+                UWORD32 cur_tu_in_bits;
+                UWORD32 cur_tu_avail_flag;
+
+                trans_in_min_tu = trans_size / MIN_TU_SIZE;
+                cur_tu_in_bits = (1 << trans_in_min_tu) - 1;
+                cur_tu_in_bits = cur_tu_in_bits << (32 - trans_in_min_tu);
+
+                cur_tu_avail_flag = cur_tu_in_bits >> (ps_tu->b4_pos_x + 1);
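+                /* e.g. a 16x16 TU gives trans_in_min_tu = 4: four mask bits     */
+                /* starting at bit 30 (bit 31 is reserved for the left-CTB flag) */
+                /* shifted right by the TU's x position in min TU units.         */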
+
+                for(i = 0; i < trans_in_min_tu; i++)
+                    au4_intra_nbr_avail[1 + ps_tu->b4_pos_y + i] |=
+                                    cur_tu_avail_flag;
+            }
+        }
+    }
+    ps_proc->pv_tu_coeff_data = pu1_tu_coeff_data;
+
+    return ps_proc->i4_ctb_tu_cnt;
+}
+
diff --git a/decoder/ihevcd_iquant_itrans_recon_ctb.h b/decoder/ihevcd_iquant_itrans_recon_ctb.h
new file mode 100644
index 0000000..fde647f
--- /dev/null
+++ b/decoder/ihevcd_iquant_itrans_recon_ctb.h
@@ -0,0 +1,67 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_iquant_itrans_recon_ctb.h
+ *
+ * @brief
+ *  Definitions related to inverse transform functions
+ *
+ * @author
+ *  Naveen S R
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVCD_IQUANT_ITRANS_RECON_CTB_H_
+#define _IHEVCD_IQUANT_ITRANS_RECON_CTB_H_
+
+#define MAX_NUM_IP_MODES        35
+
+typedef enum
+{
+    IP_FUNC_MODE_0 = 1,
+    IP_FUNC_MODE_1,
+    IP_FUNC_MODE_2,
+    IP_FUNC_MODE_3TO9,
+    IP_FUNC_MODE_10,
+    IP_FUNC_MODE_11TO17,
+    IP_FUNC_MODE_18_34,
+    IP_FUNC_MODE_19TO25,
+    IP_FUNC_MODE_26,
+    IP_FUNC_MODE_27TO33,
+
+    NUM_IP_FUNCS
+
+}IP_FUNCS_T;
+
+
+typedef enum
+{
+    DST_4x4, DCT_4x4, DCT_8x8, DCT_16x16, DCT_32x32, SKIP_64x64
+}TRANSFORM_TYPE;
+
+WORD32 ihevcd_iquant_itrans_recon_ctb(process_ctxt_t *ps_proc);
+
+#endif /* _IHEVCD_IQUANT_ITRANS_RECON_CTB_H_ */
diff --git a/decoder/ihevcd_itrans_recon_dc.c b/decoder/ihevcd_itrans_recon_dc.c
new file mode 100644
index 0000000..ae37e40
--- /dev/null
+++ b/decoder/ihevcd_itrans_recon_dc.c
@@ -0,0 +1,146 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_itrans_recon_dc.c
+ *
+ * @brief
+ *  Contains functions for DC inverse transform and reconstruction
+ *
+ * @author
+ *  Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_itrans_recon_dc_luma()
+ * - ihevcd_itrans_recon_dc_chroma()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_bitstream.h"
+#include "ihevc_common_tables.h"
+
+/* Intra pred includes */
+#include "ihevc_intra_pred.h"
+
+/* Inverse transform common module includes */
+#include "ihevc_trans_tables.h"
+#include "ihevc_trans_macros.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_recon.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_chroma_recon.h"
+
+/* Decoder includes */
+#include "ihevcd_common_tables.h"
+#include "ihevcd_iquant_itrans_recon_ctb.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_profile.h"
+#include "ihevcd_statistics.h"
+#include "ihevcd_itrans_recon_dc.h"
+
+
+
+void ihevcd_itrans_recon_dc_luma(UWORD8 *pu1_pred,
+                                 UWORD8 *pu1_dst,
+                                 WORD32 pred_strd,
+                                 WORD32 dst_strd,
+                                 WORD32 log2_trans_size,
+                                 WORD16 i2_coeff_value)
+{
+    WORD32 row, col;
+    WORD32 add, shift;
+    WORD32 dc_value, quant_out;
+    WORD32 trans_size;
+
+    trans_size = (1 << log2_trans_size);
+
+    quant_out = i2_coeff_value;
+
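+    /* The DC basis of the HEVC inverse transform is the constant 64, so a      */
+    /* DC-only block reduces to a single offset added to every predicted        */
+    /* sample; the two clipped stages mirror the intermediate precision of the  */
+    /* full two-stage inverse transform.                                        */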
+    shift = IT_SHIFT_STAGE_1;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+    shift = IT_SHIFT_STAGE_2;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+    for(row = 0; row < trans_size; row++)
+        for(col = 0; col < trans_size; col++)
+            pu1_dst[row * dst_strd + col] = CLIP_U8((pu1_pred[row * pred_strd + col] + dc_value));
+
+}
+
+
+void ihevcd_itrans_recon_dc_chroma(UWORD8 *pu1_pred,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 pred_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 log2_trans_size,
+                                   WORD16 i2_coeff_value)
+{
+    WORD32 row, col;
+    WORD32 add, shift;
+    WORD32 dc_value, quant_out;
+    WORD32 trans_size;
+
+
+    trans_size = (1 << log2_trans_size);
+
+    quant_out = i2_coeff_value;
+
+    shift = IT_SHIFT_STAGE_1;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+    shift = IT_SHIFT_STAGE_2;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+    for(row = 0; row < trans_size; row++)
+        for(col = 0; col < trans_size; col++)
+            pu1_dst[row * dst_strd + (col << 1)] = CLIP_U8((pu1_pred[row * pred_strd + (col << 1)] + dc_value));
+
+}
+
+
diff --git a/decoder/ihevcd_itrans_recon_dc.h b/decoder/ihevcd_itrans_recon_dc.h
new file mode 100644
index 0000000..0e64a9e
--- /dev/null
+++ b/decoder/ihevcd_itrans_recon_dc.h
@@ -0,0 +1,77 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_itrans_recon_dc.h
+*
+* @brief
+*  Header for itrans recon dc functions
+*
+* @author
+*  Naveen
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_ITRANS_RECON_DC_H_
+#define _IHEVCD_ITRANS_RECON_DC_H_
+
+typedef void ihevcd_itrans_recon_dc_luma_ft(UWORD8 *pu1_pred,
+                                            UWORD8 *pu1_dst,
+                                            WORD32 pred_strd,
+                                            WORD32 dst_strd,
+                                            WORD32 log2_trans_size,
+                                            WORD16 i2_coeff_value);
+typedef void ihevcd_itrans_recon_dc_chroma_ft(UWORD8 *pu1_pred,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 pred_strd,
+                                              WORD32 dst_strd,
+                                              WORD32 log2_trans_size,
+                                              WORD16 i2_coeff_value);
+
+/* C function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma;
+
+/* A9Q function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma_a9q;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma_a9q;
+
+/* A9A function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma_a9a;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma_a9a;
+
+/* SSSE3 function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma_ssse3;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma_ssse3;
+
+/* SSS4.2 function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma_sse42;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma_sse42;
+
+/* armv8 function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma_av8;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma_av8;
+
+#endif /* _IHEVCD_ITRANS_RECON_DC_H_ */
diff --git a/decoder/ihevcd_ittiam_logo.c b/decoder/ihevcd_ittiam_logo.c
new file mode 100644
index 0000000..269585b
--- /dev/null
+++ b/decoder/ihevcd_ittiam_logo.c
@@ -0,0 +1,4636 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/*****************************************************************************/
+/*                                                                           */
+/*  File Name         : ihevcd_ittiam_logo.c                                 */
+/*                                                                           */
+/*  Description       : This file contains all the necessary tables for      */
+/*                      inserting ittiam logo to a yuv buffer                */
+/*                                                                           */
+/*  List of Functions : memcpy_2d                                            */
+/*                      insert_logo                                          */
+/*                                                                           */
+/*  Issues / Problems : None                                                 */
+/*                                                                           */
+/*  Revision History  :                                                      */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
+/*         10 10 2005   Ittiam          Draft                                */
+/*                                                                           */
+/*****************************************************************************/
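The two functions named above operate on strided planes; memcpy_2d is the standard row-wise strided copy. A minimal sketch of such a routine, assuming the conventional (dst, dst_strd, src, src_strd, wd, ht) argument order; the actual definition is further down in this file (not shown in this excerpt) and may differ:

/* Sketch only: copies a wd x ht block between two planes whose rows are
 * dst_strd / src_strd bytes apart.  UWORD8/WORD32 come from
 * ihevc_typedefs.h, memcpy from <string.h>. */
static void memcpy_2d_sketch(UWORD8 *pu1_dst, WORD32 dst_strd,
                             const UWORD8 *pu1_src, WORD32 src_strd,
                             WORD32 wd, WORD32 ht)
{
    WORD32 row;
    for(row = 0; row < ht; row++)
    {
        memcpy(pu1_dst, pu1_src, wd);
        pu1_dst += dst_strd;
        pu1_src += src_strd;
    }
}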
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+/* System include files */
+#include <string.h>
+
+/* User include files */
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ihevcd_ittiam_logo.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#ifdef LOGO_EN
+#define CODEC_LOGO 0
+
+const UWORD8 gau1_ihevcd_codec_logo_y[] =
+{
+    0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xD9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC7, 0xC1,
+    0xC1, 0xC1, 0xC1, 0xD2, 0xFF, 0xFF, 0xFF, 0xFF, 0xEA, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1,
+    0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xF5, 0xE2, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xD0, 0xFE, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xCF, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xE4, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xEB, 0xCC, 0xB3, 0xA5, 0x9E, 0xA1, 0xAD, 0xC1, 0xDD, 0xF7,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xEC, 0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0xA2, 0xF3, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF3, 0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0xA1, 0xF1, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xEC, 0xB9, 0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xA5,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFE, 0xB6, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xD4, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xD0, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBA, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFE, 0xD9, 0xA2, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xDC, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xB1, 0xFD, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFC, 0xAD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xE2, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFE, 0xCE, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xF8, 0xA8, 0x9C, 0x9C, 0x9C, 0x9C, 0x9E, 0xEA, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xE7, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C, 0xAB, 0xFC, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xD5, 0x9E, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBD, 0xC9,
+    0xC9, 0xC9, 0xC9, 0xC9, 0xC9, 0xF6, 0xFF, 0xFF, 0xC8, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC7, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC2, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xCF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xEC, 0xA3, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA1, 0xA4, 0xA4, 0xA1, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xEC, 0x9E, 0x9C, 0x9C, 0x9C, 0x9C, 0xA8, 0xF9,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xF7, 0xA5, 0x9C, 0x9C, 0x9C, 0x9C, 0xA2, 0xF1, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xBA, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xD2, 0xF0, 0xFC, 0xFC, 0xF0, 0xD3, 0xA7, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xB6, 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xDE,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xDA, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBD, 0xFF, 0xFF, 0xFF, 0xFF, 0xEE,
+    0xA1, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xB1, 0xF3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF6, 0xB6,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xDC, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBA,
+    0xFF, 0xFF, 0xFF, 0xFE, 0xB6, 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xE4, 0xFF, 0xFF, 0xFF, 0xFF, 0xD2,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xF1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF8,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xB3, 0xB3, 0xB3, 0xB3, 0xB3, 0xB3, 0xB3, 0xB3, 0x9E, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xAC, 0xB3,
+    0xB3, 0xB3, 0xB3, 0xB3, 0xC1, 0xFF, 0xFF, 0xFF, 0xFF, 0xF9, 0xA7, 0x9C, 0x9C, 0x9C, 0x9C, 0xA2,
+    0xF2, 0xFF, 0xFF, 0xEF, 0xA1, 0x9C, 0x9C, 0x9C, 0x9C, 0xAD, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xBA,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xCC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0xAF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC8, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0xD3, 0xFF, 0xFF, 0xCC, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xD2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xAB,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xEA, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0xAF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xEF, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C,
+    0xAF, 0xFE, 0xFC, 0xAA, 0x9C, 0x9C, 0x9C, 0x9C, 0xA2, 0xF5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0xAF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xB4, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9D, 0xEA, 0xE3, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C, 0xBF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA4,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0xAF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xDB, 0x9D, 0x9C, 0x9C, 0x9C,
+    0x9C, 0xC6, 0xBF, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xAB,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xEB, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBF, 0xFA, 0xFA, 0xFA, 0xFA, 0xFA, 0xFA, 0xFA, 0xFA, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC6, 0xD7,
+    0xD7, 0xD7, 0xD7, 0xD7, 0xDE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFA, 0xA5, 0x9C, 0x9C, 0x9C,
+    0x9C, 0xA4, 0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0xAD, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBA,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xCD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC7, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xD4, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xD0,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xF2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF8,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xEB, 0xA0, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA2, 0xF7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xEE,
+    0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xB1, 0xF3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF7, 0xB6,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xB4, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xBA, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xD3, 0xF0, 0xFC, 0xFD, 0xF1, 0xD4, 0xA7, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBD, 0xCB,
+    0xCB, 0xCB, 0xCB, 0xCB, 0xCB, 0xF6, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xDB, 0x9D, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9E, 0xE7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xEB, 0xA3, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA1, 0xA5, 0xA5, 0xA1, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF7, 0xA7, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xB0, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xD4, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC6, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xD6, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFE, 0xCD, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xEB, 0xA0,
+    0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xF6, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFE, 0xD6, 0xA2, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+    0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+    0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xB4,
+    0x9C, 0x9C, 0x9C, 0x9C, 0xC4, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xEC, 0xB8, 0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xA4,
+    0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xD7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC6, 0xC1,
+    0xC1, 0xC1, 0xC1, 0xD2, 0xFF, 0xFF, 0xFF, 0xFF, 0xEA, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1,
+    0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xF5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xE3,
+    0xC1, 0xC1, 0xC1, 0xC1, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xEA, 0xCB, 0xB2, 0xA3, 0x9E, 0xA1, 0xAC, 0xC0, 0xDC, 0xF7,
+
+};
+const UWORD8 gau1_ihevcd_codec_logo_420p_u[] =
+{
+    0x4D, 0x4D, 0x56, 0x80, 0x80, 0x80, 0x80, 0x4F, 0x4D, 0x53, 0x80, 0x80, 0x5E, 0x4D, 0x4D, 0x4D,
+    0x4D, 0x4D, 0x62, 0x5F, 0x4D, 0x4D, 0x68, 0x80, 0x80, 0x80, 0x80, 0x67, 0x4D, 0x4D, 0x61, 0x80,
+    0x80, 0x81, 0x70, 0x56, 0x46, 0x42, 0x4A, 0x5B, 0x40, 0x40, 0x4B, 0x80, 0x80, 0x80, 0x80, 0x42,
+    0x40, 0x48, 0x80, 0x80, 0x55, 0x40, 0x40, 0x3E, 0x3E, 0x3E, 0x59, 0x6D, 0x3F, 0x40, 0x4C, 0x7F,
+    0x80, 0x80, 0x7F, 0x4B, 0x40, 0x3F, 0x70, 0x80, 0x80, 0x62, 0x42, 0x3F, 0x40, 0x40, 0x3F, 0x3F,
+    0x41, 0x41, 0x4C, 0x80, 0x80, 0x80, 0x80, 0x44, 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x4F,
+    0x51, 0x51, 0x65, 0x7F, 0x49, 0x41, 0x41, 0x73, 0x80, 0x80, 0x72, 0x41, 0x41, 0x4B, 0x7F, 0x80,
+    0x67, 0x40, 0x40, 0x3F, 0x43, 0x47, 0x43, 0x3F, 0x41, 0x41, 0x4D, 0x82, 0x82, 0x82, 0x82, 0x44,
+    0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x78, 0x82, 0x82, 0x80, 0x80, 0x61, 0x41, 0x41, 0x5C,
+    0x80, 0x80, 0x5A, 0x41, 0x41, 0x64, 0x80, 0x7D, 0x46, 0x41, 0x40, 0x54, 0x73, 0x7B, 0x73, 0x56,
+    0x41, 0x41, 0x48, 0x66, 0x66, 0x66, 0x66, 0x42, 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x61,
+    0x67, 0x67, 0x76, 0x80, 0x79, 0x42, 0x41, 0x46, 0x7E, 0x7D, 0x45, 0x41, 0x43, 0x7B, 0x80, 0x6D,
+    0x40, 0x41, 0x4A, 0x7E, 0x81, 0x80, 0x81, 0x7F, 0x41, 0x41, 0x41, 0x3E, 0x3E, 0x3E, 0x3E, 0x41,
+    0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x3F, 0x3F, 0x3F, 0x66, 0x80, 0x80, 0x54, 0x41, 0x40,
+    0x6C, 0x6A, 0x41, 0x41, 0x58, 0x80, 0x80, 0x64, 0x41, 0x41, 0x5C, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x41, 0x41, 0x42, 0x45, 0x45, 0x45, 0x45, 0x41, 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x42,
+    0x42, 0x42, 0x67, 0x80, 0x80, 0x6F, 0x40, 0x41, 0x54, 0x51, 0x41, 0x41, 0x72, 0x80, 0x80, 0x63,
+    0x41, 0x41, 0x5C, 0x80, 0x80, 0x80, 0x80, 0x80, 0x41, 0x41, 0x4B, 0x7A, 0x7A, 0x7A, 0x7A, 0x43,
+    0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x6A, 0x71, 0x71, 0x7A, 0x80, 0x80, 0x7F, 0x49, 0x41,
+    0x42, 0x41, 0x41, 0x4B, 0x80, 0x80, 0x80, 0x6D, 0x40, 0x41, 0x4A, 0x7E, 0x81, 0x80, 0x81, 0x7F,
+    0x41, 0x41, 0x4C, 0x81, 0x81, 0x81, 0x81, 0x44, 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x78,
+    0x81, 0x81, 0x80, 0x80, 0x80, 0x80, 0x61, 0x41, 0x41, 0x41, 0x41, 0x66, 0x80, 0x80, 0x80, 0x7D,
+    0x46, 0x41, 0x40, 0x54, 0x73, 0x7B, 0x74, 0x56, 0x41, 0x41, 0x4C, 0x80, 0x80, 0x80, 0x80, 0x44,
+    0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x4F, 0x51, 0x51, 0x65, 0x80, 0x80, 0x80, 0x79, 0x42,
+    0x41, 0x41, 0x44, 0x7B, 0x80, 0x80, 0x80, 0x80, 0x67, 0x40, 0x40, 0x3F, 0x43, 0x47, 0x43, 0x3F,
+    0x40, 0x40, 0x4B, 0x80, 0x80, 0x80, 0x80, 0x42, 0x40, 0x48, 0x80, 0x80, 0x55, 0x40, 0x40, 0x3E,
+    0x3E, 0x3E, 0x59, 0x80, 0x80, 0x80, 0x80, 0x55, 0x40, 0x40, 0x59, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x61, 0x42, 0x3F, 0x40, 0x40, 0x3F, 0x3F, 0x4D, 0x4D, 0x56, 0x80, 0x80, 0x80, 0x80, 0x4F,
+    0x4D, 0x54, 0x80, 0x80, 0x5E, 0x4D, 0x4D, 0x4D, 0x4D, 0x4D, 0x62, 0x80, 0x80, 0x80, 0x80, 0x70,
+    0x4D, 0x4D, 0x74, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, 0x70, 0x55, 0x46, 0x42, 0x4A, 0x5B,
+};
+
+const UWORD8 gau1_ihevcd_codec_logo_420p_v[] =
+{
+    0xB9, 0xB9, 0xAE, 0x80, 0x80, 0x80, 0x80, 0xB6, 0xB9, 0xB1, 0x80, 0x80, 0xA6, 0xB9, 0xB9, 0xB9,
+    0xB9, 0xB9, 0xA1, 0xA4, 0xB9, 0xB9, 0x9B, 0x7F, 0x80, 0x80, 0x7F, 0x9C, 0xB9, 0xB9, 0xA3, 0x80,
+    0x7F, 0x7E, 0x92, 0xAF, 0xC1, 0xC5, 0xBD, 0xA9, 0xC8, 0xC8, 0xBB, 0x80, 0x80, 0x80, 0x80, 0xC4,
+    0xC8, 0xBD, 0x80, 0x80, 0xAF, 0xC8, 0xC8, 0xCA, 0xCA, 0xCA, 0xAB, 0x94, 0xC8, 0xC8, 0xB9, 0x80,
+    0x80, 0x80, 0x80, 0xBA, 0xC8, 0xC8, 0x92, 0x80, 0x7F, 0xA1, 0xC4, 0xC9, 0xC8, 0xC7, 0xC8, 0xC9,
+    0xC6, 0xC6, 0xBA, 0x80, 0x80, 0x80, 0x80, 0xC3, 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0xB6,
+    0xB4, 0xB4, 0x9E, 0x80, 0xBC, 0xC6, 0xC6, 0x8D, 0x80, 0x80, 0x8E, 0xC6, 0xC6, 0xBA, 0x80, 0x7F,
+    0x9B, 0xC7, 0xC7, 0xC8, 0xC3, 0xBF, 0xC3, 0xC8, 0xC6, 0xC6, 0xB9, 0x7C, 0x7C, 0x7C, 0x7C, 0xC2,
+    0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0x88, 0x7D, 0x7D, 0x7F, 0x7F, 0xA1, 0xC6, 0xC6, 0xA8,
+    0x7F, 0x7F, 0xAA, 0xC6, 0xC6, 0x9E, 0x7F, 0x83, 0xC0, 0xC6, 0xC7, 0xB0, 0x8D, 0x84, 0x8D, 0xAF,
+    0xC6, 0xC6, 0xBE, 0x9B, 0x9B, 0x9B, 0x9B, 0xC4, 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0xA1,
+    0x9B, 0x9B, 0x8A, 0x80, 0x87, 0xC5, 0xC6, 0xBF, 0x82, 0x82, 0xC1, 0xC6, 0xC2, 0x85, 0x80, 0x94,
+    0xC7, 0xC6, 0xBC, 0x82, 0x7E, 0x7F, 0x7E, 0x80, 0xC6, 0xC6, 0xC6, 0xC9, 0xC9, 0xC9, 0xC9, 0xC6,
+    0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0xC8, 0xC9, 0xC9, 0x9D, 0x80, 0x7F, 0xB0, 0xC6, 0xC7,
+    0x96, 0x98, 0xC6, 0xC6, 0xAC, 0x7F, 0x80, 0x9F, 0xC6, 0xC6, 0xA7, 0x7F, 0x80, 0x80, 0x80, 0x7F,
+    0xC6, 0xC6, 0xC5, 0xC2, 0xC2, 0xC2, 0xC2, 0xC6, 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0xC5,
+    0xC5, 0xC5, 0x9C, 0x80, 0x80, 0x93, 0xC7, 0xC6, 0xB1, 0xB4, 0xC6, 0xC7, 0x8F, 0x80, 0x80, 0x9F,
+    0xC6, 0xC6, 0xA7, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0xC6, 0xC6, 0xBA, 0x86, 0x86, 0x86, 0x86, 0xC3,
+    0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0x98, 0x90, 0x90, 0x86, 0x80, 0x80, 0x80, 0xBD, 0xC6,
+    0xC5, 0xC6, 0xC6, 0xB9, 0x7F, 0x80, 0x80, 0x95, 0xC7, 0xC6, 0xBC, 0x82, 0x7E, 0x7F, 0x7E, 0x80,
+    0xC6, 0xC6, 0xB9, 0x7E, 0x7E, 0x7E, 0x7E, 0xC3, 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0x88,
+    0x7E, 0x7E, 0x7F, 0x80, 0x80, 0x7F, 0xA2, 0xC6, 0xC6, 0xC6, 0xC6, 0x9C, 0x7F, 0x80, 0x80, 0x83,
+    0xC0, 0xC6, 0xC7, 0xB0, 0x8D, 0x84, 0x8D, 0xAE, 0xC6, 0xC6, 0xBA, 0x80, 0x80, 0x80, 0x80, 0xC3,
+    0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0xB6, 0xB3, 0xB3, 0x9D, 0x80, 0x80, 0x80, 0x87, 0xC5,
+    0xC6, 0xC6, 0xC2, 0x84, 0x80, 0x80, 0x80, 0x7F, 0x9B, 0xC7, 0xC7, 0xC8, 0xC3, 0xBF, 0xC3, 0xC8,
+    0xC8, 0xC8, 0xBB, 0x80, 0x80, 0x80, 0x80, 0xC4, 0xC8, 0xBD, 0x80, 0x80, 0xAF, 0xC8, 0xC8, 0xCA,
+    0xCA, 0xCA, 0xAB, 0x80, 0x80, 0x80, 0x7F, 0xB1, 0xC8, 0xC8, 0xAB, 0x7F, 0x80, 0x80, 0x80, 0x80,
+    0x7F, 0xA1, 0xC4, 0xC8, 0xC8, 0xC7, 0xC8, 0xC9, 0xB9, 0xB9, 0xAF, 0x80, 0x80, 0x80, 0x80, 0xB7,
+    0xB9, 0xB2, 0x80, 0x80, 0xA6, 0xB9, 0xB9, 0xB9, 0xB9, 0xB9, 0xA2, 0x80, 0x80, 0x80, 0x80, 0x93,
+    0xB9, 0xB9, 0x8E, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x92, 0xB0, 0xC1, 0xC5, 0xBD, 0xAA,
+};
+
+const UWORD8 gau1_ihevcd_codec_logo_420sp_uv[] =
+{
+    0x4D, 0xB9, 0x4D, 0xB9, 0x56, 0xAE, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x4F, 0xB6,
+    0x4D, 0xB9, 0x53, 0xB1, 0x80, 0x80, 0x80, 0x80, 0x5E, 0xA6, 0x4D, 0xB9, 0x4D, 0xB9, 0x4D, 0xB9,
+    0x4D, 0xB9, 0x4D, 0xB9, 0x62, 0xA1, 0x5F, 0xA4, 0x4D, 0xB9, 0x4D, 0xB9, 0x68, 0x9B, 0x80, 0x7F,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x67, 0x9C, 0x4D, 0xB9, 0x4D, 0xB9, 0x61, 0xA3, 0x80, 0x80,
+    0x80, 0x7F, 0x81, 0x7E, 0x70, 0x92, 0x56, 0xAF, 0x46, 0xC1, 0x42, 0xC5, 0x4A, 0xBD, 0x5B, 0xA9,
+    0x40, 0xC8, 0x40, 0xC8, 0x4B, 0xBB, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x42, 0xC4,
+    0x40, 0xC8, 0x48, 0xBD, 0x80, 0x80, 0x80, 0x80, 0x55, 0xAF, 0x40, 0xC8, 0x40, 0xC8, 0x3E, 0xCA,
+    0x3E, 0xCA, 0x3E, 0xCA, 0x59, 0xAB, 0x6D, 0x94, 0x3F, 0xC8, 0x40, 0xC8, 0x4C, 0xB9, 0x7F, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x4B, 0xBA, 0x40, 0xC8, 0x3F, 0xC8, 0x70, 0x92, 0x80, 0x80,
+    0x80, 0x7F, 0x62, 0xA1, 0x42, 0xC4, 0x3F, 0xC9, 0x40, 0xC8, 0x40, 0xC7, 0x3F, 0xC8, 0x3F, 0xC9,
+    0x41, 0xC6, 0x41, 0xC6, 0x4C, 0xBA, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x44, 0xC3,
+    0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x4F, 0xB6,
+    0x51, 0xB4, 0x51, 0xB4, 0x65, 0x9E, 0x7F, 0x80, 0x49, 0xBC, 0x41, 0xC6, 0x41, 0xC6, 0x73, 0x8D,
+    0x80, 0x80, 0x80, 0x80, 0x72, 0x8E, 0x41, 0xC6, 0x41, 0xC6, 0x4B, 0xBA, 0x7F, 0x80, 0x80, 0x7F,
+    0x67, 0x9B, 0x40, 0xC7, 0x40, 0xC7, 0x3F, 0xC8, 0x43, 0xC3, 0x47, 0xBF, 0x43, 0xC3, 0x3F, 0xC8,
+    0x41, 0xC6, 0x41, 0xC6, 0x4D, 0xB9, 0x82, 0x7C, 0x82, 0x7C, 0x82, 0x7C, 0x82, 0x7C, 0x44, 0xC2,
+    0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x78, 0x88,
+    0x82, 0x7D, 0x82, 0x7D, 0x80, 0x7F, 0x80, 0x7F, 0x61, 0xA1, 0x41, 0xC6, 0x41, 0xC6, 0x5C, 0xA8,
+    0x80, 0x7F, 0x80, 0x7F, 0x5A, 0xAA, 0x41, 0xC6, 0x41, 0xC6, 0x64, 0x9E, 0x80, 0x7F, 0x7D, 0x83,
+    0x46, 0xC0, 0x41, 0xC6, 0x40, 0xC7, 0x54, 0xB0, 0x73, 0x8D, 0x7B, 0x84, 0x73, 0x8D, 0x56, 0xAF,
+    0x41, 0xC6, 0x41, 0xC6, 0x48, 0xBE, 0x66, 0x9B, 0x66, 0x9B, 0x66, 0x9B, 0x66, 0x9B, 0x42, 0xC4,
+    0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x61, 0xA1,
+    0x67, 0x9B, 0x67, 0x9B, 0x76, 0x8A, 0x80, 0x80, 0x79, 0x87, 0x42, 0xC5, 0x41, 0xC6, 0x46, 0xBF,
+    0x7E, 0x82, 0x7D, 0x82, 0x45, 0xC1, 0x41, 0xC6, 0x43, 0xC2, 0x7B, 0x85, 0x80, 0x80, 0x6D, 0x94,
+    0x40, 0xC7, 0x41, 0xC6, 0x4A, 0xBC, 0x7E, 0x82, 0x81, 0x7E, 0x80, 0x7F, 0x81, 0x7E, 0x7F, 0x80,
+    0x41, 0xC6, 0x41, 0xC6, 0x41, 0xC6, 0x3E, 0xC9, 0x3E, 0xC9, 0x3E, 0xC9, 0x3E, 0xC9, 0x41, 0xC6,
+    0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x3F, 0xC8,
+    0x3F, 0xC9, 0x3F, 0xC9, 0x66, 0x9D, 0x80, 0x80, 0x80, 0x7F, 0x54, 0xB0, 0x41, 0xC6, 0x40, 0xC7,
+    0x6C, 0x96, 0x6A, 0x98, 0x41, 0xC6, 0x41, 0xC6, 0x58, 0xAC, 0x80, 0x7F, 0x80, 0x80, 0x64, 0x9F,
+    0x41, 0xC6, 0x41, 0xC6, 0x5C, 0xA7, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F,
+    0x41, 0xC6, 0x41, 0xC6, 0x42, 0xC5, 0x45, 0xC2, 0x45, 0xC2, 0x45, 0xC2, 0x45, 0xC2, 0x41, 0xC6,
+    0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x42, 0xC5,
+    0x42, 0xC5, 0x42, 0xC5, 0x67, 0x9C, 0x80, 0x80, 0x80, 0x80, 0x6F, 0x93, 0x40, 0xC7, 0x41, 0xC6,
+    0x54, 0xB1, 0x51, 0xB4, 0x41, 0xC6, 0x41, 0xC7, 0x72, 0x8F, 0x80, 0x80, 0x80, 0x80, 0x63, 0x9F,
+    0x41, 0xC6, 0x41, 0xC6, 0x5C, 0xA7, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F,
+    0x41, 0xC6, 0x41, 0xC6, 0x4B, 0xBA, 0x7A, 0x86, 0x7A, 0x86, 0x7A, 0x86, 0x7A, 0x86, 0x43, 0xC3,
+    0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x6A, 0x98,
+    0x71, 0x90, 0x71, 0x90, 0x7A, 0x86, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x49, 0xBD, 0x41, 0xC6,
+    0x42, 0xC5, 0x41, 0xC6, 0x41, 0xC6, 0x4B, 0xB9, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x6D, 0x95,
+    0x40, 0xC7, 0x41, 0xC6, 0x4A, 0xBC, 0x7E, 0x82, 0x81, 0x7E, 0x80, 0x7F, 0x81, 0x7E, 0x7F, 0x80,
+    0x41, 0xC6, 0x41, 0xC6, 0x4C, 0xB9, 0x81, 0x7E, 0x81, 0x7E, 0x81, 0x7E, 0x81, 0x7E, 0x44, 0xC3,
+    0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x78, 0x88,
+    0x81, 0x7E, 0x81, 0x7E, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x61, 0xA2, 0x41, 0xC6,
+    0x41, 0xC6, 0x41, 0xC6, 0x41, 0xC6, 0x66, 0x9C, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x7D, 0x83,
+    0x46, 0xC0, 0x41, 0xC6, 0x40, 0xC7, 0x54, 0xB0, 0x73, 0x8D, 0x7B, 0x84, 0x74, 0x8D, 0x56, 0xAE,
+    0x41, 0xC6, 0x41, 0xC6, 0x4C, 0xBA, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x44, 0xC3,
+    0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x4F, 0xB6,
+    0x51, 0xB3, 0x51, 0xB3, 0x65, 0x9D, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x79, 0x87, 0x42, 0xC5,
+    0x41, 0xC6, 0x41, 0xC6, 0x44, 0xC2, 0x7B, 0x84, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F,
+    0x67, 0x9B, 0x40, 0xC7, 0x40, 0xC7, 0x3F, 0xC8, 0x43, 0xC3, 0x47, 0xBF, 0x43, 0xC3, 0x3F, 0xC8,
+    0x40, 0xC8, 0x40, 0xC8, 0x4B, 0xBB, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x42, 0xC4,
+    0x40, 0xC8, 0x48, 0xBD, 0x80, 0x80, 0x80, 0x80, 0x55, 0xAF, 0x40, 0xC8, 0x40, 0xC8, 0x3E, 0xCA,
+    0x3E, 0xCA, 0x3E, 0xCA, 0x59, 0xAB, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x55, 0xB1,
+    0x40, 0xC8, 0x40, 0xC8, 0x59, 0xAB, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x7F, 0x61, 0xA1, 0x42, 0xC4, 0x3F, 0xC8, 0x40, 0xC8, 0x40, 0xC7, 0x3F, 0xC8, 0x3F, 0xC9,
+    0x4D, 0xB9, 0x4D, 0xB9, 0x56, 0xAF, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x4F, 0xB7,
+    0x4D, 0xB9, 0x54, 0xB2, 0x80, 0x80, 0x80, 0x80, 0x5E, 0xA6, 0x4D, 0xB9, 0x4D, 0xB9, 0x4D, 0xB9,
+    0x4D, 0xB9, 0x4D, 0xB9, 0x62, 0xA2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x70, 0x93,
+    0x4D, 0xB9, 0x4D, 0xB9, 0x74, 0x8E, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x81, 0x7F, 0x70, 0x92, 0x55, 0xB0, 0x46, 0xC1, 0x42, 0xC5, 0x4A, 0xBD, 0x5B, 0xAA,
+};
+const UWORD8 gau1_ihevcd_codec_logo_420sp_vu[] =
+{
+    0xB9, 0x4D, 0xB9, 0x4D, 0xAE, 0x56, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xB6, 0x4F,
+    0xB9, 0x4D, 0xB1, 0x53, 0x80, 0x80, 0x80, 0x80, 0xA6, 0x5E, 0xB9, 0x4D, 0xB9, 0x4D, 0xB9, 0x4D,
+    0xB9, 0x4D, 0xB9, 0x4D, 0xA1, 0x62, 0xA4, 0x5F, 0xB9, 0x4D, 0xB9, 0x4D, 0x9B, 0x68, 0x7F, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x9C, 0x67, 0xB9, 0x4D, 0xB9, 0x4D, 0xA3, 0x61, 0x80, 0x80,
+    0x7F, 0x80, 0x7E, 0x81, 0x92, 0x70, 0xAF, 0x56, 0xC1, 0x46, 0xC5, 0x42, 0xBD, 0x4A, 0xA9, 0x5B,
+    0xC8, 0x40, 0xC8, 0x40, 0xBB, 0x4B, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xC4, 0x42,
+    0xC8, 0x40, 0xBD, 0x48, 0x80, 0x80, 0x80, 0x80, 0xAF, 0x55, 0xC8, 0x40, 0xC8, 0x40, 0xCA, 0x3E,
+    0xCA, 0x3E, 0xCA, 0x3E, 0xAB, 0x59, 0x94, 0x6D, 0xC8, 0x3F, 0xC8, 0x40, 0xB9, 0x4C, 0x80, 0x7F,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0xBA, 0x4B, 0xC8, 0x40, 0xC8, 0x3F, 0x92, 0x70, 0x80, 0x80,
+    0x7F, 0x80, 0xA1, 0x62, 0xC4, 0x42, 0xC9, 0x3F, 0xC8, 0x40, 0xC7, 0x40, 0xC8, 0x3F, 0xC9, 0x3F,
+    0xC6, 0x41, 0xC6, 0x41, 0xBA, 0x4C, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xC3, 0x44,
+    0xC6, 0x41, 0xBC, 0x49, 0x80, 0x80, 0x80, 0x80, 0xAE, 0x56, 0xC6, 0x41, 0xC6, 0x41, 0xB6, 0x4F,
+    0xB4, 0x51, 0xB4, 0x51, 0x9E, 0x65, 0x80, 0x7F, 0xBC, 0x49, 0xC6, 0x41, 0xC6, 0x41, 0x8D, 0x73,
+    0x80, 0x80, 0x80, 0x80, 0x8E, 0x72, 0xC6, 0x41, 0xC6, 0x41, 0xBA, 0x4B, 0x80, 0x7F, 0x7F, 0x80,
+    0x9B, 0x67, 0xC7, 0x40, 0xC7, 0x40, 0xC8, 0x3F, 0xC3, 0x43, 0xBF, 0x47, 0xC3, 0x43, 0xC8, 0x3F,
+    0xC6, 0x41, 0xC6, 0x41, 0xB9, 0x4D, 0x7C, 0x82, 0x7C, 0x82, 0x7C, 0x82, 0x7C, 0x82, 0xC2, 0x44,
+    0xC6, 0x41, 0xBC, 0x49, 0x80, 0x80, 0x80, 0x80, 0xAE, 0x56, 0xC6, 0x41, 0xC6, 0x41, 0x88, 0x78,
+    0x7D, 0x82, 0x7D, 0x82, 0x7F, 0x80, 0x7F, 0x80, 0xA1, 0x61, 0xC6, 0x41, 0xC6, 0x41, 0xA8, 0x5C,
+    0x7F, 0x80, 0x7F, 0x80, 0xAA, 0x5A, 0xC6, 0x41, 0xC6, 0x41, 0x9E, 0x64, 0x7F, 0x80, 0x83, 0x7D,
+    0xC0, 0x46, 0xC6, 0x41, 0xC7, 0x40, 0xB0, 0x54, 0x8D, 0x73, 0x84, 0x7B, 0x8D, 0x73, 0xAF, 0x56,
+    0xC6, 0x41, 0xC6, 0x41, 0xBE, 0x48, 0x9B, 0x66, 0x9B, 0x66, 0x9B, 0x66, 0x9B, 0x66, 0xC4, 0x42,
+    0xC6, 0x41, 0xBC, 0x49, 0x80, 0x80, 0x80, 0x80, 0xAE, 0x56, 0xC6, 0x41, 0xC6, 0x41, 0xA1, 0x61,
+    0x9B, 0x67, 0x9B, 0x67, 0x8A, 0x76, 0x80, 0x80, 0x87, 0x79, 0xC5, 0x42, 0xC6, 0x41, 0xBF, 0x46,
+    0x82, 0x7E, 0x82, 0x7D, 0xC1, 0x45, 0xC6, 0x41, 0xC2, 0x43, 0x85, 0x7B, 0x80, 0x80, 0x94, 0x6D,
+    0xC7, 0x40, 0xC6, 0x41, 0xBC, 0x4A, 0x82, 0x7E, 0x7E, 0x81, 0x7F, 0x80, 0x7E, 0x81, 0x80, 0x7F,
+    0xC6, 0x41, 0xC6, 0x41, 0xC6, 0x41, 0xC9, 0x3E, 0xC9, 0x3E, 0xC9, 0x3E, 0xC9, 0x3E, 0xC6, 0x41,
+    0xC6, 0x41, 0xBC, 0x49, 0x80, 0x80, 0x80, 0x80, 0xAE, 0x56, 0xC6, 0x41, 0xC6, 0x41, 0xC8, 0x3F,
+    0xC9, 0x3F, 0xC9, 0x3F, 0x9D, 0x66, 0x80, 0x80, 0x7F, 0x80, 0xB0, 0x54, 0xC6, 0x41, 0xC7, 0x40,
+    0x96, 0x6C, 0x98, 0x6A, 0xC6, 0x41, 0xC6, 0x41, 0xAC, 0x58, 0x7F, 0x80, 0x80, 0x80, 0x9F, 0x64,
+    0xC6, 0x41, 0xC6, 0x41, 0xA7, 0x5C, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80,
+    0xC6, 0x41, 0xC6, 0x41, 0xC5, 0x42, 0xC2, 0x45, 0xC2, 0x45, 0xC2, 0x45, 0xC2, 0x45, 0xC6, 0x41,
+    0xC6, 0x41, 0xBC, 0x49, 0x80, 0x80, 0x80, 0x80, 0xAE, 0x56, 0xC6, 0x41, 0xC6, 0x41, 0xC5, 0x42,
+    0xC5, 0x42, 0xC5, 0x42, 0x9C, 0x67, 0x80, 0x80, 0x80, 0x80, 0x93, 0x6F, 0xC7, 0x40, 0xC6, 0x41,
+    0xB1, 0x54, 0xB4, 0x51, 0xC6, 0x41, 0xC7, 0x41, 0x8F, 0x72, 0x80, 0x80, 0x80, 0x80, 0x9F, 0x63,
+    0xC6, 0x41, 0xC6, 0x41, 0xA7, 0x5C, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80,
+    0xC6, 0x41, 0xC6, 0x41, 0xBA, 0x4B, 0x86, 0x7A, 0x86, 0x7A, 0x86, 0x7A, 0x86, 0x7A, 0xC3, 0x43,
+    0xC6, 0x41, 0xBC, 0x49, 0x80, 0x80, 0x80, 0x80, 0xAE, 0x56, 0xC6, 0x41, 0xC6, 0x41, 0x98, 0x6A,
+    0x90, 0x71, 0x90, 0x71, 0x86, 0x7A, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0xBD, 0x49, 0xC6, 0x41,
+    0xC5, 0x42, 0xC6, 0x41, 0xC6, 0x41, 0xB9, 0x4B, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x95, 0x6D,
+    0xC7, 0x40, 0xC6, 0x41, 0xBC, 0x4A, 0x82, 0x7E, 0x7E, 0x81, 0x7F, 0x80, 0x7E, 0x81, 0x80, 0x7F,
+    0xC6, 0x41, 0xC6, 0x41, 0xB9, 0x4C, 0x7E, 0x81, 0x7E, 0x81, 0x7E, 0x81, 0x7E, 0x81, 0xC3, 0x44,
+    0xC6, 0x41, 0xBC, 0x49, 0x80, 0x80, 0x80, 0x80, 0xAE, 0x56, 0xC6, 0x41, 0xC6, 0x41, 0x88, 0x78,
+    0x7E, 0x81, 0x7E, 0x81, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0xA2, 0x61, 0xC6, 0x41,
+    0xC6, 0x41, 0xC6, 0x41, 0xC6, 0x41, 0x9C, 0x66, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x83, 0x7D,
+    0xC0, 0x46, 0xC6, 0x41, 0xC7, 0x40, 0xB0, 0x54, 0x8D, 0x73, 0x84, 0x7B, 0x8D, 0x74, 0xAE, 0x56,
+    0xC6, 0x41, 0xC6, 0x41, 0xBA, 0x4C, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xC3, 0x44,
+    0xC6, 0x41, 0xBC, 0x49, 0x80, 0x80, 0x80, 0x80, 0xAE, 0x56, 0xC6, 0x41, 0xC6, 0x41, 0xB6, 0x4F,
+    0xB3, 0x51, 0xB3, 0x51, 0x9D, 0x65, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x87, 0x79, 0xC5, 0x42,
+    0xC6, 0x41, 0xC6, 0x41, 0xC2, 0x44, 0x84, 0x7B, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80,
+    0x9B, 0x67, 0xC7, 0x40, 0xC7, 0x40, 0xC8, 0x3F, 0xC3, 0x43, 0xBF, 0x47, 0xC3, 0x43, 0xC8, 0x3F,
+    0xC8, 0x40, 0xC8, 0x40, 0xBB, 0x4B, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xC4, 0x42,
+    0xC8, 0x40, 0xBD, 0x48, 0x80, 0x80, 0x80, 0x80, 0xAF, 0x55, 0xC8, 0x40, 0xC8, 0x40, 0xCA, 0x3E,
+    0xCA, 0x3E, 0xCA, 0x3E, 0xAB, 0x59, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0xB1, 0x55,
+    0xC8, 0x40, 0xC8, 0x40, 0xAB, 0x59, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7F, 0x80, 0xA1, 0x61, 0xC4, 0x42, 0xC8, 0x3F, 0xC8, 0x40, 0xC7, 0x40, 0xC8, 0x3F, 0xC9, 0x3F,
+    0xB9, 0x4D, 0xB9, 0x4D, 0xAF, 0x56, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0xB7, 0x4F,
+    0xB9, 0x4D, 0xB2, 0x54, 0x80, 0x80, 0x80, 0x80, 0xA6, 0x5E, 0xB9, 0x4D, 0xB9, 0x4D, 0xB9, 0x4D,
+    0xB9, 0x4D, 0xB9, 0x4D, 0xA2, 0x62, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x93, 0x70,
+    0xB9, 0x4D, 0xB9, 0x4D, 0x8E, 0x74, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x7F, 0x81, 0x92, 0x70, 0xB0, 0x55, 0xC1, 0x46, 0xC5, 0x42, 0xBD, 0x4A, 0xAA, 0x5B,
+};
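The four chroma tables above cover the supported 4:2:0 layouts: separate U and V planes for planar output, interleaved CbCr for semi-planar UV (NV12), and interleaved CrCb for semi-planar VU (NV21). A hypothetical sketch of matching a table set to the output color format, assuming iv.h's IV_COLOR_FORMAT_T enum; the helper itself is illustrative only:

/* Sketch only: returns the chroma table(s) for the requested layout. */
static void pick_codec_logo_chroma(IV_COLOR_FORMAT_T e_fmt,
                                   const UWORD8 **ppu1_cb_or_cbcr,
                                   const UWORD8 **ppu1_cr)
{
    switch(e_fmt)
    {
        case IV_YUV_420SP_UV: /* one interleaved CbCr plane */
            *ppu1_cb_or_cbcr = gau1_ihevcd_codec_logo_420sp_uv;
            *ppu1_cr = NULL;
            break;
        case IV_YUV_420SP_VU: /* one interleaved CrCb plane */
            *ppu1_cb_or_cbcr = gau1_ihevcd_codec_logo_420sp_vu;
            *ppu1_cr = NULL;
            break;
        default:              /* 420 planar: separate U and V planes */
            *ppu1_cb_or_cbcr = gau1_ihevcd_codec_logo_420p_u;
            *ppu1_cr = gau1_ihevcd_codec_logo_420p_v;
            break;
    }
}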
+
+const UWORD8 gau1_ihevcd_logo_y[] =
+{
+    0xfd, 0xfd, 0xfd, 0xfb, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfe, 0xfa, 0xfa, 0xfb, 0xfc, 0xfc, 0xfb, 0xfc,
+    0xfd, 0xfc, 0xfa, 0xfd, 0xfc, 0xfb, 0xfb, 0xfd,
+    0xfd, 0xfd, 0xfb, 0xfd, 0xfb, 0xfb, 0xfc, 0xfc,
+    0xfa, 0xfa, 0xfb, 0xfd, 0xfb, 0xfd, 0xfd, 0xfc,
+    0xfd, 0xfe, 0xfc, 0xfc, 0xfb, 0xfd, 0xfc, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfe, 0xfe, 0xfd, 0xfb, 0xfc, 0xfc,
+    0xfc, 0xfd, 0xfc, 0xfc, 0xfb, 0xfc, 0xfc, 0xfe,
+    0xfd, 0xfc, 0xfc, 0xfc, 0xfb, 0xfb, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfb, 0xfb, 0xfc, 0xfd, 0xfd,
+    0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfe, 0xfe,
+    0xfd, 0xfe, 0xfd, 0xfd, 0xfe, 0xfc, 0xfc, 0xfc,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfe, 0xfe, 0xfd, 0xfc,
+    0xfb, 0xfc, 0xfc, 0xfc, 0xfc, 0xfb, 0xfb, 0xfc,
+    0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfc,
+    0xfd, 0xfb, 0xfc, 0xfc, 0xfc, 0xfd, 0xfc, 0xfb,
+    0xfc, 0xfd, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc,
+    0xfc, 0xfc, 0xfb, 0xfb, 0xfb, 0xfb, 0xfc, 0xfc,
+    0xfd, 0xfd, 0xfc, 0xfc, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfe,
+    0xfc, 0xfb, 0xfb, 0xf9, 0xfa, 0xfb, 0xfa, 0xfb,
+    0xfc, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfb,
+    0xfc, 0xfc, 0xfc, 0xfc, 0xfa, 0xfb, 0xfc, 0xfd,
+    0xfc, 0xfb, 0xfd, 0xfc, 0xfd, 0xfd, 0xfc, 0xfc,
+    0xfd, 0xfc, 0xf2, 0xde, 0xd9, 0xe9, 0xf5, 0xf8,
+    0xfb, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfc, 0xfa, 0xfe, 0xfd, 0xfa, 0xf7, 0xf6, 0xf9,
+    0xfa, 0xfc, 0xfc, 0xfa, 0xfc, 0xfc, 0xfc, 0xfa,
+    0xf9, 0xfa, 0xfc, 0xfb, 0xfd, 0xfb, 0xfc, 0xfd,
+    0xfc, 0xfc, 0xfc, 0xfb, 0xfd, 0xfc, 0xfe, 0xfa,
+    0xfd, 0xfa, 0xfa, 0xf3, 0xb3, 0x9f, 0x97, 0xaa,
+    0xce, 0xf4, 0xfa, 0xfd, 0xfc, 0xfe, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfc, 0xfb, 0xfd, 0xf9, 0xdb, 0xd5,
+    0xd6, 0xd7, 0xd7, 0xdb, 0xf7, 0xfc, 0xfd, 0xfd,
+    0xfc, 0xfc, 0xfc, 0xfa, 0xf9, 0xfb, 0xfe, 0xfd,
+    0xfc, 0xfd, 0xfe, 0xfb, 0xfb, 0xfa, 0xfc, 0xfc,
+    0xfb, 0xfd, 0xfb, 0xf9, 0xf5, 0xba, 0x98, 0xa0,
+    0xa3, 0x9b, 0x96, 0xde, 0xfc, 0xf8, 0xfc, 0xfd,
+    0xfc, 0xfc, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfe, 0xfe, 0xfd, 0xf5,
+    0xab, 0x9a, 0xa0, 0x9f, 0x99, 0xab, 0xf0, 0xfa,
+    0xfc, 0xfd, 0xfa, 0xf8, 0xfb, 0xfc, 0xfb, 0xfc,
+    0xfb, 0xfc, 0xfa, 0xf9, 0xfb, 0xfd, 0xfb, 0xfd,
+    0xfa, 0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xf0, 0xa9,
+    0xa1, 0xa1, 0xa2, 0xa1, 0x99, 0xd6, 0xf9, 0xfa,
+    0xfb, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfe,
+    0xfb, 0xf4, 0xaa, 0x9f, 0xa4, 0xa3, 0x9e, 0xaf,
+    0xf3, 0xfd, 0xfa, 0xfd, 0xf8, 0xf2, 0xf3, 0xf0,
+    0xf2, 0xf2, 0xf8, 0xfb, 0xfe, 0xf8, 0xfa, 0xf5,
+    0xf3, 0xf4, 0xf6, 0xf2, 0xf5, 0xfc, 0xfa, 0xfd,
+    0xf1, 0xaa, 0x9d, 0xa0, 0xa0, 0x9f, 0x99, 0xd6,
+    0xf8, 0xfb, 0xfa, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfc, 0xfe, 0xfa, 0xf4, 0xac, 0x9e, 0xa0, 0xa0,
+    0x9f, 0xb0, 0xf3, 0xfa, 0xfb, 0xfc, 0xdb, 0xb8,
+    0xb9, 0xb7, 0xb9, 0xb0, 0xe4, 0xf6, 0xfc, 0xfb,
+    0xeb, 0xbf, 0xb8, 0xba, 0xb9, 0xb5, 0xc2, 0xfd,
+    0xfb, 0xfc, 0xf7, 0xbe, 0x97, 0x9e, 0xa1, 0x9b,
+    0x9e, 0xe1, 0xf9, 0xfb, 0xfa, 0xfb, 0xfd, 0xfc,
+    0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc,
+    0xfc, 0xfc, 0xfc, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc,
+    0xfc, 0xfc, 0xfd, 0xfd, 0xfc, 0xfd, 0xfc, 0xfd,
+    0xfd, 0xfc, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd,
+    0xfc, 0xfd, 0xfd, 0xfc, 0xfc, 0xfd, 0xfd, 0xfc,
+    0xfd, 0xfd, 0xfc, 0xfe, 0xfd, 0xf7, 0xac, 0x9e,
+    0xa2, 0xa2, 0x9e, 0xaf, 0xf3, 0xfd, 0xfc, 0xfa,
+    0xd3, 0x9b, 0x9d, 0x9f, 0x9b, 0x97, 0xd6, 0xf7,
+    0xfc, 0xfa, 0xe3, 0xa5, 0x9a, 0xa0, 0x9d, 0x99,
+    0xaf, 0xf6, 0xfc, 0xfd, 0xfc, 0xf6, 0xb4, 0xa5,
+    0xa1, 0xac, 0xd4, 0xf7, 0xfa, 0xfc, 0xfc, 0xfd,
+    0xfd, 0xfd, 0xfa, 0xfb, 0xfc, 0xfc, 0xfc, 0xfb,
+    0xfb, 0xfb, 0xfd, 0xfa, 0xfd, 0xfc, 0xfa, 0xfa,
+    0xfb, 0xfb, 0xfb, 0xfc, 0xfe, 0xfc, 0xfe, 0xfd,
+    0xfb, 0xfc, 0xfd, 0xfd, 0xfa, 0xfa, 0xfa, 0xfc,
+    0xfd, 0xfb, 0xfb, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd,
+    0xfc, 0xfd, 0xfc, 0xfd, 0xfd, 0xfc, 0xfe, 0xf8,
+    0xac, 0x9e, 0xa2, 0xa3, 0x9e, 0xb0, 0xf4, 0xfc,
+    0xfd, 0xfd, 0xd6, 0x9f, 0xa0, 0xa1, 0xa0, 0x9b,
+    0xda, 0xf7, 0xfa, 0xfa, 0xe5, 0xa8, 0xa0, 0x9f,
+    0xa0, 0x9c, 0xb0, 0xf4, 0xfb, 0xfd, 0xfd, 0xf9,
+    0xf2, 0xe0, 0xd9, 0xe7, 0xf7, 0xfb, 0xfa, 0xfd,
+    0xfa, 0xfd, 0xfb, 0xfe, 0xfe, 0xfc, 0xfb, 0xfc,
+    0xfc, 0xfc, 0xfb, 0xfb, 0xfc, 0xfa, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfb, 0xfe, 0xfb, 0xfa, 0xfc,
+    0xfc, 0xfb, 0xfe, 0xfc, 0xfc, 0xfb, 0xfd, 0xfc,
+    0xfd, 0xfb, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfc,
+    0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfd, 0xfc, 0xfc,
+    0xfe, 0xf7, 0xad, 0x9e, 0xa2, 0xa3, 0x9e, 0xaf,
+    0xf3, 0xfc, 0xfc, 0xfd, 0xd5, 0xa1, 0xa2, 0xa0,
+    0xa1, 0x9d, 0xdb, 0xf9, 0xf9, 0xfa, 0xe6, 0xa9,
+    0x9e, 0xa2, 0xa2, 0x9e, 0xb1, 0xf6, 0xfd, 0xfb,
+    0xfc, 0xfc, 0xf9, 0xf9, 0xf8, 0xf8, 0xfa, 0xfa,
+    0xfa, 0xfc, 0xfb, 0xfd, 0xfc, 0xfa, 0xf9, 0xfa,
+    0xf9, 0xf8, 0xfb, 0xfc, 0xfc, 0xfa, 0xfb, 0xfa,
+    0xfb, 0xf9, 0xfb, 0xfa, 0xfc, 0xfc, 0xf9, 0xfb,
+    0xfa, 0xfb, 0xfa, 0xf9, 0xfa, 0xfa, 0xfb, 0xfc,
+    0xfc, 0xfb, 0xfb, 0xfa, 0xfc, 0xf7, 0xf8, 0xfa,
+    0xfb, 0xfd, 0xfc, 0xfc, 0xfe, 0xfd, 0xfe, 0xfd,
+    0xfc, 0xfd, 0xfe, 0xf8, 0xad, 0x9e, 0xa3, 0xa4,
+    0x9f, 0xad, 0xf2, 0xfd, 0xfb, 0xf9, 0xd4, 0xa1,
+    0xa1, 0xa0, 0xa2, 0x9d, 0xd8, 0xf7, 0xfb, 0xf8,
+    0xe4, 0xab, 0xa1, 0xa4, 0xa3, 0x9e, 0xaf, 0xf3,
+    0xfb, 0xfb, 0xfb, 0xfb, 0xf9, 0xf9, 0xf7, 0xf7,
+    0xfa, 0xfa, 0xfb, 0xfd, 0xfc, 0xf9, 0xfd, 0xf9,
+    0xf4, 0xe1, 0xda, 0xd8, 0xd9, 0xdd, 0xec, 0xf6,
+    0xf5, 0xed, 0xef, 0xee, 0xee, 0xf0, 0xf0, 0xfb,
+    0xf9, 0xef, 0xd8, 0xd6, 0xd8, 0xda, 0xe6, 0xf7,
+    0xfc, 0xfc, 0xfc, 0xf9, 0xf3, 0xde, 0xd8, 0xd5,
+    0xd7, 0xe7, 0xf7, 0xfb, 0xfb, 0xfe, 0xfe, 0xfd,
+    0xff, 0xfe, 0xfd, 0xfc, 0xfd, 0xf8, 0xad, 0x9e,
+    0xa2, 0xa3, 0xa0, 0xaf, 0xef, 0xf0, 0xc0, 0xc2,
+    0xb4, 0xa1, 0xa5, 0xa5, 0xa3, 0x9e, 0xb7, 0xc3,
+    0xc4, 0xc4, 0xbe, 0xa6, 0xa3, 0xa2, 0xa3, 0xa0,
+    0xa6, 0xc2, 0xc4, 0xc4, 0xc3, 0xc2, 0xc4, 0xc4,
+    0xc1, 0xc2, 0xc3, 0xee, 0xfc, 0xfc, 0xfe, 0xfc,
+    0xf5, 0xdd, 0xb7, 0xa3, 0x96, 0x98, 0x96, 0x97,
+    0xaf, 0xc9, 0xdf, 0xb6, 0xb6, 0xb2, 0xb4, 0xb4,
+    0xbd, 0xee, 0xd4, 0xb0, 0x9c, 0x98, 0x97, 0x96,
+    0xa4, 0xbd, 0xf4, 0xf8, 0xf4, 0xd6, 0xb8, 0x9b,
+    0x92, 0x98, 0x97, 0xa2, 0xc0, 0xea, 0xf8, 0xfe,
+    0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfe, 0xf7,
+    0xab, 0x9f, 0xa2, 0xa2, 0x9f, 0xad, 0xeb, 0xe4,
+    0x96, 0x9f, 0xa0, 0xa2, 0xa3, 0xa2, 0xa1, 0xa2,
+    0x9f, 0x9e, 0x9d, 0x9c, 0x9e, 0xa2, 0xa0, 0xa1,
+    0xa0, 0xa1, 0xa0, 0x9e, 0x9c, 0x9c, 0x9b, 0x9b,
+    0x9b, 0x9c, 0x9b, 0x9a, 0x95, 0xdf, 0xfb, 0xfb,
+    0xfb, 0xf0, 0xcc, 0x9f, 0xa0, 0x9f, 0x9d, 0x9f,
+    0xa0, 0xa1, 0x9a, 0x9d, 0xb6, 0x9c, 0xa0, 0x9b,
+    0x9f, 0x9e, 0xac, 0xc4, 0x9d, 0x9e, 0x9e, 0xa0,
+    0xa1, 0xa0, 0x9e, 0x9c, 0xad, 0xe3, 0xce, 0xa5,
+    0x9d, 0x9e, 0xa2, 0xa1, 0x9e, 0x9b, 0x9a, 0xb7,
+    0xed, 0xfc, 0xfe, 0xfc, 0xfc, 0xfd, 0xfd, 0xfc,
+    0xfd, 0xf7, 0xab, 0x9f, 0xa1, 0xa1, 0x9f, 0xaf,
+    0xeb, 0xe5, 0x9c, 0xa2, 0xa2, 0xa2, 0xa2, 0xa2,
+    0xa0, 0xa2, 0xa1, 0xa1, 0xa2, 0xa3, 0xa3, 0xa5,
+    0xa3, 0xa4, 0xa1, 0xa1, 0xa3, 0xa3, 0xa4, 0xa5,
+    0xa3, 0xa3, 0xa2, 0xa0, 0x9f, 0xa1, 0x9d, 0xe0,
+    0xf9, 0xf8, 0xf9, 0xcd, 0x9b, 0x9f, 0xa0, 0xa2,
+    0x9e, 0xa0, 0xa3, 0xa0, 0xa4, 0xa2, 0xa1, 0xa3,
+    0xa5, 0xa3, 0xa5, 0xa1, 0xa5, 0xa0, 0xa0, 0xa0,
+    0xa1, 0xa4, 0xa2, 0xa1, 0xa1, 0xa3, 0x9c, 0xb2,
+    0xa8, 0x9c, 0xa2, 0xa1, 0xa3, 0xa3, 0xa4, 0xa0,
+    0xa2, 0x9b, 0xdb, 0xfa, 0xfe, 0xfd, 0xfd, 0xfe,
+    0xfc, 0xfd, 0xfe, 0xf8, 0xab, 0x9e, 0xa1, 0xa1,
+    0x9e, 0xb0, 0xeb, 0xe7, 0x9c, 0xa1, 0xa0, 0xa2,
+    0xa3, 0xa2, 0xa1, 0xa4, 0xa1, 0xa2, 0xa2, 0xa2,
+    0xa2, 0xa3, 0xa3, 0xa2, 0xa0, 0xa2, 0xa2, 0xa1,
+    0xa1, 0xa2, 0xa2, 0xa2, 0xa2, 0xa2, 0xa2, 0xa0,
+    0x9b, 0xde, 0xf7, 0xf8, 0xed, 0x9d, 0xa0, 0xa0,
+    0xa3, 0xa0, 0xa2, 0x9f, 0xa2, 0x9f, 0xa0, 0x9f,
+    0xa5, 0xa1, 0xa3, 0xa2, 0xa4, 0xa2, 0xa2, 0xa1,
+    0x9b, 0x9f, 0x9e, 0xa3, 0xa4, 0xa3, 0xa3, 0xa0,
+    0xa6, 0x9f, 0xa2, 0x9e, 0x9a, 0x9e, 0xa0, 0xa3,
+    0xa1, 0x9f, 0xa1, 0x9d, 0xbc, 0xfb, 0xfe, 0xff,
+    0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xf8, 0xac, 0x9f,
+    0xa2, 0xa2, 0x9f, 0xab, 0xea, 0xe5, 0x94, 0x9c,
+    0x9c, 0xa1, 0xa3, 0xa2, 0xa0, 0xa1, 0x9d, 0x99,
+    0x9a, 0x98, 0x9c, 0xa1, 0xa0, 0xa1, 0xa1, 0xa2,
+    0xa0, 0x9b, 0x9a, 0x9a, 0x9b, 0x9c, 0xa3, 0xa2,
+    0xa2, 0x9f, 0x9c, 0xe1, 0xf6, 0xf6, 0xbb, 0x9b,
+    0x9e, 0xa3, 0xa0, 0xa4, 0x9e, 0x9f, 0xb3, 0xc0,
+    0xa9, 0x9b, 0xa0, 0xa2, 0xa0, 0xa2, 0xa2, 0xa3,
+    0x9e, 0xa8, 0xcf, 0xdd, 0xd1, 0xa8, 0xa1, 0xa5,
+    0xa4, 0xa4, 0x9e, 0xa4, 0x99, 0xc9, 0xd7, 0xcf,
+    0xa5, 0x9e, 0xa1, 0xa1, 0xa1, 0x9e, 0xa4, 0xfc,
+    0xfe, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xf8,
+    0xac, 0x9f, 0xa3, 0xa3, 0x9e, 0xad, 0xf1, 0xf9,
+    0xf6, 0xf3, 0xcd, 0x9f, 0xa3, 0xa0, 0xa2, 0x9c,
+    0xd6, 0xf1, 0xf2, 0xf3, 0xe3, 0xa8, 0xa0, 0xa4,
+    0xa4, 0x9e, 0xae, 0xf0, 0xf3, 0xf3, 0xf0, 0xb5,
+    0x9e, 0xa2, 0xa2, 0x9f, 0x99, 0xe1, 0xf9, 0xe9,
+    0xac, 0x9e, 0x9f, 0xa1, 0xa1, 0x9e, 0xb1, 0xe9,
+    0xf6, 0xf7, 0xf1, 0xda, 0xa7, 0x9f, 0xa4, 0xa3,
+    0xa0, 0xa2, 0x9a, 0xe3, 0xf7, 0xf5, 0xf6, 0xd2,
+    0x9f, 0xa2, 0xa0, 0xa5, 0xa1, 0x9b, 0xd4, 0xf9,
+    0xfa, 0xf8, 0xdc, 0x9c, 0xa1, 0xa3, 0xa2, 0x9f,
+    0xa7, 0xfb, 0xff, 0xfe, 0xfe, 0xfe, 0xfd, 0xfc,
+    0xfd, 0xf8, 0xac, 0x9f, 0xa3, 0xa4, 0x9e, 0xaf,
+    0xf4, 0xfd, 0xfe, 0xfb, 0xd6, 0xa0, 0xa3, 0xa1,
+    0xa0, 0x9c, 0xd9, 0xf8, 0xfb, 0xfa, 0xe6, 0xaa,
+    0xa0, 0xa3, 0xa3, 0x9f, 0xb0, 0xf6, 0xf9, 0xfb,
+    0xf4, 0xb9, 0x9d, 0xa1, 0xa3, 0x9f, 0x9a, 0xe1,
+    0xf7, 0xda, 0xa3, 0xa1, 0xa1, 0xa1, 0xa0, 0xa5,
+    0xe5, 0xf6, 0xfa, 0xf9, 0xf8, 0xf2, 0xd4, 0x9d,
+    0xa4, 0xa3, 0xa3, 0xa2, 0x9b, 0xf1, 0xf8, 0xf9,
+    0xfa, 0xe4, 0xaa, 0x9f, 0xa1, 0xa6, 0x9f, 0xaa,
+    0xe5, 0xfa, 0xfe, 0xf9, 0xe5, 0xa5, 0x9e, 0xa4,
+    0xa2, 0x9f, 0xa5, 0xfa, 0xfe, 0xfe, 0xfe, 0xfe,
+    0xfd, 0xfc, 0xfd, 0xf7, 0xac, 0x9e, 0xa2, 0xa4,
+    0x9f, 0xaf, 0xf5, 0xfd, 0xfd, 0xfd, 0xd6, 0x9e,
+    0xa3, 0xa4, 0xa0, 0x9c, 0xdb, 0xfa, 0xfa, 0xfb,
+    0xe8, 0xaa, 0xa0, 0xa4, 0xa2, 0x9e, 0xaf, 0xf6,
+    0xfa, 0xfc, 0xf6, 0xb7, 0x9d, 0xa0, 0xa2, 0x9f,
+    0x9d, 0xe1, 0xf6, 0xcd, 0x9f, 0xa3, 0xa3, 0xa2,
+    0x9e, 0xbd, 0xf0, 0xf7, 0xfc, 0xfa, 0xfb, 0xfa,
+    0xe7, 0xa3, 0xa2, 0xa2, 0xa4, 0xa3, 0xa1, 0xf5,
+    0xf8, 0xfb, 0xfb, 0xed, 0xac, 0x9d, 0xa1, 0xa3,
+    0x9e, 0xb6, 0xf0, 0xf9, 0xfc, 0xfc, 0xec, 0xaf,
+    0x9e, 0xa5, 0xa1, 0x9f, 0xa5, 0xfb, 0xfe, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xf7, 0xac, 0x9e,
+    0xa2, 0xa3, 0x9f, 0xaf, 0xf5, 0xfe, 0xfd, 0xfc,
+    0xd3, 0x9f, 0xa1, 0xa2, 0xa0, 0x97, 0xda, 0xf8,
+    0xf9, 0xf8, 0xe5, 0xa7, 0x9e, 0xa3, 0xa0, 0x9c,
+    0xaf, 0xf7, 0xfa, 0xfd, 0xf5, 0xb6, 0x9e, 0xa1,
+    0xa3, 0x9f, 0x99, 0xe2, 0xf5, 0xca, 0x9a, 0xa2,
+    0xa3, 0xa3, 0xa0, 0xbf, 0xf5, 0xfb, 0xfb, 0xf8,
+    0xfa, 0xf6, 0xe9, 0xa3, 0xa3, 0xa2, 0xa3, 0xa1,
+    0xa9, 0xf5, 0xfa, 0xfc, 0xfb, 0xf1, 0xad, 0x9c,
+    0xa3, 0xa4, 0x9d, 0xb9, 0xf0, 0xfb, 0xfc, 0xfb,
+    0xec, 0xb3, 0x9d, 0xa3, 0xa2, 0x9e, 0xa5, 0xfb,
+    0xfe, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xf7,
+    0xad, 0x9e, 0xa2, 0xa3, 0x9e, 0xb0, 0xf5, 0xfd,
+    0xfd, 0xfd, 0xd5, 0xa0, 0xa2, 0xa2, 0xa2, 0x9a,
+    0xdb, 0xfb, 0xfb, 0xfa, 0xe8, 0xaa, 0xa0, 0xa4,
+    0xa2, 0x9c, 0xb1, 0xf8, 0xfa, 0xfc, 0xf7, 0xb8,
+    0x9e, 0xa2, 0xa3, 0x9f, 0x99, 0xe5, 0xf7, 0xd2,
+    0xa1, 0xa2, 0xa2, 0xa4, 0x9f, 0xbe, 0xf0, 0xfa,
+    0xfa, 0xf9, 0xfa, 0xf8, 0xe6, 0x9f, 0xa3, 0xa3,
+    0xa2, 0xa2, 0xad, 0xf7, 0xfc, 0xfa, 0xfc, 0xf5,
+    0xae, 0x9f, 0xa3, 0xa4, 0x9d, 0xb7, 0xed, 0xfd,
+    0xfc, 0xfd, 0xec, 0xb3, 0x9f, 0xa4, 0xa2, 0x9e,
+    0xa5, 0xfb, 0xfe, 0xfe, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfe, 0xf7, 0xad, 0x9f, 0xa2, 0xa3, 0x9f, 0xb0,
+    0xf4, 0xfd, 0xfd, 0xfd, 0xd5, 0xa0, 0xa3, 0xa4,
+    0xa0, 0x9b, 0xdb, 0xfa, 0xfc, 0xfb, 0xe8, 0xaa,
+    0xa0, 0xa4, 0xa1, 0x9c, 0xb0, 0xf8, 0xfb, 0xfb,
+    0xf7, 0xb8, 0x9e, 0xa2, 0xa4, 0xa0, 0x9a, 0xe5,
+    0xf6, 0xdf, 0xa9, 0xa3, 0xa2, 0xa1, 0x9e, 0x9d,
+    0xdc, 0xf5, 0xfa, 0xf7, 0xf7, 0xf2, 0xc3, 0x9d,
+    0xa3, 0xa4, 0xa0, 0xa2, 0xac, 0xf6, 0xfb, 0xfa,
+    0xfc, 0xf5, 0xad, 0x9d, 0xa3, 0xa3, 0x9d, 0xb5,
+    0xef, 0xfc, 0xfc, 0xfc, 0xed, 0xb2, 0x9e, 0xa4,
+    0xa1, 0x9f, 0xa6, 0xfb, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfc, 0xfd, 0xf8, 0xad, 0x9e, 0xa2, 0xa4,
+    0x9f, 0xae, 0xf4, 0xfe, 0xfe, 0xfd, 0xd6, 0xa1,
+    0xa3, 0xa4, 0xa1, 0x9b, 0xda, 0xf9, 0xfb, 0xfa,
+    0xe8, 0xaa, 0x9f, 0xa3, 0xa1, 0x9d, 0xb0, 0xf9,
+    0xf9, 0xfc, 0xf5, 0xb8, 0x9e, 0xa2, 0xa3, 0x9f,
+    0x99, 0xe2, 0xfb, 0xf1, 0xaf, 0x9d, 0xa1, 0xa2,
+    0x9f, 0xa5, 0xa4, 0xd5, 0xe6, 0xeb, 0xe4, 0xc3,
+    0xa1, 0xa1, 0xa3, 0xa5, 0xa0, 0xa2, 0xab, 0xf4,
+    0xfb, 0xfb, 0xfc, 0xf4, 0xac, 0x9d, 0xa1, 0xa3,
+    0x9d, 0xb5, 0xef, 0xfc, 0xfb, 0xfc, 0xee, 0xb2,
+    0x9f, 0xa3, 0xa1, 0x9f, 0xa6, 0xfb, 0xfe, 0xfd,
+    0xfe, 0xfe, 0xfd, 0xfd, 0xfd, 0xf8, 0xad, 0x9e,
+    0xa3, 0xa3, 0x9f, 0xae, 0xf5, 0xfe, 0xfd, 0xfd,
+    0xd6, 0xa1, 0xa2, 0xa5, 0xa1, 0x9c, 0xdb, 0xfa,
+    0xfc, 0xfa, 0xe8, 0xa9, 0x9f, 0xa4, 0xa2, 0x9d,
+    0xb0, 0xf9, 0xfb, 0xfc, 0xf6, 0xb6, 0x9e, 0xa2,
+    0xa2, 0x9f, 0x9b, 0xe3, 0xf9, 0xf6, 0xce, 0x99,
+    0xa1, 0xa4, 0xa1, 0xa6, 0x9e, 0x9e, 0xac, 0xb1,
+    0xa8, 0x9c, 0xa3, 0xa5, 0xa3, 0xa4, 0xa2, 0xa2,
+    0xac, 0xf4, 0xf9, 0xfc, 0xfc, 0xf4, 0xad, 0x9d,
+    0xa1, 0xa4, 0x9d, 0xb5, 0xf0, 0xfc, 0xfb, 0xfb,
+    0xec, 0xb2, 0x9d, 0xa4, 0xa1, 0x9f, 0xa5, 0xfa,
+    0xfe, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xf6,
+    0xab, 0x9e, 0xa3, 0xa2, 0x9e, 0xae, 0xf4, 0xfe,
+    0xfe, 0xfc, 0xd5, 0xa1, 0xa2, 0xa3, 0xa0, 0x9c,
+    0xda, 0xfa, 0xfd, 0xf9, 0xe7, 0xaa, 0x9f, 0xa3,
+    0xa2, 0x9c, 0xb1, 0xf8, 0xfa, 0xfb, 0xf7, 0xb6,
+    0x9f, 0xa1, 0xa2, 0xa1, 0x9b, 0xe3, 0xfc, 0xfa,
+    0xf4, 0xa4, 0xa0, 0xa2, 0xa0, 0xa1, 0xa3, 0x9d,
+    0x9f, 0x9e, 0x9e, 0xa1, 0xa3, 0xa2, 0xa3, 0xa3,
+    0xa1, 0xa1, 0xac, 0xf4, 0xf9, 0xfa, 0xfc, 0xf2,
+    0xad, 0x9d, 0xa1, 0xa4, 0x9d, 0xb5, 0xee, 0xfc,
+    0xfb, 0xfb, 0xed, 0xb2, 0x9f, 0xa5, 0xa2, 0x9e,
+    0xa6, 0xfb, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc,
+    0xf9, 0xf5, 0xab, 0x9e, 0xa1, 0xa0, 0x9f, 0xaf,
+    0xf1, 0xfa, 0xfb, 0xfb, 0xd4, 0x9f, 0xa2, 0xa0,
+    0xa0, 0x9a, 0xd9, 0xf4, 0xfd, 0xf9, 0xe4, 0xa8,
+    0x9f, 0xa2, 0xa2, 0x9b, 0xb0, 0xf6, 0xfa, 0xfa,
+    0xf3, 0xb7, 0x9c, 0xa1, 0xa1, 0x9f, 0x9d, 0xe0,
+    0xfa, 0xfc, 0xfa, 0xdf, 0x9f, 0x9e, 0x9f, 0x9f,
+    0xa0, 0xa4, 0xa3, 0xa2, 0xa0, 0xa0, 0xa2, 0xa3,
+    0xa0, 0xa1, 0xa1, 0x9d, 0xad, 0xf4, 0xfa, 0xf9,
+    0xf7, 0xef, 0xb0, 0x9b, 0xa0, 0xa0, 0x9a, 0xb6,
+    0xed, 0xf9, 0xfd, 0xf7, 0xea, 0xb2, 0x9d, 0xa3,
+    0xa1, 0x9f, 0xa7, 0xf7, 0xfb, 0xfd, 0xfe, 0xfe,
+    0xff, 0xfe, 0xfa, 0xf4, 0xa8, 0x9f, 0xa2, 0xa3,
+    0x9d, 0xad, 0xf1, 0xfb, 0xfb, 0xfc, 0xd3, 0x9e,
+    0xa3, 0xa3, 0xa1, 0x98, 0xd8, 0xf8, 0xfc, 0xf9,
+    0xe4, 0xaa, 0x9f, 0xa2, 0xa2, 0x9b, 0xac, 0xf7,
+    0xfb, 0xf9, 0xf6, 0xb5, 0x9d, 0xa1, 0xa1, 0x9e,
+    0x9a, 0xe0, 0xf9, 0xfb, 0xfb, 0xf7, 0xde, 0xae,
+    0x98, 0xa4, 0xa2, 0xa4, 0xa2, 0xa1, 0x9c, 0xa4,
+    0xc7, 0x9d, 0x9b, 0x9e, 0x9e, 0x9d, 0xa9, 0xf7,
+    0xfb, 0xfa, 0xfa, 0xf3, 0xac, 0x9a, 0xa1, 0x9f,
+    0x96, 0xb1, 0xee, 0xfc, 0xf8, 0xfc, 0xe9, 0xad,
+    0x9b, 0xa0, 0x9d, 0x9b, 0x9e, 0xf9, 0xfc, 0xfd,
+    0xfe, 0xfe, 0xfe, 0xfe, 0xfa, 0xf6, 0xb3, 0xa4,
+    0xaa, 0xa8, 0xa4, 0xb5, 0xf3, 0xfd, 0xfc, 0xfa,
+    0xd7, 0xa5, 0xa7, 0xa8, 0xa7, 0xa5, 0xdc, 0xfb,
+    0xfc, 0xf9, 0xe6, 0xb2, 0xa6, 0xa9, 0xa6, 0xa3,
+    0xb5, 0xf6, 0xfb, 0xfa, 0xf5, 0xbc, 0xa3, 0xa6,
+    0xa7, 0xa5, 0xa3, 0xe4, 0xfb, 0xfe, 0xfa, 0xf9,
+    0xfa, 0xee, 0xcf, 0xb1, 0xa0, 0x9c, 0x9c, 0x9f,
+    0xbf, 0xe0, 0xef, 0xc5, 0xc9, 0xc7, 0xc6, 0xc8,
+    0xcd, 0xfb, 0xfc, 0xfb, 0xfc, 0xfb, 0xcf, 0xc6,
+    0xc4, 0xca, 0xc7, 0xd6, 0xf6, 0xfc, 0xfb, 0xfc,
+    0xf3, 0xd2, 0xc6, 0xc7, 0xc5, 0xc7, 0xcd, 0xfb,
+    0xfd, 0xfe, 0xfe, 0xfe, 0xfd, 0xfd, 0xfe, 0xfb,
+    0xec, 0xe7, 0xe8, 0xe9, 0xeb, 0xeb, 0xfa, 0xfb,
+    0xfc, 0xfc, 0xf7, 0xe9, 0xea, 0xe7, 0xe7, 0xe7,
+    0xf5, 0xfd, 0xfb, 0xfb, 0xf8, 0xe8, 0xe6, 0xe6,
+    0xe7, 0xe9, 0xea, 0xfc, 0xf8, 0xfa, 0xfb, 0xf1,
+    0xe9, 0xea, 0xe9, 0xe7, 0xea, 0xf8, 0xfc, 0xfd,
+    0xfc, 0xfb, 0xfb, 0xfa, 0xf6, 0xef, 0xe7, 0xe5,
+    0xe3, 0xe6, 0xf5, 0xf7, 0xfb, 0xfb, 0xfa, 0xfb,
+    0xfa, 0xfb, 0xfd, 0xfd, 0xfe, 0xfb, 0xfb, 0xfc,
+    0xfa, 0xfb, 0xf9, 0xf9, 0xf7, 0xf9, 0xfc, 0xfc,
+    0xfa, 0xfa, 0xfb, 0xfa, 0xf7, 0xf8, 0xf9, 0xfb,
+    0xfc, 0xfc, 0xfc, 0xfd, 0xfb, 0xfb, 0xfd, 0xfb,
+    0xfc, 0xfb, 0xfb, 0xfa, 0xf9, 0xfa, 0xfc, 0xfa,
+    0xfb, 0xfd, 0xfb, 0xfb, 0xfc, 0xfa, 0xfb, 0xfc,
+    0xf9, 0xfb, 0xfc, 0xfb, 0xfd, 0xfc, 0xfd, 0xfa,
+    0xfa, 0xfd, 0xf9, 0xf9, 0xfa, 0xfb, 0xfb, 0xfd,
+    0xfa, 0xfd, 0xfe, 0xfd, 0xfa, 0xfa, 0xfb, 0xfc,
+    0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfd, 0xfb, 0xfc,
+    0xfb, 0xfc, 0xfb, 0xfc, 0xfb, 0xfe, 0xfc, 0xfc,
+    0xfc, 0xff, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xfd,
+    0xfc, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xfc, 0xfe,
+    0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfc,
+    0xfd, 0xff, 0xfe, 0xfd, 0xfd, 0xfc, 0xfb, 0xfb,
+    0xfc, 0xfc, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfd,
+    0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfc, 0xfd,
+    0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xfd, 0xfb,
+    0xfb, 0xfd, 0xfd, 0xfe, 0xfe, 0xfe, 0xfd, 0xfd,
+    0xfe, 0xfc, 0xfd, 0xfd, 0xfd, 0xfb, 0xfa, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfc, 0xfb, 0xfb, 0xfc, 0xfc,
+    0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xff, 0xfe, 0xfc,
+    0xfd, 0xfc, 0xfd, 0xfe, 0xfd, 0xfd, 0xfe, 0xfe,
+    0xfe, 0xfd, 0xfd, 0xfb, 0xfd, 0xfb, 0xfc, 0xfc,
+    0xfb, 0xfd, 0xfd, 0xff, 0xfe, 0xfe, 0xfd, 0xfd,
+    0xfc, 0xfb, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd,
+    0xfc, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfc, 0xfd,
+    0xfc, 0xfc, 0xfd, 0xfc, 0xfd, 0xfd, 0xfc, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfc,
+    0xfc, 0xfc, 0xfb, 0xfd, 0xfd, 0xfe, 0xfd, 0xfe,
+    0xfd, 0xfd, 0xfe, 0xfd, 0xfc, 0xfc, 0xfd, 0xfc,
+    0xfb, 0xfc, 0xfd, 0xfd, 0xfd, 0xfb, 0xfc, 0xfb,
+    0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfb, 0xfb, 0xfc,
+    0xfc, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd,
+    0xfd, 0xfd, 0xfd, 0xfc, 0xfc, 0xfb, 0xfb, 0xfc,
+    0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfd, 0xfd,
+    0xfb, 0xfc, 0xfc, 0xfb, 0xfd, 0xfc, 0xfc, 0xfc,
+    0xfd, 0xfd, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfc,
+    0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd,
+    0xfc, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfb, 0xfc, 0xfd, 0xfc, 0xfd, 0xfd, 0xfd,
+    0xfb, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfd, 0xfc,
+    0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfd, 0xfc,
+    0xfb, 0xfb, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfc,
+    0xfc, 0xfc, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc,
+    0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
+    0xfc, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfd, 0xfd,
+    0xfc, 0xfc, 0xfb, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc,
+    0xfc, 0xfb, 0xfd, 0xfc, 0xfc, 0xfb, 0xfc, 0xfc,
+    0xfc, 0xfc, 0xfd, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc,
+    0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+    0xfd, 0xfc, 0xfd, 0xfb, 0xfb, 0xfd, 0xfc, 0xfc,
+    0xfd, 0xfd, 0xfb, 0xfd, 0xfe, 0xfe, 0xfd, 0xfd,
+    0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfc, 0xfb,
+    0xfd, 0xfd, 0xfc, 0xfc, 0xfc, 0xfb, 0xfc, 0xfb,
+    0xfd, 0xfe, 0xfe, 0xfd, 0xfb, 0xfb, 0xfc, 0xfd,
+    0xfd, 0xfc, 0xfd, 0xfd, 0xfc, 0xfc, 0xfb, 0xfa,
+    0xfc, 0xfb, 0xfb, 0xfb, 0xfd, 0xfd, 0xfc, 0xfb,
+    0xfc, 0xfc, 0xfc, 0xfc, 0xfb, 0xfd, 0xfb, 0xfa,
+
+};
+const UWORD8 gau1_ihevcd_logo_420p_u[] =
+{
+    0x7F, 0x7D, 0x7F, 0x80, 0x7D, 0x7E, 0x7D, 0x82,
+    0x80, 0x81, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7E,
+    0x7B, 0x7D, 0x7D, 0x81, 0x7E, 0x7D, 0x80, 0x80,
+    0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x80,
+    0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x7F,
+    0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x81, 0x81, 0x80,
+    0x81, 0x81, 0x81, 0x7F, 0x7D, 0x7E, 0x81, 0x7D,
+    0x7B, 0x7E, 0x7F, 0x7E, 0x80, 0x80, 0x80, 0x78,
+    0x78, 0x7C, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x7F,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x7F, 0x7D, 0x7C, 0x6E, 0x69, 0x70, 0x7B,
+    0x7E, 0x7D, 0x82, 0x82, 0x7F, 0x80, 0x80, 0x82,
+    0x7E, 0x80, 0x80, 0x71, 0x4B, 0x4A, 0x64, 0x7A,
+    0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80,
+    0x80, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x7F,
+    0x77, 0x54, 0x43, 0x53, 0x76, 0x7F, 0x75, 0x75,
+    0x77, 0x7E, 0x80, 0x7A, 0x78, 0x74, 0x7A, 0x7E,
+    0x66, 0x39, 0x34, 0x57, 0x79, 0x7F, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F,
+    0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x7F,
+    0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x79, 0x49, 0x39,
+    0x4A, 0x77, 0x7D, 0x5C, 0x4F, 0x55, 0x6F, 0x79,
+    0x64, 0x52, 0x52, 0x6B, 0x80, 0x70, 0x4F, 0x47,
+    0x6A, 0x7B, 0x7E, 0x7E, 0x80, 0x7F, 0x7D, 0x7F,
+    0x7E, 0x7F, 0x7D, 0x7E, 0x7E, 0x7D, 0x7E, 0x80,
+    0x80, 0x7F, 0x7E, 0x81, 0x80, 0x7D, 0x7D, 0x80,
+    0x7F, 0x7E, 0x7A, 0x48, 0x3A, 0x45, 0x78, 0x7E,
+    0x55, 0x3B, 0x42, 0x6D, 0x7F, 0x5C, 0x3D, 0x40,
+    0x64, 0x80, 0x7E, 0x76, 0x6C, 0x7F, 0x7D, 0x80,
+    0x7E, 0x7A, 0x7B, 0x7C, 0x80, 0x80, 0x80, 0x7E,
+    0x80, 0x7F, 0x79, 0x7B, 0x80, 0x80, 0x80, 0x80,
+    0x7A, 0x79, 0x7F, 0x81, 0x80, 0x7F, 0x7E, 0x7A,
+    0x48, 0x3A, 0x45, 0x74, 0x6D, 0x4E, 0x3B, 0x3F,
+    0x5F, 0x6B, 0x51, 0x3B, 0x3E, 0x58, 0x6B, 0x6A,
+    0x6C, 0x68, 0x76, 0x81, 0x7F, 0x74, 0x60, 0x57,
+    0x5C, 0x66, 0x67, 0x65, 0x63, 0x69, 0x65, 0x59,
+    0x5A, 0x65, 0x75, 0x6F, 0x62, 0x57, 0x5B, 0x6D,
+    0x7A, 0x81, 0x7D, 0x7F, 0x79, 0x49, 0x3A, 0x48,
+    0x6D, 0x44, 0x3E, 0x3E, 0x3D, 0x3C, 0x3C, 0x3B,
+    0x3A, 0x3B, 0x3F, 0x3F, 0x3E, 0x3A, 0x3D, 0x5B,
+    0x7C, 0x6E, 0x54, 0x3E, 0x34, 0x39, 0x3D, 0x3D,
+    0x3D, 0x3C, 0x43, 0x44, 0x3B, 0x38, 0x3D, 0x56,
+    0x49, 0x3B, 0x3D, 0x3B, 0x49, 0x6E, 0x7F, 0x80,
+    0x7E, 0x79, 0x49, 0x3A, 0x45, 0x6B, 0x47, 0x43,
+    0x3A, 0x3C, 0x40, 0x42, 0x3D, 0x39, 0x3D, 0x40,
+    0x40, 0x40, 0x3F, 0x3F, 0x54, 0x7A, 0x56, 0x3B,
+    0x3B, 0x42, 0x48, 0x42, 0x3F, 0x3D, 0x3C, 0x45,
+    0x51, 0x4D, 0x41, 0x3B, 0x3B, 0x4B, 0x52, 0x43,
+    0x35, 0x45, 0x6D, 0x7E, 0x7E, 0x7E, 0x7A, 0x49,
+    0x3A, 0x46, 0x77, 0x76, 0x4E, 0x3B, 0x42, 0x62,
+    0x77, 0x56, 0x3A, 0x3E, 0x5F, 0x75, 0x5F, 0x3C,
+    0x3B, 0x59, 0x6B, 0x46, 0x39, 0x46, 0x66, 0x75,
+    0x6D, 0x51, 0x3D, 0x3C, 0x58, 0x7A, 0x6E, 0x44,
+    0x39, 0x43, 0x6F, 0x77, 0x5B, 0x3B, 0x3C, 0x66,
+    0x7F, 0x7F, 0x7E, 0x7A, 0x48, 0x3A, 0x45, 0x7A,
+    0x7B, 0x4E, 0x3C, 0x44, 0x68, 0x7C, 0x5C, 0x3B,
+    0x40, 0x63, 0x7D, 0x65, 0x3B, 0x3E, 0x5B, 0x65,
+    0x3F, 0x3A, 0x4D, 0x72, 0x81, 0x7C, 0x5A, 0x3C,
+    0x3D, 0x5C, 0x7E, 0x71, 0x46, 0x3B, 0x46, 0x75,
+    0x7C, 0x61, 0x3C, 0x3C, 0x66, 0x7F, 0x7F, 0x7E,
+    0x7A, 0x49, 0x3B, 0x47, 0x7B, 0x7B, 0x50, 0x3D,
+    0x46, 0x69, 0x7F, 0x5D, 0x3E, 0x40, 0x64, 0x7F,
+    0x67, 0x3D, 0x3D, 0x5B, 0x69, 0x43, 0x3D, 0x4B,
+    0x6D, 0x7C, 0x78, 0x58, 0x3D, 0x3E, 0x5E, 0x7F,
+    0x73, 0x48, 0x3E, 0x4B, 0x76, 0x7E, 0x63, 0x3D,
+    0x3C, 0x67, 0x7F, 0x7F, 0x7E, 0x79, 0x48, 0x3A,
+    0x46, 0x7B, 0x7C, 0x51, 0x3E, 0x45, 0x68, 0x7D,
+    0x5D, 0x3B, 0x3F, 0x64, 0x7E, 0x68, 0x3E, 0x38,
+    0x59, 0x74, 0x51, 0x3D, 0x3F, 0x50, 0x5A, 0x53,
+    0x43, 0x3C, 0x3F, 0x5D, 0x7F, 0x72, 0x48, 0x3C,
+    0x4A, 0x77, 0x7E, 0x62, 0x3E, 0x3C, 0x67, 0x80,
+    0x80, 0x7F, 0x79, 0x4A, 0x38, 0x49, 0x79, 0x7C,
+    0x51, 0x3A, 0x40, 0x67, 0x7B, 0x5A, 0x39, 0x3C,
+    0x63, 0x7E, 0x66, 0x3D, 0x37, 0x5A, 0x7B, 0x65,
+    0x47, 0x38, 0x39, 0x3F, 0x3B, 0x3B, 0x3B, 0x3D,
+    0x5A, 0x7E, 0x71, 0x46, 0x3B, 0x47, 0x74, 0x7E,
+    0x5E, 0x3A, 0x3E, 0x68, 0x7F, 0x7E, 0x80, 0x7A,
+    0x58, 0x4A, 0x5A, 0x79, 0x7E, 0x5D, 0x49, 0x4F,
+    0x6E, 0x7C, 0x64, 0x4D, 0x4F, 0x6B, 0x80, 0x6E,
+    0x4E, 0x4A, 0x66, 0x7C, 0x7A, 0x6B, 0x54, 0x48,
+    0x4A, 0x57, 0x5D, 0x57, 0x58, 0x6C, 0x80, 0x78,
+    0x5A, 0x55, 0x5D, 0x79, 0x81, 0x6A, 0x53, 0x59,
+    0x72, 0x7E, 0x7F, 0x80, 0x7E, 0x73, 0x6F, 0x76,
+    0x80, 0x7F, 0x75, 0x72, 0x70, 0x7A, 0x7F, 0x78,
+    0x72, 0x72, 0x77, 0x80, 0x7D, 0x75, 0x6F, 0x79,
+    0x7F, 0x80, 0x80, 0x79, 0x72, 0x6D, 0x79, 0x7E,
+    0x7C, 0x7A, 0x7D, 0x81, 0x80, 0x77, 0x78, 0x7D,
+    0x7F, 0x7F, 0x7D, 0x7C, 0x7B, 0x7C, 0x7E, 0x81,
+    0x7F, 0x80, 0x7F, 0x80, 0x81, 0x81, 0x81, 0x81,
+    0x80, 0x80, 0x81, 0x81, 0x81, 0x81, 0x7E, 0x80,
+    0x80, 0x81, 0x81, 0x80, 0x7F, 0x80, 0x7C, 0x7B,
+    0x7F, 0x80, 0x81, 0x7E, 0x81, 0x7F, 0x7D, 0x7E,
+    0x7C, 0x7F, 0x80, 0x81, 0x7F, 0x7E, 0x7C, 0x7F,
+    0x81, 0x80, 0x7C, 0x7B, 0x7D, 0x7E, 0x7E, 0x7F,
+    0x80, 0x7E, 0x7E, 0x7F, 0x81, 0x7E, 0x80, 0x7F,
+    0x7F, 0x7D, 0x7C, 0x7F, 0x82, 0x7E, 0x7E, 0x7E,
+    0x80, 0x7C, 0x7C, 0x7F, 0x80, 0x7D, 0x7D, 0x7F,
+    0x7F, 0x82, 0x7F, 0x7E, 0x82, 0x7E, 0x7F, 0x80,
+    0x7F, 0x7D, 0x80, 0x82, 0x80, 0x7C, 0x7E, 0x7F,
+    0x7F, 0x81,
+};
+
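+/* V (Cr) plane of the logo for 4:2:0 planar output, as the _420p_v suffix indicates. */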
+const UWORD8 gau1_ihevcd_logo_420p_v[] =
+{
+    0x7E, 0x80, 0x7D, 0x7E, 0x80, 0x81, 0x7E, 0x7C,
+    0x80, 0x81, 0x7E, 0x7D, 0x80, 0x81, 0x7C, 0x81,
+    0x81, 0x7F, 0x81, 0x7E, 0x80, 0x7F, 0x7C, 0x7F,
+    0x80, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x7F,
+    0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x7F,
+    0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x7C, 0x7B, 0x7F, 0x81, 0x7F, 0x7F, 0x81,
+    0x82, 0x81, 0x80, 0x81, 0x7F, 0x7D, 0x7E, 0x86,
+    0x87, 0x83, 0x81, 0x7F, 0x7F, 0x80, 0x80, 0x80,
+    0x7F, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80,
+    0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x80,
+    0x80, 0x7F, 0x81, 0x81, 0x8C, 0x8D, 0x89, 0x82,
+    0x7F, 0x81, 0x7A, 0x7C, 0x7E, 0x7F, 0x7C, 0x7C,
+    0x80, 0x7F, 0x7F, 0x8C, 0xAE, 0xAE, 0x94, 0x80,
+    0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80,
+    0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F,
+    0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7E,
+    0x82, 0xA8, 0xB5, 0xA5, 0x83, 0x7D, 0x84, 0x83,
+    0x83, 0x81, 0x7F, 0x81, 0x83, 0x84, 0x80, 0x7C,
+    0x94, 0xBC, 0xC0, 0x9E, 0x82, 0x80, 0x80, 0x7F,
+    0x80, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x80,
+    0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80,
+    0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x82, 0xB3, 0xC2,
+    0xAD, 0x83, 0x7F, 0x9C, 0xAD, 0xA5, 0x89, 0x81,
+    0x96, 0xA8, 0xA3, 0x8C, 0x7D, 0x87, 0xAD, 0xB1,
+    0x93, 0x82, 0x80, 0x7F, 0x7E, 0x7F, 0x80, 0x7F,
+    0x80, 0x7F, 0x7F, 0x81, 0x80, 0x80, 0x81, 0x81,
+    0x80, 0x81, 0x82, 0x7C, 0x7D, 0x82, 0x81, 0x80,
+    0x7F, 0x7F, 0x83, 0xB2, 0xBF, 0xB0, 0x82, 0x7F,
+    0xA1, 0xC0, 0xB7, 0x88, 0x7A, 0x9E, 0xC0, 0xB9,
+    0x93, 0x80, 0x80, 0x86, 0x8B, 0x7E, 0x7E, 0x7F,
+    0x7F, 0x81, 0x80, 0x7E, 0x7E, 0x7F, 0x81, 0x80,
+    0x7F, 0x7D, 0x81, 0x81, 0x80, 0x7E, 0x7D, 0x7D,
+    0x81, 0x7F, 0x7F, 0x7E, 0x7F, 0x80, 0x7F, 0x82,
+    0xB2, 0xBE, 0xB1, 0x85, 0x8E, 0xAA, 0xC0, 0xBB,
+    0x96, 0x8E, 0xA6, 0xBE, 0xB8, 0x9D, 0x8E, 0x8E,
+    0x8C, 0x90, 0x84, 0x7C, 0x81, 0x87, 0x98, 0xA1,
+    0x9A, 0x92, 0x91, 0x96, 0x97, 0x8D, 0x90, 0xA0,
+    0x9E, 0x95, 0x82, 0x88, 0x96, 0xA1, 0x9C, 0x8B,
+    0x81, 0x7E, 0x80, 0x7F, 0x83, 0xB2, 0xC0, 0xB0,
+    0x8B, 0xB7, 0xBE, 0xBE, 0xBF, 0xBE, 0xBE, 0xBD,
+    0xBF, 0xC2, 0xBF, 0xBA, 0xBE, 0xBE, 0xBD, 0x9B,
+    0x82, 0x8A, 0xA7, 0xBB, 0xC3, 0xBF, 0xB9, 0xB8,
+    0xB9, 0xBB, 0xB2, 0xB2, 0xBE, 0xC0, 0xBB, 0x9F,
+    0xAD, 0xBA, 0xBD, 0xBD, 0xAF, 0x89, 0x7E, 0x7D,
+    0x7F, 0x82, 0xB2, 0xC0, 0xB3, 0x8B, 0xB3, 0xBA,
+    0xC0, 0xC0, 0xBB, 0xB9, 0xBB, 0xBF, 0xC0, 0xBC,
+    0xB6, 0xBA, 0xBB, 0xBB, 0x9E, 0x84, 0xA2, 0xBE,
+    0xBE, 0xB8, 0xB0, 0xB9, 0xBD, 0xBE, 0xBF, 0xB3,
+    0xA4, 0xAC, 0xBA, 0xBF, 0xBC, 0xAA, 0xA4, 0xB8,
+    0xC2, 0xB5, 0x8D, 0x80, 0x81, 0x7E, 0x83, 0xB1,
+    0xBE, 0xB1, 0x82, 0x84, 0xAA, 0xC0, 0xBA, 0x8C,
+    0x85, 0x9F, 0xBF, 0xBC, 0x95, 0x85, 0x97, 0xBD,
+    0xBD, 0x9E, 0x8A, 0xB4, 0xC2, 0xB4, 0x8F, 0x83,
+    0x89, 0xA8, 0xC1, 0xC1, 0xA1, 0x85, 0x8B, 0xB5,
+    0xC1, 0xB5, 0x8A, 0x81, 0x9C, 0xBF, 0xBC, 0x8F,
+    0x80, 0x7E, 0x7F, 0x82, 0xB1, 0xBF, 0xB1, 0x82,
+    0x7F, 0xA9, 0xC0, 0xB8, 0x89, 0x83, 0x9D, 0xBF,
+    0xBD, 0x93, 0x81, 0x94, 0xBE, 0xBA, 0x9C, 0x8F,
+    0xB5, 0xBD, 0xAB, 0x85, 0x7C, 0x7E, 0x9F, 0xBF,
+    0xBE, 0x9C, 0x81, 0x86, 0xB2, 0xBE, 0xB1, 0x86,
+    0x80, 0x98, 0xBE, 0xBC, 0x91, 0x7F, 0x7E, 0x7F,
+    0x82, 0xB1, 0xBF, 0xB1, 0x81, 0x81, 0xA9, 0xC0,
+    0xB7, 0x89, 0x81, 0x9C, 0xBD, 0xBB, 0x91, 0x80,
+    0x93, 0xBD, 0xBA, 0x9C, 0x8E, 0xB7, 0xC1, 0xB0,
+    0x89, 0x7C, 0x82, 0xA3, 0xBE, 0xBC, 0x9A, 0x82,
+    0x82, 0xAF, 0xBF, 0xAE, 0x86, 0x7E, 0x98, 0xBC,
+    0xBD, 0x90, 0x7F, 0x7F, 0x7F, 0x82, 0xB1, 0xBE,
+    0xB2, 0x81, 0x80, 0xA9, 0xBF, 0xB7, 0x89, 0x82,
+    0x9C, 0xBC, 0xBA, 0x90, 0x80, 0x92, 0xBA, 0xC0,
+    0x9F, 0x84, 0xA8, 0xC0, 0xBD, 0xA5, 0x9C, 0xA3,
+    0xB5, 0xBF, 0xBB, 0x9C, 0x81, 0x84, 0xAE, 0xC0,
+    0xB0, 0x85, 0x7F, 0x97, 0xBD, 0xBC, 0x91, 0x7F,
+    0x7F, 0x7E, 0x82, 0xB3, 0xC1, 0xB1, 0x84, 0x7E,
+    0xA7, 0xC0, 0xB7, 0x8B, 0x80, 0x9E, 0xBF, 0xBA,
+    0x93, 0x7E, 0x93, 0xBB, 0xC2, 0x9F, 0x80, 0x92,
+    0xB5, 0xC2, 0xC0, 0xBB, 0xBD, 0xBB, 0xBE, 0xBE,
+    0x9D, 0x82, 0x85, 0xAC, 0xC1, 0xAF, 0x87, 0x80,
+    0x9A, 0xBC, 0xB7, 0x90, 0x80, 0x80, 0x7E, 0x82,
+    0xA6, 0xB1, 0xA4, 0x83, 0x7E, 0x9C, 0xAF, 0xA8,
+    0x89, 0x7F, 0x98, 0xAF, 0xAB, 0x8F, 0x7D, 0x8E,
+    0xAC, 0xB1, 0x95, 0x81, 0x80, 0x8C, 0xA6, 0xB5,
+    0xB6, 0xA2, 0x9A, 0xA3, 0xA2, 0x8D, 0x7F, 0x84,
+    0x96, 0xA8, 0x9B, 0x83, 0x7C, 0x90, 0xA5, 0x9F,
+    0x8A, 0x80, 0x7E, 0x7F, 0x80, 0x88, 0x89, 0x85,
+    0x7E, 0x7D, 0x85, 0x88, 0x88, 0x81, 0x7F, 0x86,
+    0x89, 0x88, 0x84, 0x7D, 0x82, 0x86, 0x88, 0x84,
+    0x7E, 0x7F, 0x80, 0x82, 0x88, 0x88, 0x83, 0x7E,
+    0x80, 0x80, 0x7F, 0x7D, 0x7E, 0x81, 0x81, 0x82,
+    0x7F, 0x7D, 0x81, 0x82, 0x81, 0x80, 0x7D, 0x7F,
+    0x80, 0x80, 0x7F, 0x7D, 0x7C, 0x7E, 0x7E, 0x7C,
+    0x7B, 0x7F, 0x7E, 0x7F, 0x80, 0x7D, 0x7C, 0x7F,
+    0x80, 0x7F, 0x7E, 0x7E, 0x7F, 0x7E, 0x80, 0x80,
+    0x7E, 0x7D, 0x81, 0x81, 0x7C, 0x81, 0x81, 0x7F,
+    0x81, 0x81, 0x7F, 0x7D, 0x7F, 0x81, 0x81, 0x80,
+    0x7F, 0x7F, 0x81, 0x82, 0x81, 0x7F, 0x81, 0x7F,
+    0x7E, 0x81, 0x82, 0x7F, 0x7E, 0x81, 0x7F, 0x7F,
+    0x80, 0x80, 0x81, 0x7D, 0x7C, 0x80, 0x81, 0x80,
+    0x7D, 0x80, 0x80, 0x7C, 0x7F, 0x81, 0x80, 0x7F,
+    0x7F, 0x7B, 0x7F, 0x7F, 0x7C, 0x7D, 0x81, 0x81,
+    0x7D, 0x7D, 0x80, 0x81, 0x7E, 0x7E, 0x7F, 0x81,
+    0x7F, 0x7B
+};
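+
+/* Logo chroma for 4:2:0 semi-planar output: Cb/Cr bytes interleaved in UV (NV12-like) order. */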
+const UWORD8 gau1_ihevcd_logo_420sp_uv[] =
+{
+    0x7F, 0x7E, 0x7D, 0x80, 0x7F, 0x7D, 0x80, 0x7E, 0x7D, 0x80, 0x7E, 0x81, 0x7D, 0x7E, 0x82, 0x7C,
+    0x80, 0x80, 0x81, 0x81, 0x7F, 0x7E, 0x80, 0x7D, 0x80, 0x80, 0x7F, 0x81, 0x80, 0x7C, 0x7E, 0x81,
+    0x7B, 0x81, 0x7D, 0x7F, 0x7D, 0x81, 0x81, 0x7E, 0x7E, 0x80, 0x7D, 0x7F, 0x80, 0x7C, 0x80, 0x7F,
+    0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F,
+    0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
+    0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x81, 0x80, 0x81, 0x80, 0x80, 0x80,
+    0x81, 0x80, 0x81, 0x7C, 0x81, 0x7B, 0x7F, 0x7F, 0x7D, 0x81, 0x7E, 0x7F, 0x81, 0x7F, 0x7D, 0x81,
+    0x7B, 0x82, 0x7E, 0x81, 0x7F, 0x80, 0x7E, 0x81, 0x80, 0x7F, 0x80, 0x7D, 0x80, 0x7E, 0x78, 0x86,
+    0x78, 0x87, 0x7C, 0x83, 0x80, 0x81, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x7F, 0x80,
+    0x80, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x7F, 0x7F, 0x7D, 0x81, 0x7C, 0x81, 0x6E, 0x8C, 0x69, 0x8D, 0x70, 0x89, 0x7B, 0x82,
+    0x7E, 0x7F, 0x7D, 0x81, 0x82, 0x7A, 0x82, 0x7C, 0x7F, 0x7E, 0x80, 0x7F, 0x80, 0x7C, 0x82, 0x7C,
+    0x7E, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x71, 0x8C, 0x4B, 0xAE, 0x4A, 0xAE, 0x64, 0x94, 0x7A, 0x80,
+    0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x80,
+    0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F,
+    0x80, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7E,
+    0x77, 0x82, 0x54, 0xA8, 0x43, 0xB5, 0x53, 0xA5, 0x76, 0x83, 0x7F, 0x7D, 0x75, 0x84, 0x75, 0x83,
+    0x77, 0x83, 0x7E, 0x81, 0x80, 0x7F, 0x7A, 0x81, 0x78, 0x83, 0x74, 0x84, 0x7A, 0x80, 0x7E, 0x7C,
+    0x66, 0x94, 0x39, 0xBC, 0x34, 0xC0, 0x57, 0x9E, 0x79, 0x82, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x7F,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80,
+    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80,
+    0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x79, 0x82, 0x49, 0xB3, 0x39, 0xC2,
+    0x4A, 0xAD, 0x77, 0x83, 0x7D, 0x7F, 0x5C, 0x9C, 0x4F, 0xAD, 0x55, 0xA5, 0x6F, 0x89, 0x79, 0x81,
+    0x64, 0x96, 0x52, 0xA8, 0x52, 0xA3, 0x6B, 0x8C, 0x80, 0x7D, 0x70, 0x87, 0x4F, 0xAD, 0x47, 0xB1,
+    0x6A, 0x93, 0x7B, 0x82, 0x7E, 0x80, 0x7E, 0x7F, 0x80, 0x7E, 0x7F, 0x7F, 0x7D, 0x80, 0x7F, 0x7F,
+    0x7E, 0x80, 0x7F, 0x7F, 0x7D, 0x7F, 0x7E, 0x81, 0x7E, 0x80, 0x7D, 0x80, 0x7E, 0x81, 0x80, 0x81,
+    0x80, 0x80, 0x7F, 0x81, 0x7E, 0x82, 0x81, 0x7C, 0x80, 0x7D, 0x7D, 0x82, 0x7D, 0x81, 0x80, 0x80,
+    0x7F, 0x7F, 0x7E, 0x7F, 0x7A, 0x83, 0x48, 0xB2, 0x3A, 0xBF, 0x45, 0xB0, 0x78, 0x82, 0x7E, 0x7F,
+    0x55, 0xA1, 0x3B, 0xC0, 0x42, 0xB7, 0x6D, 0x88, 0x7F, 0x7A, 0x5C, 0x9E, 0x3D, 0xC0, 0x40, 0xB9,
+    0x64, 0x93, 0x80, 0x80, 0x7E, 0x80, 0x76, 0x86, 0x6C, 0x8B, 0x7F, 0x7E, 0x7D, 0x7E, 0x80, 0x7F,
+    0x7E, 0x7F, 0x7A, 0x81, 0x7B, 0x80, 0x7C, 0x7E, 0x80, 0x7E, 0x80, 0x7F, 0x80, 0x81, 0x7E, 0x80,
+    0x80, 0x7F, 0x7F, 0x7D, 0x79, 0x81, 0x7B, 0x81, 0x80, 0x80, 0x80, 0x7E, 0x80, 0x7D, 0x80, 0x7D,
+    0x7A, 0x81, 0x79, 0x7F, 0x7F, 0x7F, 0x81, 0x7E, 0x80, 0x7F, 0x7F, 0x80, 0x7E, 0x7F, 0x7A, 0x82,
+    0x48, 0xB2, 0x3A, 0xBE, 0x45, 0xB1, 0x74, 0x85, 0x6D, 0x8E, 0x4E, 0xAA, 0x3B, 0xC0, 0x3F, 0xBB,
+    0x5F, 0x96, 0x6B, 0x8E, 0x51, 0xA6, 0x3B, 0xBE, 0x3E, 0xB8, 0x58, 0x9D, 0x6B, 0x8E, 0x6A, 0x8E,
+    0x6C, 0x8C, 0x68, 0x90, 0x76, 0x84, 0x81, 0x7C, 0x7F, 0x81, 0x74, 0x87, 0x60, 0x98, 0x57, 0xA1,
+    0x5C, 0x9A, 0x66, 0x92, 0x67, 0x91, 0x65, 0x96, 0x63, 0x97, 0x69, 0x8D, 0x65, 0x90, 0x59, 0xA0,
+    0x5A, 0x9E, 0x65, 0x95, 0x75, 0x82, 0x6F, 0x88, 0x62, 0x96, 0x57, 0xA1, 0x5B, 0x9C, 0x6D, 0x8B,
+    0x7A, 0x81, 0x81, 0x7E, 0x7D, 0x80, 0x7F, 0x7F, 0x79, 0x83, 0x49, 0xB2, 0x3A, 0xC0, 0x48, 0xB0,
+    0x6D, 0x8B, 0x44, 0xB7, 0x3E, 0xBE, 0x3E, 0xBE, 0x3D, 0xBF, 0x3C, 0xBE, 0x3C, 0xBE, 0x3B, 0xBD,
+    0x3A, 0xBF, 0x3B, 0xC2, 0x3F, 0xBF, 0x3F, 0xBA, 0x3E, 0xBE, 0x3A, 0xBE, 0x3D, 0xBD, 0x5B, 0x9B,
+    0x7C, 0x82, 0x6E, 0x8A, 0x54, 0xA7, 0x3E, 0xBB, 0x34, 0xC3, 0x39, 0xBF, 0x3D, 0xB9, 0x3D, 0xB8,
+    0x3D, 0xB9, 0x3C, 0xBB, 0x43, 0xB2, 0x44, 0xB2, 0x3B, 0xBE, 0x38, 0xC0, 0x3D, 0xBB, 0x56, 0x9F,
+    0x49, 0xAD, 0x3B, 0xBA, 0x3D, 0xBD, 0x3B, 0xBD, 0x49, 0xAF, 0x6E, 0x89, 0x7F, 0x7E, 0x80, 0x7D,
+    0x7E, 0x7F, 0x79, 0x82, 0x49, 0xB2, 0x3A, 0xC0, 0x45, 0xB3, 0x6B, 0x8B, 0x47, 0xB3, 0x43, 0xBA,
+    0x3A, 0xC0, 0x3C, 0xC0, 0x40, 0xBB, 0x42, 0xB9, 0x3D, 0xBB, 0x39, 0xBF, 0x3D, 0xC0, 0x40, 0xBC,
+    0x40, 0xB6, 0x40, 0xBA, 0x3F, 0xBB, 0x3F, 0xBB, 0x54, 0x9E, 0x7A, 0x84, 0x56, 0xA2, 0x3B, 0xBE,
+    0x3B, 0xBE, 0x42, 0xB8, 0x48, 0xB0, 0x42, 0xB9, 0x3F, 0xBD, 0x3D, 0xBE, 0x3C, 0xBF, 0x45, 0xB3,
+    0x51, 0xA4, 0x4D, 0xAC, 0x41, 0xBA, 0x3B, 0xBF, 0x3B, 0xBC, 0x4B, 0xAA, 0x52, 0xA4, 0x43, 0xB8,
+    0x35, 0xC2, 0x45, 0xB5, 0x6D, 0x8D, 0x7E, 0x80, 0x7E, 0x81, 0x7E, 0x7E, 0x7A, 0x83, 0x49, 0xB1,
+    0x3A, 0xBE, 0x46, 0xB1, 0x77, 0x82, 0x76, 0x84, 0x4E, 0xAA, 0x3B, 0xC0, 0x42, 0xBA, 0x62, 0x8C,
+    0x77, 0x85, 0x56, 0x9F, 0x3A, 0xBF, 0x3E, 0xBC, 0x5F, 0x95, 0x75, 0x85, 0x5F, 0x97, 0x3C, 0xBD,
+    0x3B, 0xBD, 0x59, 0x9E, 0x6B, 0x8A, 0x46, 0xB4, 0x39, 0xC2, 0x46, 0xB4, 0x66, 0x8F, 0x75, 0x83,
+    0x6D, 0x89, 0x51, 0xA8, 0x3D, 0xC1, 0x3C, 0xC1, 0x58, 0xA1, 0x7A, 0x85, 0x6E, 0x8B, 0x44, 0xB5,
+    0x39, 0xC1, 0x43, 0xB5, 0x6F, 0x8A, 0x77, 0x81, 0x5B, 0x9C, 0x3B, 0xBF, 0x3C, 0xBC, 0x66, 0x8F,
+    0x7F, 0x80, 0x7F, 0x7E, 0x7E, 0x7F, 0x7A, 0x82, 0x48, 0xB1, 0x3A, 0xBF, 0x45, 0xB1, 0x7A, 0x82,
+    0x7B, 0x7F, 0x4E, 0xA9, 0x3C, 0xC0, 0x44, 0xB8, 0x68, 0x89, 0x7C, 0x83, 0x5C, 0x9D, 0x3B, 0xBF,
+    0x40, 0xBD, 0x63, 0x93, 0x7D, 0x81, 0x65, 0x94, 0x3B, 0xBE, 0x3E, 0xBA, 0x5B, 0x9C, 0x65, 0x8F,
+    0x3F, 0xB5, 0x3A, 0xBD, 0x4D, 0xAB, 0x72, 0x85, 0x81, 0x7C, 0x7C, 0x7E, 0x5A, 0x9F, 0x3C, 0xBF,
+    0x3D, 0xBE, 0x5C, 0x9C, 0x7E, 0x81, 0x71, 0x86, 0x46, 0xB2, 0x3B, 0xBE, 0x46, 0xB1, 0x75, 0x86,
+    0x7C, 0x80, 0x61, 0x98, 0x3C, 0xBE, 0x3C, 0xBC, 0x66, 0x91, 0x7F, 0x7F, 0x7F, 0x7E, 0x7E, 0x7F,
+    0x7A, 0x82, 0x49, 0xB1, 0x3B, 0xBF, 0x47, 0xB1, 0x7B, 0x81, 0x7B, 0x81, 0x50, 0xA9, 0x3D, 0xC0,
+    0x46, 0xB7, 0x69, 0x89, 0x7F, 0x81, 0x5D, 0x9C, 0x3E, 0xBD, 0x40, 0xBB, 0x64, 0x91, 0x7F, 0x80,
+    0x67, 0x93, 0x3D, 0xBD, 0x3D, 0xBA, 0x5B, 0x9C, 0x69, 0x8E, 0x43, 0xB7, 0x3D, 0xC1, 0x4B, 0xB0,
+    0x6D, 0x89, 0x7C, 0x7C, 0x78, 0x82, 0x58, 0xA3, 0x3D, 0xBE, 0x3E, 0xBC, 0x5E, 0x9A, 0x7F, 0x82,
+    0x73, 0x82, 0x48, 0xAF, 0x3E, 0xBF, 0x4B, 0xAE, 0x76, 0x86, 0x7E, 0x7E, 0x63, 0x98, 0x3D, 0xBC,
+    0x3C, 0xBD, 0x67, 0x90, 0x7F, 0x7F, 0x7F, 0x7F, 0x7E, 0x7F, 0x79, 0x82, 0x48, 0xB1, 0x3A, 0xBE,
+    0x46, 0xB2, 0x7B, 0x81, 0x7C, 0x80, 0x51, 0xA9, 0x3E, 0xBF, 0x45, 0xB7, 0x68, 0x89, 0x7D, 0x82,
+    0x5D, 0x9C, 0x3B, 0xBC, 0x3F, 0xBA, 0x64, 0x90, 0x7E, 0x80, 0x68, 0x92, 0x3E, 0xBA, 0x38, 0xC0,
+    0x59, 0x9F, 0x74, 0x84, 0x51, 0xA8, 0x3D, 0xC0, 0x3F, 0xBD, 0x50, 0xA5, 0x5A, 0x9C, 0x53, 0xA3,
+    0x43, 0xB5, 0x3C, 0xBF, 0x3F, 0xBB, 0x5D, 0x9C, 0x7F, 0x81, 0x72, 0x84, 0x48, 0xAE, 0x3C, 0xC0,
+    0x4A, 0xB0, 0x77, 0x85, 0x7E, 0x7F, 0x62, 0x97, 0x3E, 0xBD, 0x3C, 0xBC, 0x67, 0x91, 0x80, 0x7F,
+    0x80, 0x7F, 0x7F, 0x7E, 0x79, 0x82, 0x4A, 0xB3, 0x38, 0xC1, 0x49, 0xB1, 0x79, 0x84, 0x7C, 0x7E,
+    0x51, 0xA7, 0x3A, 0xC0, 0x40, 0xB7, 0x67, 0x8B, 0x7B, 0x80, 0x5A, 0x9E, 0x39, 0xBF, 0x3C, 0xBA,
+    0x63, 0x93, 0x7E, 0x7E, 0x66, 0x93, 0x3D, 0xBB, 0x37, 0xC2, 0x5A, 0x9F, 0x7B, 0x80, 0x65, 0x92,
+    0x47, 0xB5, 0x38, 0xC2, 0x39, 0xC0, 0x3F, 0xBB, 0x3B, 0xBD, 0x3B, 0xBB, 0x3B, 0xBE, 0x3D, 0xBE,
+    0x5A, 0x9D, 0x7E, 0x82, 0x71, 0x85, 0x46, 0xAC, 0x3B, 0xC1, 0x47, 0xAF, 0x74, 0x87, 0x7E, 0x80,
+    0x5E, 0x9A, 0x3A, 0xBC, 0x3E, 0xB7, 0x68, 0x90, 0x7F, 0x80, 0x7E, 0x80, 0x80, 0x7E, 0x7A, 0x82,
+    0x58, 0xA6, 0x4A, 0xB1, 0x5A, 0xA4, 0x79, 0x83, 0x7E, 0x7E, 0x5D, 0x9C, 0x49, 0xAF, 0x4F, 0xA8,
+    0x6E, 0x89, 0x7C, 0x7F, 0x64, 0x98, 0x4D, 0xAF, 0x4F, 0xAB, 0x6B, 0x8F, 0x80, 0x7D, 0x6E, 0x8E,
+    0x4E, 0xAC, 0x4A, 0xB1, 0x66, 0x95, 0x7C, 0x81, 0x7A, 0x80, 0x6B, 0x8C, 0x54, 0xA6, 0x48, 0xB5,
+    0x4A, 0xB6, 0x57, 0xA2, 0x5D, 0x9A, 0x57, 0xA3, 0x58, 0xA2, 0x6C, 0x8D, 0x80, 0x7F, 0x78, 0x84,
+    0x5A, 0x96, 0x55, 0xA8, 0x5D, 0x9B, 0x79, 0x83, 0x81, 0x7C, 0x6A, 0x90, 0x53, 0xA5, 0x59, 0x9F,
+    0x72, 0x8A, 0x7E, 0x80, 0x7F, 0x7E, 0x80, 0x7F, 0x7E, 0x80, 0x73, 0x88, 0x6F, 0x89, 0x76, 0x85,
+    0x80, 0x7E, 0x7F, 0x7D, 0x75, 0x85, 0x72, 0x88, 0x70, 0x88, 0x7A, 0x81, 0x7F, 0x7F, 0x78, 0x86,
+    0x72, 0x89, 0x72, 0x88, 0x77, 0x84, 0x80, 0x7D, 0x7D, 0x82, 0x75, 0x86, 0x6F, 0x88, 0x79, 0x84,
+    0x7F, 0x7E, 0x80, 0x7F, 0x80, 0x80, 0x79, 0x82, 0x72, 0x88, 0x6D, 0x88, 0x79, 0x83, 0x7E, 0x7E,
+    0x7C, 0x80, 0x7A, 0x80, 0x7D, 0x7F, 0x81, 0x7D, 0x80, 0x7E, 0x77, 0x81, 0x78, 0x81, 0x7D, 0x82,
+    0x7F, 0x7F, 0x7F, 0x7D, 0x7D, 0x81, 0x7C, 0x82, 0x7B, 0x81, 0x7C, 0x80, 0x7E, 0x7D, 0x81, 0x7F,
+    0x7F, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x7D, 0x81, 0x7C, 0x81, 0x7E, 0x81, 0x7E, 0x81, 0x7C,
+    0x80, 0x7B, 0x80, 0x7F, 0x81, 0x7E, 0x81, 0x7F, 0x81, 0x80, 0x81, 0x7D, 0x7E, 0x7C, 0x80, 0x7F,
+    0x80, 0x80, 0x81, 0x7F, 0x81, 0x7E, 0x80, 0x7E, 0x7F, 0x7F, 0x80, 0x7E, 0x7C, 0x80, 0x7B, 0x80,
+    0x7F, 0x7E, 0x80, 0x7D, 0x81, 0x81, 0x7E, 0x81, 0x81, 0x7C, 0x7F, 0x81, 0x7D, 0x81, 0x7E, 0x7F,
+    0x7C, 0x81, 0x7F, 0x81, 0x80, 0x7F, 0x81, 0x7D, 0x7F, 0x7F, 0x7E, 0x81, 0x7C, 0x81, 0x7F, 0x80,
+    0x81, 0x7F, 0x80, 0x7F, 0x7C, 0x81, 0x7B, 0x82, 0x7D, 0x81, 0x7E, 0x7F, 0x7E, 0x81, 0x7F, 0x7F,
+    0x80, 0x7E, 0x7E, 0x81, 0x7E, 0x82, 0x7F, 0x7F, 0x81, 0x7E, 0x7E, 0x81, 0x80, 0x7F, 0x7F, 0x7F,
+    0x7F, 0x80, 0x7D, 0x80, 0x7C, 0x81, 0x7F, 0x7D, 0x82, 0x7C, 0x7E, 0x80, 0x7E, 0x81, 0x7E, 0x80,
+    0x80, 0x7D, 0x7C, 0x80, 0x7C, 0x80, 0x7F, 0x7C, 0x80, 0x7F, 0x7D, 0x81, 0x7D, 0x80, 0x7F, 0x7F,
+    0x7F, 0x7F, 0x82, 0x7B, 0x7F, 0x7F, 0x7E, 0x7F, 0x82, 0x7C, 0x7E, 0x7D, 0x7F, 0x81, 0x80, 0x81,
+    0x7F, 0x7D, 0x7D, 0x7D, 0x80, 0x80, 0x82, 0x81, 0x80, 0x7E, 0x7C, 0x7E, 0x7E, 0x7F, 0x7F, 0x81,
+    0x7F, 0x7F, 0x81, 0x7B
+};
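+
+/* Logo chroma for 4:2:0 semi-planar output: Cr/Cb bytes interleaved in VU (NV21-like) order. */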
+const UWORD8 gau1_ihevcd_logo_420sp_vu[] =
+{
+    0x7E, 0x7F, 0x80, 0x7D, 0x7D, 0x7F, 0x7E, 0x80, 0x80, 0x7D, 0x81, 0x7E, 0x7E, 0x7D, 0x7C, 0x82,
+    0x80, 0x80, 0x81, 0x81, 0x7E, 0x7F, 0x7D, 0x80, 0x80, 0x80, 0x81, 0x7F, 0x7C, 0x80, 0x81, 0x7E,
+    0x81, 0x7B, 0x7F, 0x7D, 0x81, 0x7D, 0x7E, 0x81, 0x80, 0x7E, 0x7F, 0x7D, 0x7C, 0x80, 0x7F, 0x80,
+    0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x80,
+    0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x7F,
+    0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x81, 0x80, 0x81, 0x80, 0x80,
+    0x80, 0x81, 0x7C, 0x81, 0x7B, 0x81, 0x7F, 0x7F, 0x81, 0x7D, 0x7F, 0x7E, 0x7F, 0x81, 0x81, 0x7D,
+    0x82, 0x7B, 0x81, 0x7E, 0x80, 0x7F, 0x81, 0x7E, 0x7F, 0x80, 0x7D, 0x80, 0x7E, 0x80, 0x86, 0x78,
+    0x87, 0x78, 0x83, 0x7C, 0x81, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7F, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x7F,
+    0x80, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x7F, 0x7F, 0x81, 0x7D, 0x81, 0x7C, 0x8C, 0x6E, 0x8D, 0x69, 0x89, 0x70, 0x82, 0x7B,
+    0x7F, 0x7E, 0x81, 0x7D, 0x7A, 0x82, 0x7C, 0x82, 0x7E, 0x7F, 0x7F, 0x80, 0x7C, 0x80, 0x7C, 0x82,
+    0x80, 0x7E, 0x7F, 0x80, 0x7F, 0x80, 0x8C, 0x71, 0xAE, 0x4B, 0xAE, 0x4A, 0x94, 0x64, 0x80, 0x7A,
+    0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x80,
+    0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x7F, 0x7F, 0x80,
+    0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x7E, 0x7F,
+    0x82, 0x77, 0xA8, 0x54, 0xB5, 0x43, 0xA5, 0x53, 0x83, 0x76, 0x7D, 0x7F, 0x84, 0x75, 0x83, 0x75,
+    0x83, 0x77, 0x81, 0x7E, 0x7F, 0x80, 0x81, 0x7A, 0x83, 0x78, 0x84, 0x74, 0x80, 0x7A, 0x7C, 0x7E,
+    0x94, 0x66, 0xBC, 0x39, 0xC0, 0x34, 0x9E, 0x57, 0x82, 0x79, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F,
+    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x7F,
+    0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x7F, 0x82, 0x79, 0xB3, 0x49, 0xC2, 0x39,
+    0xAD, 0x4A, 0x83, 0x77, 0x7F, 0x7D, 0x9C, 0x5C, 0xAD, 0x4F, 0xA5, 0x55, 0x89, 0x6F, 0x81, 0x79,
+    0x96, 0x64, 0xA8, 0x52, 0xA3, 0x52, 0x8C, 0x6B, 0x7D, 0x80, 0x87, 0x70, 0xAD, 0x4F, 0xB1, 0x47,
+    0x93, 0x6A, 0x82, 0x7B, 0x80, 0x7E, 0x7F, 0x7E, 0x7E, 0x80, 0x7F, 0x7F, 0x80, 0x7D, 0x7F, 0x7F,
+    0x80, 0x7E, 0x7F, 0x7F, 0x7F, 0x7D, 0x81, 0x7E, 0x80, 0x7E, 0x80, 0x7D, 0x81, 0x7E, 0x81, 0x80,
+    0x80, 0x80, 0x81, 0x7F, 0x82, 0x7E, 0x7C, 0x81, 0x7D, 0x80, 0x82, 0x7D, 0x81, 0x7D, 0x80, 0x80,
+    0x7F, 0x7F, 0x7F, 0x7E, 0x83, 0x7A, 0xB2, 0x48, 0xBF, 0x3A, 0xB0, 0x45, 0x82, 0x78, 0x7F, 0x7E,
+    0xA1, 0x55, 0xC0, 0x3B, 0xB7, 0x42, 0x88, 0x6D, 0x7A, 0x7F, 0x9E, 0x5C, 0xC0, 0x3D, 0xB9, 0x40,
+    0x93, 0x64, 0x80, 0x80, 0x80, 0x7E, 0x86, 0x76, 0x8B, 0x6C, 0x7E, 0x7F, 0x7E, 0x7D, 0x7F, 0x80,
+    0x7F, 0x7E, 0x81, 0x7A, 0x80, 0x7B, 0x7E, 0x7C, 0x7E, 0x80, 0x7F, 0x80, 0x81, 0x80, 0x80, 0x7E,
+    0x7F, 0x80, 0x7D, 0x7F, 0x81, 0x79, 0x81, 0x7B, 0x80, 0x80, 0x7E, 0x80, 0x7D, 0x80, 0x7D, 0x80,
+    0x81, 0x7A, 0x7F, 0x79, 0x7F, 0x7F, 0x7E, 0x81, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x7E, 0x82, 0x7A,
+    0xB2, 0x48, 0xBE, 0x3A, 0xB1, 0x45, 0x85, 0x74, 0x8E, 0x6D, 0xAA, 0x4E, 0xC0, 0x3B, 0xBB, 0x3F,
+    0x96, 0x5F, 0x8E, 0x6B, 0xA6, 0x51, 0xBE, 0x3B, 0xB8, 0x3E, 0x9D, 0x58, 0x8E, 0x6B, 0x8E, 0x6A,
+    0x8C, 0x6C, 0x90, 0x68, 0x84, 0x76, 0x7C, 0x81, 0x81, 0x7F, 0x87, 0x74, 0x98, 0x60, 0xA1, 0x57,
+    0x9A, 0x5C, 0x92, 0x66, 0x91, 0x67, 0x96, 0x65, 0x97, 0x63, 0x8D, 0x69, 0x90, 0x65, 0xA0, 0x59,
+    0x9E, 0x5A, 0x95, 0x65, 0x82, 0x75, 0x88, 0x6F, 0x96, 0x62, 0xA1, 0x57, 0x9C, 0x5B, 0x8B, 0x6D,
+    0x81, 0x7A, 0x7E, 0x81, 0x80, 0x7D, 0x7F, 0x7F, 0x83, 0x79, 0xB2, 0x49, 0xC0, 0x3A, 0xB0, 0x48,
+    0x8B, 0x6D, 0xB7, 0x44, 0xBE, 0x3E, 0xBE, 0x3E, 0xBF, 0x3D, 0xBE, 0x3C, 0xBE, 0x3C, 0xBD, 0x3B,
+    0xBF, 0x3A, 0xC2, 0x3B, 0xBF, 0x3F, 0xBA, 0x3F, 0xBE, 0x3E, 0xBE, 0x3A, 0xBD, 0x3D, 0x9B, 0x5B,
+    0x82, 0x7C, 0x8A, 0x6E, 0xA7, 0x54, 0xBB, 0x3E, 0xC3, 0x34, 0xBF, 0x39, 0xB9, 0x3D, 0xB8, 0x3D,
+    0xB9, 0x3D, 0xBB, 0x3C, 0xB2, 0x43, 0xB2, 0x44, 0xBE, 0x3B, 0xC0, 0x38, 0xBB, 0x3D, 0x9F, 0x56,
+    0xAD, 0x49, 0xBA, 0x3B, 0xBD, 0x3D, 0xBD, 0x3B, 0xAF, 0x49, 0x89, 0x6E, 0x7E, 0x7F, 0x7D, 0x80,
+    0x7F, 0x7E, 0x82, 0x79, 0xB2, 0x49, 0xC0, 0x3A, 0xB3, 0x45, 0x8B, 0x6B, 0xB3, 0x47, 0xBA, 0x43,
+    0xC0, 0x3A, 0xC0, 0x3C, 0xBB, 0x40, 0xB9, 0x42, 0xBB, 0x3D, 0xBF, 0x39, 0xC0, 0x3D, 0xBC, 0x40,
+    0xB6, 0x40, 0xBA, 0x40, 0xBB, 0x3F, 0xBB, 0x3F, 0x9E, 0x54, 0x84, 0x7A, 0xA2, 0x56, 0xBE, 0x3B,
+    0xBE, 0x3B, 0xB8, 0x42, 0xB0, 0x48, 0xB9, 0x42, 0xBD, 0x3F, 0xBE, 0x3D, 0xBF, 0x3C, 0xB3, 0x45,
+    0xA4, 0x51, 0xAC, 0x4D, 0xBA, 0x41, 0xBF, 0x3B, 0xBC, 0x3B, 0xAA, 0x4B, 0xA4, 0x52, 0xB8, 0x43,
+    0xC2, 0x35, 0xB5, 0x45, 0x8D, 0x6D, 0x80, 0x7E, 0x81, 0x7E, 0x7E, 0x7E, 0x83, 0x7A, 0xB1, 0x49,
+    0xBE, 0x3A, 0xB1, 0x46, 0x82, 0x77, 0x84, 0x76, 0xAA, 0x4E, 0xC0, 0x3B, 0xBA, 0x42, 0x8C, 0x62,
+    0x85, 0x77, 0x9F, 0x56, 0xBF, 0x3A, 0xBC, 0x3E, 0x95, 0x5F, 0x85, 0x75, 0x97, 0x5F, 0xBD, 0x3C,
+    0xBD, 0x3B, 0x9E, 0x59, 0x8A, 0x6B, 0xB4, 0x46, 0xC2, 0x39, 0xB4, 0x46, 0x8F, 0x66, 0x83, 0x75,
+    0x89, 0x6D, 0xA8, 0x51, 0xC1, 0x3D, 0xC1, 0x3C, 0xA1, 0x58, 0x85, 0x7A, 0x8B, 0x6E, 0xB5, 0x44,
+    0xC1, 0x39, 0xB5, 0x43, 0x8A, 0x6F, 0x81, 0x77, 0x9C, 0x5B, 0xBF, 0x3B, 0xBC, 0x3C, 0x8F, 0x66,
+    0x80, 0x7F, 0x7E, 0x7F, 0x7F, 0x7E, 0x82, 0x7A, 0xB1, 0x48, 0xBF, 0x3A, 0xB1, 0x45, 0x82, 0x7A,
+    0x7F, 0x7B, 0xA9, 0x4E, 0xC0, 0x3C, 0xB8, 0x44, 0x89, 0x68, 0x83, 0x7C, 0x9D, 0x5C, 0xBF, 0x3B,
+    0xBD, 0x40, 0x93, 0x63, 0x81, 0x7D, 0x94, 0x65, 0xBE, 0x3B, 0xBA, 0x3E, 0x9C, 0x5B, 0x8F, 0x65,
+    0xB5, 0x3F, 0xBD, 0x3A, 0xAB, 0x4D, 0x85, 0x72, 0x7C, 0x81, 0x7E, 0x7C, 0x9F, 0x5A, 0xBF, 0x3C,
+    0xBE, 0x3D, 0x9C, 0x5C, 0x81, 0x7E, 0x86, 0x71, 0xB2, 0x46, 0xBE, 0x3B, 0xB1, 0x46, 0x86, 0x75,
+    0x80, 0x7C, 0x98, 0x61, 0xBE, 0x3C, 0xBC, 0x3C, 0x91, 0x66, 0x7F, 0x7F, 0x7E, 0x7F, 0x7F, 0x7E,
+    0x82, 0x7A, 0xB1, 0x49, 0xBF, 0x3B, 0xB1, 0x47, 0x81, 0x7B, 0x81, 0x7B, 0xA9, 0x50, 0xC0, 0x3D,
+    0xB7, 0x46, 0x89, 0x69, 0x81, 0x7F, 0x9C, 0x5D, 0xBD, 0x3E, 0xBB, 0x40, 0x91, 0x64, 0x80, 0x7F,
+    0x93, 0x67, 0xBD, 0x3D, 0xBA, 0x3D, 0x9C, 0x5B, 0x8E, 0x69, 0xB7, 0x43, 0xC1, 0x3D, 0xB0, 0x4B,
+    0x89, 0x6D, 0x7C, 0x7C, 0x82, 0x78, 0xA3, 0x58, 0xBE, 0x3D, 0xBC, 0x3E, 0x9A, 0x5E, 0x82, 0x7F,
+    0x82, 0x73, 0xAF, 0x48, 0xBF, 0x3E, 0xAE, 0x4B, 0x86, 0x76, 0x7E, 0x7E, 0x98, 0x63, 0xBC, 0x3D,
+    0xBD, 0x3C, 0x90, 0x67, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7E, 0x82, 0x79, 0xB1, 0x48, 0xBE, 0x3A,
+    0xB2, 0x46, 0x81, 0x7B, 0x80, 0x7C, 0xA9, 0x51, 0xBF, 0x3E, 0xB7, 0x45, 0x89, 0x68, 0x82, 0x7D,
+    0x9C, 0x5D, 0xBC, 0x3B, 0xBA, 0x3F, 0x90, 0x64, 0x80, 0x7E, 0x92, 0x68, 0xBA, 0x3E, 0xC0, 0x38,
+    0x9F, 0x59, 0x84, 0x74, 0xA8, 0x51, 0xC0, 0x3D, 0xBD, 0x3F, 0xA5, 0x50, 0x9C, 0x5A, 0xA3, 0x53,
+    0xB5, 0x43, 0xBF, 0x3C, 0xBB, 0x3F, 0x9C, 0x5D, 0x81, 0x7F, 0x84, 0x72, 0xAE, 0x48, 0xC0, 0x3C,
+    0xB0, 0x4A, 0x85, 0x77, 0x7F, 0x7E, 0x97, 0x62, 0xBD, 0x3E, 0xBC, 0x3C, 0x91, 0x67, 0x7F, 0x80,
+    0x7F, 0x80, 0x7E, 0x7F, 0x82, 0x79, 0xB3, 0x4A, 0xC1, 0x38, 0xB1, 0x49, 0x84, 0x79, 0x7E, 0x7C,
+    0xA7, 0x51, 0xC0, 0x3A, 0xB7, 0x40, 0x8B, 0x67, 0x80, 0x7B, 0x9E, 0x5A, 0xBF, 0x39, 0xBA, 0x3C,
+    0x93, 0x63, 0x7E, 0x7E, 0x93, 0x66, 0xBB, 0x3D, 0xC2, 0x37, 0x9F, 0x5A, 0x80, 0x7B, 0x92, 0x65,
+    0xB5, 0x47, 0xC2, 0x38, 0xC0, 0x39, 0xBB, 0x3F, 0xBD, 0x3B, 0xBB, 0x3B, 0xBE, 0x3B, 0xBE, 0x3D,
+    0x9D, 0x5A, 0x82, 0x7E, 0x85, 0x71, 0xAC, 0x46, 0xC1, 0x3B, 0xAF, 0x47, 0x87, 0x74, 0x80, 0x7E,
+    0x9A, 0x5E, 0xBC, 0x3A, 0xB7, 0x3E, 0x90, 0x68, 0x80, 0x7F, 0x80, 0x7E, 0x7E, 0x80, 0x82, 0x7A,
+    0xA6, 0x58, 0xB1, 0x4A, 0xA4, 0x5A, 0x83, 0x79, 0x7E, 0x7E, 0x9C, 0x5D, 0xAF, 0x49, 0xA8, 0x4F,
+    0x89, 0x6E, 0x7F, 0x7C, 0x98, 0x64, 0xAF, 0x4D, 0xAB, 0x4F, 0x8F, 0x6B, 0x7D, 0x80, 0x8E, 0x6E,
+    0xAC, 0x4E, 0xB1, 0x4A, 0x95, 0x66, 0x81, 0x7C, 0x80, 0x7A, 0x8C, 0x6B, 0xA6, 0x54, 0xB5, 0x48,
+    0xB6, 0x4A, 0xA2, 0x57, 0x9A, 0x5D, 0xA3, 0x57, 0xA2, 0x58, 0x8D, 0x6C, 0x7F, 0x80, 0x84, 0x78,
+    0x96, 0x5A, 0xA8, 0x55, 0x9B, 0x5D, 0x83, 0x79, 0x7C, 0x81, 0x90, 0x6A, 0xA5, 0x53, 0x9F, 0x59,
+    0x8A, 0x72, 0x80, 0x7E, 0x7E, 0x7F, 0x7F, 0x80, 0x80, 0x7E, 0x88, 0x73, 0x89, 0x6F, 0x85, 0x76,
+    0x7E, 0x80, 0x7D, 0x7F, 0x85, 0x75, 0x88, 0x72, 0x88, 0x70, 0x81, 0x7A, 0x7F, 0x7F, 0x86, 0x78,
+    0x89, 0x72, 0x88, 0x72, 0x84, 0x77, 0x7D, 0x80, 0x82, 0x7D, 0x86, 0x75, 0x88, 0x6F, 0x84, 0x79,
+    0x7E, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x82, 0x79, 0x88, 0x72, 0x88, 0x6D, 0x83, 0x79, 0x7E, 0x7E,
+    0x80, 0x7C, 0x80, 0x7A, 0x7F, 0x7D, 0x7D, 0x81, 0x7E, 0x80, 0x81, 0x77, 0x81, 0x78, 0x82, 0x7D,
+    0x7F, 0x7F, 0x7D, 0x7F, 0x81, 0x7D, 0x82, 0x7C, 0x81, 0x7B, 0x80, 0x7C, 0x7D, 0x7E, 0x7F, 0x81,
+    0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x7D, 0x80, 0x7C, 0x81, 0x7E, 0x81, 0x7E, 0x81, 0x7C, 0x81,
+    0x7B, 0x80, 0x7F, 0x80, 0x7E, 0x81, 0x7F, 0x81, 0x80, 0x81, 0x7D, 0x81, 0x7C, 0x7E, 0x7F, 0x80,
+    0x80, 0x80, 0x7F, 0x81, 0x7E, 0x81, 0x7E, 0x80, 0x7F, 0x7F, 0x7E, 0x80, 0x80, 0x7C, 0x80, 0x7B,
+    0x7E, 0x7F, 0x7D, 0x80, 0x81, 0x81, 0x81, 0x7E, 0x7C, 0x81, 0x81, 0x7F, 0x81, 0x7D, 0x7F, 0x7E,
+    0x81, 0x7C, 0x81, 0x7F, 0x7F, 0x80, 0x7D, 0x81, 0x7F, 0x7F, 0x81, 0x7E, 0x81, 0x7C, 0x80, 0x7F,
+    0x7F, 0x81, 0x7F, 0x80, 0x81, 0x7C, 0x82, 0x7B, 0x81, 0x7D, 0x7F, 0x7E, 0x81, 0x7E, 0x7F, 0x7F,
+    0x7E, 0x80, 0x81, 0x7E, 0x82, 0x7E, 0x7F, 0x7F, 0x7E, 0x81, 0x81, 0x7E, 0x7F, 0x80, 0x7F, 0x7F,
+    0x80, 0x7F, 0x80, 0x7D, 0x81, 0x7C, 0x7D, 0x7F, 0x7C, 0x82, 0x80, 0x7E, 0x81, 0x7E, 0x80, 0x7E,
+    0x7D, 0x80, 0x80, 0x7C, 0x80, 0x7C, 0x7C, 0x7F, 0x7F, 0x80, 0x81, 0x7D, 0x80, 0x7D, 0x7F, 0x7F,
+    0x7F, 0x7F, 0x7B, 0x82, 0x7F, 0x7F, 0x7F, 0x7E, 0x7C, 0x82, 0x7D, 0x7E, 0x81, 0x7F, 0x81, 0x80,
+    0x7D, 0x7F, 0x7D, 0x7D, 0x80, 0x80, 0x81, 0x82, 0x7E, 0x80, 0x7E, 0x7C, 0x7F, 0x7E, 0x81, 0x7F,
+    0x7F, 0x7F, 0x7B, 0x81
+};
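+
+/*
+ * Note (illustrative only, not decoder code): the two semi-planar tables
+ * above carry the same chroma samples as the gau1_ihevcd_logo_420p_u/_v
+ * planar tables, byte-interleaved. A minimal sketch of that mapping for
+ * UV (NV12-like) order follows; the helper name and 'n_samples' parameter
+ * are hypothetical, and UWORD8/WORD32 come from the project's typedefs.
+ *
+ * static void interleave_logo_uv(UWORD8 *pu1_uv, const UWORD8 *pu1_u,
+ *                                const UWORD8 *pu1_v, WORD32 n_samples)
+ * {
+ *     WORD32 i;
+ *     for(i = 0; i < n_samples; i++)
+ *     {
+ *         pu1_uv[2 * i]     = pu1_u[i]; // Cb (U) byte first in UV order
+ *         pu1_uv[2 * i + 1] = pu1_v[i]; // Cr (V) byte second
+ *     }
+ * }
+ */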
+
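+/* U (Cb) plane of a second planar 4:2:0 logo table set, distinct from gau1_ihevcd_logo_420p_*. */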
+const UWORD8 gau1_ihevcd_logo_420_u[] =
+{
+    0x80, 0x7c, 0x7a, 0x7c, 0x80, 0x81, 0x7d, 0x78,
+    0x7c, 0x79, 0x79, 0x7d, 0x83, 0x86, 0x84, 0x80,
+    0x7f, 0x81, 0x82, 0x84, 0x84, 0x82, 0x81, 0x7f,
+    0x81, 0x83, 0x83, 0x7f, 0x78, 0x76, 0x79, 0x7d,
+    0x76, 0x7b, 0x80, 0x81, 0x7d, 0x7b, 0x7d, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x82, 0x80, 0x7f, 0x80, 0x84, 0x85, 0x83, 0x80,
+    0x84, 0x81, 0x7e, 0x7e, 0x80, 0x81, 0x80, 0x7e,
+    0x7f, 0x7f, 0x7f, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f,
+    0x7c, 0x7f, 0x80, 0x7f, 0x7c, 0x7c, 0x80, 0x83,
+    0x7e, 0x82, 0x84, 0x83, 0x80, 0x7e, 0x7f, 0x81,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x83, 0x83, 0x83, 0x83, 0x84, 0x85, 0x86, 0x86,
+    0x8a, 0x87, 0x82, 0x7e, 0x7d, 0x7d, 0x7e, 0x7f,
+    0x81, 0x7f, 0x7c, 0x7a, 0x7a, 0x7c, 0x7f, 0x81,
+    0x7a, 0x7c, 0x7e, 0x80, 0x82, 0x83, 0x84, 0x85,
+    0x7d, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x80, 0x81,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7f, 0x81, 0x81, 0x7f, 0x7b, 0x7a, 0x7d, 0x80,
+    0x82, 0x83, 0x81, 0x7e, 0x7b, 0x7c, 0x82, 0x86,
+    0x85, 0x82, 0x7e, 0x7b, 0x7b, 0x7e, 0x82, 0x85,
+    0x80, 0x7f, 0x7f, 0x81, 0x84, 0x84, 0x80, 0x7b,
+    0x66, 0x64, 0x64, 0x69, 0x73, 0x7b, 0x80, 0x81,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7b, 0x7d, 0x7d, 0x74, 0x69, 0x64, 0x68, 0x6d,
+    0x70, 0x77, 0x7e, 0x7f, 0x7c, 0x7d, 0x84, 0x8c,
+    0x85, 0x84, 0x81, 0x80, 0x80, 0x81, 0x84, 0x85,
+    0x86, 0x82, 0x7f, 0x81, 0x84, 0x80, 0x75, 0x6b,
+    0x45, 0x41, 0x42, 0x4e, 0x62, 0x74, 0x7d, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7b, 0x7e, 0x7a, 0x6b, 0x57, 0x4c, 0x4f, 0x56,
+    0x60, 0x6e, 0x7d, 0x81, 0x7c, 0x78, 0x7d, 0x84,
+    0x7e, 0x7f, 0x82, 0x83, 0x83, 0x82, 0x7f, 0x7e,
+    0x84, 0x7d, 0x7a, 0x7e, 0x83, 0x7f, 0x6f, 0x60,
+    0x34, 0x2d, 0x2d, 0x3d, 0x58, 0x71, 0x7c, 0x7e,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x83, 0x7d, 0x67, 0x4b, 0x3a, 0x3c, 0x44,
+    0x5a, 0x6e, 0x82, 0x86, 0x7b, 0x6f, 0x6c, 0x70,
+    0x6f, 0x75, 0x7c, 0x82, 0x82, 0x7c, 0x75, 0x6f,
+    0x76, 0x70, 0x6e, 0x77, 0x83, 0x82, 0x71, 0x60,
+    0x3b, 0x31, 0x2c, 0x3b, 0x59, 0x72, 0x7d, 0x7d,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x86, 0x89, 0x81, 0x67, 0x46, 0x32, 0x32, 0x3a,
+    0x5a, 0x71, 0x88, 0x8a, 0x7a, 0x66, 0x5e, 0x5f,
+    0x63, 0x6b, 0x77, 0x7f, 0x7f, 0x77, 0x6b, 0x63,
+    0x69, 0x63, 0x64, 0x72, 0x83, 0x86, 0x76, 0x65,
+    0x49, 0x3c, 0x35, 0x41, 0x5d, 0x75, 0x7e, 0x7d,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x52, 0x77, 0x84, 0x7b, 0x69, 0x41, 0x31, 0x4a,
+    0x38, 0x4a, 0x62, 0x75, 0x76, 0x67, 0x51, 0x41,
+    0x39, 0x39, 0x43, 0x5d, 0x7a, 0x86, 0x7d, 0x6f,
+    0x5f, 0x49, 0x49, 0x67, 0x7b, 0x77, 0x79, 0x86,
+    0x7a, 0x7c, 0x7e, 0x80, 0x80, 0x7e, 0x7c, 0x7a,
+    0x7f, 0x7f, 0x7e, 0x7e, 0x7d, 0x7d, 0x7c, 0x7c,
+    0x7e, 0x7e, 0x7d, 0x7d, 0x7d, 0x7e, 0x7f, 0x7f,
+    0x80, 0x7e, 0x7b, 0x7a, 0x7c, 0x80, 0x85, 0x88,
+    0x7c, 0x7c, 0x7c, 0x7c, 0x7e, 0x7f, 0x81, 0x82,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x4b, 0x76, 0x8a, 0x85, 0x74, 0x4a, 0x32, 0x42,
+    0x35, 0x4c, 0x6c, 0x83, 0x85, 0x70, 0x52, 0x3d,
+    0x39, 0x3a, 0x46, 0x61, 0x7e, 0x8c, 0x86, 0x7b,
+    0x7c, 0x66, 0x61, 0x76, 0x83, 0x7d, 0x7a, 0x83,
+    0x80, 0x7f, 0x7f, 0x7e, 0x7e, 0x7e, 0x7f, 0x80,
+    0x84, 0x83, 0x83, 0x82, 0x82, 0x81, 0x81, 0x80,
+    0x85, 0x83, 0x7f, 0x7d, 0x7d, 0x7f, 0x82, 0x85,
+    0x7f, 0x81, 0x84, 0x85, 0x84, 0x81, 0x7c, 0x79,
+    0x81, 0x81, 0x81, 0x81, 0x80, 0x7f, 0x7f, 0x7e,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x45, 0x76, 0x8c, 0x85, 0x77, 0x51, 0x33, 0x3a,
+    0x34, 0x4c, 0x6f, 0x88, 0x89, 0x72, 0x51, 0x3a,
+    0x39, 0x3c, 0x48, 0x61, 0x7b, 0x89, 0x86, 0x7e,
+    0x90, 0x7c, 0x73, 0x7e, 0x87, 0x82, 0x7d, 0x7f,
+    0x86, 0x81, 0x7a, 0x74, 0x71, 0x73, 0x77, 0x7a,
+    0x7f, 0x7f, 0x7e, 0x7e, 0x7d, 0x7c, 0x7c, 0x7c,
+    0x84, 0x7f, 0x78, 0x72, 0x72, 0x76, 0x7c, 0x81,
+    0x7d, 0x81, 0x85, 0x86, 0x81, 0x77, 0x6b, 0x64,
+    0x7d, 0x7f, 0x81, 0x82, 0x82, 0x80, 0x7d, 0x7b,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x48, 0x76, 0x80, 0x6e, 0x63, 0x4c, 0x36, 0x3a,
+    0x36, 0x47, 0x5f, 0x70, 0x71, 0x61, 0x4b, 0x3a,
+    0x39, 0x3b, 0x45, 0x56, 0x69, 0x72, 0x70, 0x6a,
+    0x7b, 0x6e, 0x67, 0x6f, 0x7c, 0x82, 0x80, 0x7e,
+    0x84, 0x7b, 0x6c, 0x5f, 0x58, 0x58, 0x5c, 0x60,
+    0x68, 0x68, 0x68, 0x67, 0x66, 0x66, 0x65, 0x65,
+    0x71, 0x6a, 0x61, 0x59, 0x58, 0x5d, 0x64, 0x6a,
+    0x79, 0x77, 0x73, 0x6d, 0x65, 0x5c, 0x55, 0x51,
+    0x66, 0x6c, 0x74, 0x7b, 0x80, 0x80, 0x7e, 0x7c,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x51, 0x76, 0x6e, 0x4b, 0x44, 0x42, 0x3a, 0x3f,
+    0x3b, 0x3f, 0x45, 0x4a, 0x4a, 0x47, 0x41, 0x3d,
+    0x38, 0x3a, 0x3f, 0x47, 0x4e, 0x51, 0x4e, 0x4a,
+    0x4c, 0x49, 0x47, 0x51, 0x67, 0x7c, 0x82, 0x7e,
+    0x76, 0x6b, 0x59, 0x47, 0x3d, 0x3a, 0x3d, 0x40,
+    0x4a, 0x4a, 0x49, 0x49, 0x48, 0x48, 0x47, 0x47,
+    0x56, 0x4f, 0x46, 0x3e, 0x3c, 0x3f, 0x46, 0x4a,
+    0x6d, 0x64, 0x56, 0x49, 0x41, 0x40, 0x43, 0x45,
+    0x45, 0x50, 0x60, 0x70, 0x7c, 0x81, 0x82, 0x80,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x54, 0x76, 0x63, 0x34, 0x31, 0x3d, 0x3d, 0x40,
+    0x3d, 0x3a, 0x36, 0x33, 0x33, 0x36, 0x3a, 0x3d,
+    0x38, 0x3a, 0x3c, 0x3c, 0x3c, 0x3a, 0x37, 0x36,
+    0x2c, 0x31, 0x33, 0x3a, 0x55, 0x77, 0x82, 0x79,
+    0x5e, 0x55, 0x47, 0x39, 0x30, 0x2e, 0x30, 0x32,
+    0x38, 0x38, 0x37, 0x37, 0x36, 0x36, 0x35, 0x35,
+    0x45, 0x41, 0x3b, 0x35, 0x32, 0x32, 0x34, 0x36,
+    0x57, 0x50, 0x44, 0x39, 0x34, 0x35, 0x3a, 0x3e,
+    0x2e, 0x3c, 0x53, 0x6a, 0x7a, 0x82, 0x83, 0x82,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x4f, 0x76, 0x64, 0x34, 0x34, 0x44, 0x3e, 0x38,
+    0x3c, 0x3a, 0x38, 0x37, 0x37, 0x38, 0x39, 0x3a,
+    0x38, 0x3b, 0x3d, 0x3c, 0x39, 0x37, 0x38, 0x39,
+    0x31, 0x39, 0x38, 0x38, 0x51, 0x75, 0x7f, 0x71,
+    0x45, 0x41, 0x3b, 0x37, 0x35, 0x38, 0x3c, 0x3f,
+    0x3a, 0x3a, 0x39, 0x39, 0x38, 0x37, 0x37, 0x37,
+    0x47, 0x46, 0x45, 0x42, 0x3e, 0x3a, 0x37, 0x35,
+    0x3f, 0x41, 0x44, 0x46, 0x45, 0x41, 0x3c, 0x39,
+    0x2b, 0x3a, 0x52, 0x6b, 0x7c, 0x82, 0x81, 0x7f,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x47, 0x75, 0x6a, 0x3e, 0x3f, 0x4d, 0x3f, 0x2f,
+    0x39, 0x3d, 0x42, 0x45, 0x45, 0x41, 0x3b, 0x37,
+    0x38, 0x3d, 0x41, 0x40, 0x3d, 0x3d, 0x41, 0x45,
+    0x44, 0x4c, 0x47, 0x3f, 0x53, 0x76, 0x7c, 0x6a,
+    0x35, 0x36, 0x37, 0x3b, 0x40, 0x47, 0x4d, 0x51,
+    0x43, 0x43, 0x42, 0x42, 0x41, 0x41, 0x40, 0x40,
+    0x50, 0x52, 0x53, 0x52, 0x4e, 0x48, 0x41, 0x3c,
+    0x2e, 0x3a, 0x4c, 0x5a, 0x5b, 0x51, 0x41, 0x35,
+    0x30, 0x3f, 0x58, 0x6f, 0x7e, 0x82, 0x7f, 0x7b,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x4c, 0x7f, 0x7c, 0x7e, 0x6f, 0x40, 0x3a, 0x3d,
+    0x38, 0x48, 0x61, 0x77, 0x7d, 0x6d, 0x4f, 0x37,
+    0x37, 0x38, 0x45, 0x5e, 0x77, 0x7c, 0x6b, 0x58,
+    0x38, 0x3a, 0x37, 0x3d, 0x5a, 0x75, 0x6c, 0x4f,
+    0x3a, 0x38, 0x39, 0x44, 0x56, 0x67, 0x72, 0x77,
+    0x6e, 0x63, 0x51, 0x41, 0x39, 0x39, 0x3e, 0x42,
+    0x6e, 0x7a, 0x80, 0x72, 0x55, 0x3d, 0x36, 0x39,
+    0x33, 0x54, 0x71, 0x79, 0x79, 0x71, 0x54, 0x33,
+    0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x4c, 0x7f, 0x7c, 0x7e, 0x6f, 0x40, 0x3a, 0x3d,
+    0x3a, 0x49, 0x62, 0x79, 0x7f, 0x6e, 0x50, 0x39,
+    0x38, 0x39, 0x46, 0x60, 0x79, 0x7e, 0x6d, 0x5a,
+    0x38, 0x3a, 0x39, 0x3f, 0x5b, 0x74, 0x68, 0x49,
+    0x3a, 0x38, 0x3a, 0x47, 0x5b, 0x6e, 0x7a, 0x7f,
+    0x79, 0x6b, 0x56, 0x43, 0x38, 0x38, 0x3e, 0x44,
+    0x70, 0x7c, 0x82, 0x73, 0x56, 0x3e, 0x37, 0x3a,
+    0x34, 0x55, 0x73, 0x7b, 0x7b, 0x73, 0x55, 0x34,
+    0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x4d, 0x80, 0x7d, 0x7e, 0x70, 0x41, 0x3b, 0x3e,
+    0x3c, 0x4b, 0x64, 0x7b, 0x81, 0x70, 0x52, 0x3b,
+    0x39, 0x3b, 0x47, 0x62, 0x7b, 0x80, 0x70, 0x5c,
+    0x37, 0x3b, 0x3b, 0x43, 0x5d, 0x73, 0x63, 0x42,
+    0x3a, 0x38, 0x3c, 0x4b, 0x61, 0x76, 0x84, 0x89,
+    0x86, 0x76, 0x5d, 0x45, 0x38, 0x38, 0x3f, 0x45,
+    0x72, 0x7e, 0x84, 0x75, 0x58, 0x40, 0x39, 0x3c,
+    0x37, 0x58, 0x75, 0x7d, 0x7d, 0x75, 0x58, 0x37,
+    0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x4d, 0x80, 0x7d, 0x7f, 0x70, 0x41, 0x3b, 0x3e,
+    0x3d, 0x4d, 0x66, 0x7d, 0x83, 0x72, 0x54, 0x3d,
+    0x3a, 0x3c, 0x49, 0x63, 0x7d, 0x83, 0x72, 0x5f,
+    0x37, 0x3c, 0x3d, 0x44, 0x5e, 0x72, 0x61, 0x3f,
+    0x3a, 0x39, 0x3d, 0x4c, 0x63, 0x78, 0x86, 0x8c,
+    0x8c, 0x7a, 0x5f, 0x46, 0x38, 0x38, 0x3f, 0x46,
+    0x74, 0x7f, 0x85, 0x77, 0x5a, 0x42, 0x3b, 0x3e,
+    0x39, 0x5a, 0x77, 0x7f, 0x7f, 0x77, 0x5a, 0x39,
+    0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x4e, 0x81, 0x7e, 0x7f, 0x71, 0x42, 0x3c, 0x3f,
+    0x3e, 0x4e, 0x67, 0x7d, 0x83, 0x73, 0x55, 0x3d,
+    0x3a, 0x3c, 0x49, 0x63, 0x7e, 0x84, 0x74, 0x61,
+    0x39, 0x3c, 0x3c, 0x43, 0x5e, 0x74, 0x65, 0x45,
+    0x3c, 0x3a, 0x3d, 0x49, 0x5d, 0x70, 0x7d, 0x82,
+    0x81, 0x72, 0x5a, 0x44, 0x39, 0x39, 0x40, 0x46,
+    0x74, 0x80, 0x86, 0x78, 0x5a, 0x43, 0x3c, 0x3e,
+    0x3a, 0x5b, 0x79, 0x81, 0x81, 0x79, 0x5b, 0x3a,
+    0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x4e, 0x81, 0x7e, 0x80, 0x71, 0x43, 0x3d, 0x3f,
+    0x3d, 0x4d, 0x66, 0x7d, 0x83, 0x72, 0x54, 0x3d,
+    0x38, 0x3a, 0x48, 0x63, 0x7d, 0x84, 0x74, 0x61,
+    0x3b, 0x3c, 0x39, 0x3e, 0x5b, 0x77, 0x6e, 0x52,
+    0x3f, 0x3b, 0x3a, 0x42, 0x51, 0x5f, 0x68, 0x6b,
+    0x68, 0x5e, 0x4e, 0x40, 0x39, 0x3a, 0x40, 0x45,
+    0x74, 0x80, 0x86, 0x77, 0x5a, 0x42, 0x3b, 0x3e,
+    0x3a, 0x5b, 0x79, 0x81, 0x81, 0x79, 0x5b, 0x3a,
+    0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x4f, 0x82, 0x7f, 0x80, 0x72, 0x43, 0x3d, 0x40,
+    0x3c, 0x4c, 0x65, 0x7b, 0x81, 0x71, 0x53, 0x3b,
+    0x36, 0x38, 0x46, 0x61, 0x7c, 0x83, 0x73, 0x61,
+    0x3e, 0x3c, 0x35, 0x39, 0x59, 0x7a, 0x79, 0x62,
+    0x42, 0x3c, 0x37, 0x3a, 0x43, 0x4c, 0x51, 0x52,
+    0x4b, 0x47, 0x41, 0x3b, 0x3a, 0x3c, 0x40, 0x43,
+    0x72, 0x7e, 0x84, 0x76, 0x59, 0x41, 0x3a, 0x3d,
+    0x39, 0x5b, 0x78, 0x80, 0x80, 0x78, 0x5b, 0x39,
+    0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+    0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+    0x4f, 0x82, 0x7f, 0x81, 0x72, 0x43, 0x3d, 0x40,
+    0x3b, 0x4b, 0x64, 0x7b, 0x81, 0x70, 0x52, 0x3a,
+    0x35, 0x37, 0x45, 0x60, 0x7b, 0x82, 0x73, 0x60,
+    0x40, 0x3c, 0x32, 0x35, 0x57, 0x7c, 0x80, 0x6c,
+    0x44, 0x3d, 0x35, 0x34, 0x3a, 0x40, 0x42, 0x41,
+    0x38, 0x38, 0x38, 0x38, 0x3a, 0x3d, 0x40, 0x42,
+    0x72, 0x7d, 0x83, 0x75, 0x58, 0x40, 0x39, 0x3c,
+    0x39, 0x5a, 0x77, 0x7f, 0x7f, 0x77, 0x5a, 0x39,
+    0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+    0x7f, 0x82, 0x7c, 0x63, 0x44, 0x31, 0x33, 0x3c,
+    0x58, 0x6f, 0x85, 0x83, 0x69, 0x49, 0x35, 0x2e,
+    0x31, 0x46, 0x65, 0x7b, 0x7d, 0x6a, 0x4d, 0x39,
+    0x30, 0x35, 0x45, 0x63, 0x7d, 0x82, 0x71, 0x5d,
+    0x39, 0x30, 0x2e, 0x40, 0x5e, 0x76, 0x7c, 0x79,
+    0x63, 0x57, 0x44, 0x34, 0x2d, 0x2f, 0x37, 0x3d,
+    0x35, 0x39, 0x3c, 0x3b, 0x36, 0x34, 0x37, 0x3c,
+    0x74, 0x7e, 0x81, 0x70, 0x54, 0x3d, 0x38, 0x3c,
+    0x3a, 0x4f, 0x6d, 0x83, 0x83, 0x6d, 0x4f, 0x3a,
+    0x29, 0x37, 0x4e, 0x65, 0x76, 0x7d, 0x7d, 0x7b,
+    0x7f, 0x82, 0x7d, 0x69, 0x4e, 0x3f, 0x41, 0x4a,
+    0x5f, 0x73, 0x85, 0x83, 0x6d, 0x53, 0x44, 0x3f,
+    0x3f, 0x51, 0x6a, 0x7c, 0x7e, 0x6e, 0x57, 0x47,
+    0x41, 0x43, 0x4f, 0x67, 0x7e, 0x83, 0x75, 0x64,
+    0x48, 0x40, 0x3d, 0x4b, 0x65, 0x79, 0x7e, 0x7a,
+    0x74, 0x69, 0x59, 0x4a, 0x41, 0x3f, 0x42, 0x45,
+    0x4c, 0x51, 0x54, 0x52, 0x4d, 0x4c, 0x4f, 0x53,
+    0x79, 0x82, 0x86, 0x79, 0x60, 0x4e, 0x4b, 0x4f,
+    0x4e, 0x5e, 0x75, 0x85, 0x85, 0x75, 0x5e, 0x4e,
+    0x42, 0x4c, 0x5d, 0x6e, 0x7a, 0x80, 0x81, 0x81,
+    0x7e, 0x82, 0x80, 0x71, 0x5f, 0x55, 0x59, 0x60,
+    0x6b, 0x79, 0x86, 0x83, 0x73, 0x62, 0x5b, 0x5a,
+    0x55, 0x61, 0x72, 0x7e, 0x80, 0x75, 0x66, 0x5c,
+    0x5b, 0x5a, 0x5f, 0x6e, 0x7f, 0x84, 0x7b, 0x6f,
+    0x61, 0x59, 0x55, 0x5e, 0x70, 0x7e, 0x7f, 0x7c,
+    0x84, 0x7d, 0x73, 0x67, 0x5e, 0x58, 0x55, 0x55,
+    0x6a, 0x6e, 0x72, 0x70, 0x6b, 0x69, 0x6d, 0x71,
+    0x7d, 0x85, 0x8a, 0x82, 0x71, 0x65, 0x64, 0x68,
+    0x6a, 0x72, 0x7d, 0x84, 0x84, 0x7d, 0x72, 0x6a,
+    0x65, 0x69, 0x70, 0x77, 0x7e, 0x82, 0x85, 0x86,
+    0x7e, 0x81, 0x81, 0x7a, 0x70, 0x6b, 0x6f, 0x75,
+    0x77, 0x7f, 0x85, 0x83, 0x79, 0x72, 0x71, 0x74,
+    0x6c, 0x72, 0x7a, 0x80, 0x81, 0x7c, 0x75, 0x70,
+    0x74, 0x70, 0x6f, 0x75, 0x7f, 0x84, 0x80, 0x7a,
+    0x77, 0x71, 0x6d, 0x70, 0x7a, 0x81, 0x81, 0x7d,
+    0x85, 0x83, 0x80, 0x7c, 0x76, 0x71, 0x6c, 0x6a,
+    0x7c, 0x80, 0x83, 0x81, 0x7d, 0x7b, 0x7e, 0x83,
+    0x7c, 0x83, 0x88, 0x85, 0x7c, 0x76, 0x77, 0x7a,
+    0x7e, 0x7f, 0x7f, 0x80, 0x80, 0x7f, 0x7f, 0x7e,
+    0x7f, 0x7e, 0x7d, 0x7c, 0x7d, 0x80, 0x83, 0x85,
+    0x7f, 0x81, 0x81, 0x7f, 0x7c, 0x7b, 0x7e, 0x81,
+    0x7f, 0x82, 0x83, 0x82, 0x7f, 0x7e, 0x80, 0x83,
+    0x7b, 0x7d, 0x7f, 0x81, 0x81, 0x80, 0x7f, 0x7d,
+    0x82, 0x7e, 0x7b, 0x7c, 0x7f, 0x82, 0x82, 0x81,
+    0x84, 0x80, 0x7d, 0x7d, 0x80, 0x82, 0x80, 0x7e,
+    0x7b, 0x7d, 0x7f, 0x82, 0x82, 0x80, 0x7e, 0x7c,
+    0x7e, 0x82, 0x86, 0x84, 0x7f, 0x7d, 0x81, 0x85,
+    0x7a, 0x7d, 0x81, 0x81, 0x7f, 0x7e, 0x7f, 0x81,
+    0x85, 0x82, 0x7e, 0x7b, 0x7b, 0x7e, 0x82, 0x85,
+    0x89, 0x86, 0x80, 0x7b, 0x7a, 0x7b, 0x7e, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x81, 0x82, 0x83, 0x83,
+    0x82, 0x81, 0x80, 0x80, 0x82, 0x83, 0x85, 0x85,
+    0x82, 0x81, 0x81, 0x80, 0x80, 0x81, 0x81, 0x82,
+    0x83, 0x82, 0x81, 0x80, 0x7f, 0x80, 0x81, 0x83,
+    0x85, 0x84, 0x84, 0x82, 0x80, 0x7f, 0x7f, 0x7f,
+    0x76, 0x78, 0x7c, 0x7f, 0x82, 0x84, 0x84, 0x84,
+    0x7b, 0x7f, 0x83, 0x81, 0x7c, 0x7a, 0x7e, 0x82,
+    0x7a, 0x7b, 0x7c, 0x7d, 0x7f, 0x80, 0x80, 0x80,
+    0x82, 0x80, 0x7d, 0x7b, 0x7b, 0x7d, 0x80, 0x82,
+    0x86, 0x83, 0x7f, 0x7b, 0x79, 0x7a, 0x7c, 0x7d,
+    0x81, 0x7f, 0x7d, 0x7f, 0x82, 0x83, 0x80, 0x7d,
+    0x81, 0x7f, 0x7d, 0x7f, 0x83, 0x84, 0x82, 0x80,
+    0x81, 0x81, 0x80, 0x7f, 0x7f, 0x7f, 0x7f, 0x80,
+    0x7d, 0x80, 0x83, 0x82, 0x7f, 0x7d, 0x7f, 0x81,
+    0x7e, 0x81, 0x83, 0x81, 0x7e, 0x7c, 0x7d, 0x7f,
+    0x7e, 0x7e, 0x7d, 0x7d, 0x7e, 0x7f, 0x81, 0x82,
+    0x7b, 0x80, 0x83, 0x81, 0x7c, 0x7b, 0x7e, 0x82,
+    0x80, 0x7d, 0x7b, 0x7c, 0x7f, 0x81, 0x80, 0x7e,
+    0x7d, 0x7f, 0x80, 0x82, 0x82, 0x80, 0x7f, 0x7d,
+    0x7f, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f,
+    0x82, 0x7e, 0x7c, 0x7d, 0x81, 0x81, 0x7d, 0x78,
+    0x7f, 0x7d, 0x7b, 0x7e, 0x83, 0x84, 0x7f, 0x7a,
+    0x7f, 0x7f, 0x7f, 0x7e, 0x7e, 0x7d, 0x7d, 0x7d,
+    0x76, 0x7c, 0x82, 0x83, 0x7f, 0x7c, 0x7c, 0x7f,
+    0x78, 0x7d, 0x81, 0x80, 0x7b, 0x79, 0x7b, 0x7f,
+    0x89, 0x86, 0x81, 0x7d, 0x7a, 0x7a, 0x7c, 0x7e,
+    0x7e, 0x83, 0x86, 0x84, 0x7f, 0x7e, 0x81, 0x85,
+    0x86, 0x81, 0x7d, 0x7d, 0x81, 0x83, 0x80, 0x7d,
+    0x7a, 0x7e, 0x84, 0x88, 0x88, 0x84, 0x7e, 0x7a,
+    0x7a, 0x7b, 0x7e, 0x81, 0x83, 0x83, 0x82, 0x82,
+};
+
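+/* V (Cr) plane of the second planar 4:2:0 logo table set. */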
+const UWORD8 gau1_ihevcd_logo_420_v[] =
+{
+    0x7b, 0x7f, 0x81, 0x80, 0x7c, 0x7c, 0x80, 0x85,
+    0x87, 0x87, 0x86, 0x80, 0x79, 0x78, 0x7d, 0x82,
+    0x84, 0x81, 0x7d, 0x7a, 0x7a, 0x7d, 0x81, 0x84,
+    0x7c, 0x7a, 0x7a, 0x7f, 0x85, 0x88, 0x85, 0x80,
+    0x83, 0x80, 0x7d, 0x7e, 0x81, 0x81, 0x7c, 0x77,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7d, 0x7f, 0x81, 0x7f, 0x7c, 0x7c, 0x7e, 0x81,
+    0x7d, 0x7f, 0x80, 0x7f, 0x7d, 0x7d, 0x80, 0x84,
+    0x83, 0x82, 0x81, 0x80, 0x80, 0x81, 0x82, 0x83,
+    0x82, 0x80, 0x7e, 0x80, 0x82, 0x82, 0x7f, 0x7b,
+    0x7f, 0x7d, 0x7c, 0x7e, 0x81, 0x82, 0x7f, 0x7b,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x7f, 0x7e, 0x7d, 0x7d,
+    0x76, 0x79, 0x7c, 0x7f, 0x80, 0x81, 0x81, 0x81,
+    0x80, 0x82, 0x83, 0x85, 0x85, 0x83, 0x82, 0x80,
+    0x85, 0x84, 0x82, 0x80, 0x7e, 0x7d, 0x7b, 0x7b,
+    0x83, 0x83, 0x84, 0x85, 0x85, 0x83, 0x81, 0x7f,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x82, 0x80, 0x80, 0x82, 0x85, 0x86, 0x83, 0x80,
+    0x7d, 0x7c, 0x7c, 0x7f, 0x82, 0x82, 0x7d, 0x78,
+    0x7b, 0x7e, 0x81, 0x84, 0x84, 0x81, 0x7e, 0x7b,
+    0x81, 0x82, 0x81, 0x7f, 0x7c, 0x7c, 0x81, 0x85,
+    0x97, 0x99, 0x9a, 0x96, 0x8d, 0x84, 0x80, 0x7e,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x83, 0x80, 0x80, 0x87, 0x91, 0x96, 0x92, 0x8c,
+    0x8d, 0x86, 0x80, 0x80, 0x83, 0x81, 0x79, 0x71,
+    0x79, 0x7b, 0x7e, 0x80, 0x80, 0x7e, 0x7b, 0x79,
+    0x79, 0x7d, 0x80, 0x7e, 0x7b, 0x7f, 0x8a, 0x95,
+    0xb2, 0xb6, 0xb5, 0xa9, 0x96, 0x85, 0x7d, 0x7c,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x81, 0x7e, 0x80, 0x8e, 0xa1, 0xab, 0xa7, 0x9f,
+    0x9c, 0x8e, 0x80, 0x7d, 0x83, 0x85, 0x7f, 0x78,
+    0x7f, 0x7e, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, 0x7f,
+    0x7a, 0x80, 0x84, 0x80, 0x7b, 0x7f, 0x8f, 0x9e,
+    0xc0, 0xc6, 0xc6, 0xb7, 0x9c, 0x86, 0x7d, 0x7c,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7f, 0x7c, 0x81, 0x95, 0xb0, 0xbf, 0xbd, 0xb4,
+    0xa0, 0x8e, 0x7b, 0x79, 0x84, 0x8f, 0x8f, 0x8a,
+    0x8c, 0x88, 0x82, 0x7e, 0x7e, 0x82, 0x88, 0x8c,
+    0x85, 0x8c, 0x8e, 0x85, 0x79, 0x7a, 0x8b, 0x9c,
+    0xbc, 0xc5, 0xc9, 0xb9, 0x9e, 0x87, 0x80, 0x81,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7e, 0x7b, 0x81, 0x9a, 0xb9, 0xcc, 0xca, 0xc1,
+    0x9f, 0x8a, 0x76, 0x75, 0x86, 0x97, 0x9d, 0x9b,
+    0x97, 0x91, 0x88, 0x81, 0x81, 0x88, 0x91, 0x97,
+    0x91, 0x97, 0x96, 0x88, 0x77, 0x75, 0x84, 0x95,
+    0xb2, 0xbd, 0xc4, 0xb7, 0x9d, 0x88, 0x83, 0x87,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0x9b, 0x84, 0x82, 0x88, 0x8d, 0xaf, 0xc8, 0xba,
+    0xbd, 0xac, 0x94, 0x82, 0x82, 0x92, 0xa9, 0xba,
+    0xc0, 0xb8, 0xa8, 0x93, 0x81, 0x7a, 0x7e, 0x84,
+    0xa7, 0xab, 0xaa, 0x9d, 0x89, 0x7d, 0x7e, 0x83,
+    0x81, 0x7f, 0x7d, 0x7c, 0x7c, 0x7e, 0x81, 0x83,
+    0x7e, 0x7e, 0x7f, 0x7f, 0x80, 0x81, 0x81, 0x81,
+    0x83, 0x82, 0x80, 0x80, 0x80, 0x81, 0x83, 0x85,
+    0x7f, 0x82, 0x87, 0x89, 0x86, 0x7f, 0x77, 0x72,
+    0x84, 0x84, 0x84, 0x84, 0x83, 0x81, 0x7f, 0x7e,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xa3, 0x84, 0x7b, 0x7d, 0x83, 0xa7, 0xc7, 0xc2,
+    0xc0, 0xaa, 0x8b, 0x74, 0x74, 0x89, 0xa8, 0xbd,
+    0xcc, 0xc4, 0xb4, 0x9e, 0x8a, 0x80, 0x82, 0x87,
+    0x88, 0x90, 0x93, 0x8c, 0x7f, 0x79, 0x7d, 0x85,
+    0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7d, 0x7c, 0x7c,
+    0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f,
+    0x7a, 0x7b, 0x7d, 0x7e, 0x7f, 0x7f, 0x7e, 0x7d,
+    0x7e, 0x7d, 0x7c, 0x7c, 0x7c, 0x7d, 0x7f, 0x80,
+    0x7d, 0x7d, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xab, 0x84, 0x78, 0x7c, 0x81, 0xa3, 0xc6, 0xc8,
+    0xc3, 0xab, 0x89, 0x71, 0x70, 0x87, 0xa8, 0xc0,
+    0xc0, 0xb9, 0xaa, 0x95, 0x81, 0x76, 0x76, 0x7a,
+    0x72, 0x7b, 0x83, 0x81, 0x78, 0x76, 0x7d, 0x86,
+    0x78, 0x7d, 0x83, 0x88, 0x89, 0x87, 0x82, 0x7f,
+    0x82, 0x82, 0x83, 0x83, 0x84, 0x84, 0x85, 0x85,
+    0x78, 0x7c, 0x82, 0x87, 0x88, 0x86, 0x81, 0x7e,
+    0x7e, 0x7b, 0x78, 0x78, 0x7c, 0x85, 0x8e, 0x94,
+    0x7e, 0x7d, 0x7b, 0x7a, 0x7c, 0x7e, 0x81, 0x83,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xaa, 0x84, 0x80, 0x91, 0x96, 0xaa, 0xc4, 0xc5,
+    0xc2, 0xb2, 0x9a, 0x89, 0x88, 0x98, 0xaf, 0xbf,
+    0xba, 0xb6, 0xab, 0x9b, 0x8c, 0x84, 0x85, 0x8a,
+    0x83, 0x8c, 0x92, 0x8c, 0x80, 0x79, 0x7c, 0x83,
+    0x7b, 0x84, 0x91, 0x9d, 0xa2, 0xa0, 0x9b, 0x96,
+    0x94, 0x95, 0x95, 0x96, 0x96, 0x97, 0x97, 0x97,
+    0x87, 0x8e, 0x97, 0x9f, 0xa1, 0x9d, 0x96, 0x91,
+    0x81, 0x83, 0x87, 0x8d, 0x95, 0x9d, 0xa3, 0xa7,
+    0x92, 0x8d, 0x86, 0x7f, 0x7c, 0x7c, 0x7f, 0x81,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xa4, 0x83, 0x8f, 0xb2, 0xb7, 0xb8, 0xc1, 0xbd,
+    0xbf, 0xbb, 0xb5, 0xb0, 0xaf, 0xb3, 0xb8, 0xbc,
+    0xc7, 0xc7, 0xc4, 0xbc, 0xb3, 0xb0, 0xb5, 0xba,
+    0xae, 0xb4, 0xb4, 0xa7, 0x91, 0x81, 0x7d, 0x7f,
+    0x89, 0x93, 0xa4, 0xb4, 0xbd, 0xbe, 0xba, 0xb6,
+    0xad, 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0,
+    0xa0, 0xa7, 0xb1, 0xba, 0xbd, 0xba, 0xb3, 0xaf,
+    0x8b, 0x94, 0xa1, 0xae, 0xb6, 0xb8, 0xb6, 0xb4,
+    0xb1, 0xa8, 0x99, 0x89, 0x7f, 0x7b, 0x7c, 0x7e,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xa3, 0x82, 0x98, 0xc8, 0xcc, 0xbf, 0xbe, 0xba,
+    0xbf, 0xc1, 0xc5, 0xc8, 0xc7, 0xc4, 0xbf, 0xbb,
+    0xc0, 0xc4, 0xc5, 0xc2, 0xbe, 0xbe, 0xc4, 0xca,
+    0xca, 0xce, 0xcb, 0xba, 0x9f, 0x89, 0x80, 0x80,
+    0x9e, 0xa7, 0xb5, 0xc2, 0xc9, 0xca, 0xc7, 0xc4,
+    0xbd, 0xbd, 0xbe, 0xbe, 0xbf, 0xbf, 0xc0, 0xc0,
+    0xb0, 0xb5, 0xbd, 0xc4, 0xc8, 0xc7, 0xc5, 0xc2,
+    0xa1, 0xa8, 0xb3, 0xbd, 0xc3, 0xc3, 0xc0, 0xbd,
+    0xc8, 0xbb, 0xa6, 0x91, 0x82, 0x7c, 0x7c, 0x7e,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xab, 0x82, 0x94, 0xc6, 0xca, 0xbb, 0xbd, 0xc0,
+    0xc1, 0xc2, 0xc4, 0xc4, 0xc4, 0xc2, 0xc0, 0xbe,
+    0xb5, 0xb9, 0xbc, 0xba, 0xb5, 0xb4, 0xb8, 0xbd,
+    0xc2, 0xc8, 0xc8, 0xb9, 0xa1, 0x8d, 0x85, 0x86,
+    0xb4, 0xb8, 0xbe, 0xc2, 0xc4, 0xc1, 0xbd, 0xba,
+    0xbf, 0xbf, 0xc0, 0xc0, 0xc1, 0xc2, 0xc2, 0xc2,
+    0xae, 0xb0, 0xb4, 0xb9, 0xbd, 0xc0, 0xc2, 0xc3,
+    0xba, 0xb7, 0xb2, 0xb0, 0xb2, 0xb8, 0xc0, 0xc5,
+    0xcc, 0xbe, 0xa7, 0x90, 0x82, 0x7d, 0x80, 0x83,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xb3, 0x82, 0x8d, 0xbc, 0xc0, 0xb3, 0xbd, 0xc8,
+    0xc5, 0xc0, 0xbb, 0xb6, 0xb5, 0xb9, 0xbe, 0xc1,
+    0xc1, 0xc5, 0xc7, 0xc4, 0xbd, 0xba, 0xbc, 0xc0,
+    0xae, 0xb6, 0xba, 0xb0, 0x9d, 0x8d, 0x89, 0x8b,
+    0xc2, 0xc2, 0xc1, 0xbe, 0xb9, 0xb3, 0xad, 0xaa,
+    0xbb, 0xbb, 0xbc, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe,
+    0xa5, 0xa5, 0xa7, 0xaa, 0xae, 0xb4, 0xb9, 0xbc,
+    0xcc, 0xbe, 0xaa, 0x9c, 0x9c, 0xa9, 0xbc, 0xc9,
+    0xc7, 0xb9, 0xa2, 0x8d, 0x80, 0x7e, 0x83, 0x88,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+    0xc4, 0xaf, 0x83, 0x87, 0x86, 0x81, 0xad, 0xc1,
+    0xc0, 0xc0, 0xba, 0x8f, 0x84, 0x85, 0x7a, 0xa7,
+    0xc0, 0xbf, 0xc2, 0xbb, 0x9e, 0x83, 0x8d, 0xa9,
+    0xc4, 0xca, 0xca, 0xbb, 0xa1, 0x8b, 0x82, 0x81,
+    0x8a, 0x96, 0xa8, 0xba, 0xc4, 0xc6, 0xc2, 0xbe,
+    0x83, 0x88, 0x86, 0x86, 0x9e, 0xbf, 0xc8, 0xbd,
+    0xc4, 0xa4, 0x89, 0x82, 0x82, 0x89, 0xa4, 0xc4,
+    0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+    0xc4, 0xaf, 0x83, 0x87, 0x86, 0x81, 0xac, 0xc1,
+    0xc0, 0xc0, 0xba, 0x8e, 0x84, 0x84, 0x7a, 0xa6,
+    0xc0, 0xbe, 0xc0, 0xb9, 0x9d, 0x84, 0x91, 0xaf,
+    0xbc, 0xc2, 0xc3, 0xb6, 0x9e, 0x89, 0x81, 0x80,
+    0x80, 0x8e, 0xa4, 0xb8, 0xc4, 0xc5, 0xc0, 0xbb,
+    0x82, 0x87, 0x84, 0x84, 0x9c, 0xbe, 0xc7, 0xbc,
+    0xc3, 0xa3, 0x87, 0x80, 0x80, 0x87, 0xa3, 0xc3,
+    0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+    0xc3, 0xaf, 0x82, 0x86, 0x86, 0x81, 0xac, 0xc0,
+    0xbf, 0xc0, 0xb9, 0x8e, 0x83, 0x84, 0x7a, 0xa6,
+    0xc1, 0xbd, 0xbd, 0xb6, 0x9b, 0x86, 0x96, 0xb6,
+    0xb6, 0xbc, 0xbe, 0xb1, 0x99, 0x85, 0x7d, 0x7c,
+    0x75, 0x85, 0x9e, 0xb6, 0xc3, 0xc4, 0xbd, 0xb7,
+    0x81, 0x85, 0x82, 0x81, 0x99, 0xbb, 0xc5, 0xba,
+    0xc1, 0xa1, 0x85, 0x7e, 0x7e, 0x85, 0xa1, 0xc1,
+    0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+    0xc3, 0xae, 0x82, 0x86, 0x85, 0x80, 0xab, 0xc0,
+    0xbf, 0xbf, 0xb9, 0x8d, 0x83, 0x83, 0x79, 0xa5,
+    0xc1, 0xbd, 0xbc, 0xb4, 0x9a, 0x86, 0x97, 0xb9,
+    0xb8, 0xbe, 0xbf, 0xb0, 0x96, 0x81, 0x77, 0x76,
+    0x71, 0x82, 0x9d, 0xb5, 0xc2, 0xc2, 0xba, 0xb3,
+    0x80, 0x84, 0x80, 0x7f, 0x96, 0xb9, 0xc4, 0xba,
+    0xbf, 0x9f, 0x84, 0x7d, 0x7d, 0x84, 0x9f, 0xbf,
+    0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+    0xc2, 0xad, 0x81, 0x85, 0x84, 0x7f, 0xab, 0xbf,
+    0xbe, 0xbf, 0xb8, 0x8d, 0x82, 0x83, 0x79, 0xa5,
+    0xbf, 0xbc, 0xbd, 0xb5, 0x9b, 0x85, 0x93, 0xb3,
+    0xbf, 0xc5, 0xc4, 0xb5, 0x9a, 0x83, 0x79, 0x78,
+    0x7b, 0x8a, 0xa1, 0xb6, 0xc1, 0xc1, 0xb9, 0xb3,
+    0x81, 0x84, 0x7f, 0x7d, 0x95, 0xb8, 0xc4, 0xba,
+    0xbf, 0x9f, 0x83, 0x7c, 0x7c, 0x83, 0x9f, 0xbf,
+    0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+    0xc2, 0xad, 0x81, 0x84, 0x84, 0x7f, 0xaa, 0xbe,
+    0xbd, 0xbe, 0xb7, 0x8c, 0x82, 0x82, 0x78, 0xa4,
+    0xbd, 0xbc, 0xc0, 0xba, 0x9d, 0x82, 0x8a, 0xa6,
+    0xc0, 0xc7, 0xc9, 0xbd, 0xa6, 0x92, 0x8b, 0x8b,
+    0x91, 0x9c, 0xab, 0xb9, 0xc1, 0xc0, 0xba, 0xb5,
+    0x82, 0x85, 0x7f, 0x7c, 0x94, 0xb8, 0xc5, 0xbc,
+    0xbf, 0x9f, 0x84, 0x7d, 0x7d, 0x84, 0x9f, 0xbf,
+    0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+    0xc1, 0xac, 0x80, 0x84, 0x83, 0x7e, 0xaa, 0xbe,
+    0xbd, 0xbe, 0xb7, 0x8c, 0x81, 0x82, 0x78, 0xa4,
+    0xba, 0xbc, 0xc4, 0xbf, 0xa0, 0x7e, 0x80, 0x97,
+    0xb6, 0xc1, 0xc9, 0xc4, 0xb5, 0xa9, 0xa7, 0xab,
+    0xab, 0xb0, 0xb7, 0xbd, 0xc0, 0xbf, 0xbc, 0xb9,
+    0x84, 0x86, 0x80, 0x7d, 0x94, 0xb9, 0xc7, 0xbe,
+    0xc0, 0xa0, 0x85, 0x7e, 0x7e, 0x85, 0xa0, 0xc0,
+    0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+    0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+    0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+    0xc1, 0xac, 0x80, 0x84, 0x83, 0x7e, 0xaa, 0xbe,
+    0xbd, 0xbd, 0xb7, 0x8b, 0x81, 0x81, 0x77, 0xa4,
+    0xb8, 0xbc, 0xc6, 0xc3, 0xa2, 0x7c, 0x79, 0x8c,
+    0xac, 0xb9, 0xc6, 0xc7, 0xc0, 0xbb, 0xbe, 0xc4,
+    0xbc, 0xbd, 0xbf, 0xc0, 0xc0, 0xbf, 0xbd, 0xbc,
+    0x86, 0x87, 0x80, 0x7d, 0x95, 0xba, 0xc8, 0xbf,
+    0xc1, 0xa1, 0x86, 0x7f, 0x7f, 0x86, 0xa1, 0xc1,
+    0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+    0x7f, 0x7b, 0x82, 0x99, 0xb8, 0xca, 0xc8, 0xbf,
+    0xa9, 0x91, 0x7a, 0x7a, 0x93, 0xb1, 0xc4, 0xca,
+    0xc3, 0xae, 0x90, 0x7c, 0x7c, 0x90, 0xae, 0xc3,
+    0xc8, 0xc3, 0xb3, 0x96, 0x7c, 0x78, 0x8a, 0x9e,
+    0xbe, 0xc7, 0xca, 0xb9, 0x9b, 0x85, 0x7f, 0x83,
+    0xa1, 0xac, 0xbc, 0xc9, 0xcd, 0xc7, 0xbd, 0xb5,
+    0xc3, 0xbf, 0xbb, 0xbd, 0xc2, 0xc4, 0xc0, 0xbc,
+    0x84, 0x84, 0x88, 0x8f, 0x9d, 0xae, 0xbe, 0xc8,
+    0xb7, 0xa6, 0x8f, 0x7e, 0x7e, 0x8f, 0xa6, 0xb7,
+    0xc4, 0xbb, 0xab, 0x9a, 0x8c, 0x84, 0x81, 0x80,
+    0x7f, 0x7c, 0x80, 0x94, 0xae, 0xbd, 0xba, 0xb1,
+    0xa1, 0x8d, 0x7a, 0x7a, 0x8f, 0xa8, 0xb6, 0xb9,
+    0xb6, 0xa5, 0x8d, 0x7c, 0x7c, 0x8d, 0xa5, 0xb6,
+    0xb8, 0xb6, 0xab, 0x94, 0x7d, 0x78, 0x87, 0x98,
+    0xb0, 0xb9, 0xbd, 0xaf, 0x96, 0x83, 0x7f, 0x83,
+    0x7f, 0x8b, 0x9d, 0xb0, 0xbc, 0xc1, 0xc1, 0xbf,
+    0xad, 0xa9, 0xa5, 0xa7, 0xac, 0xae, 0xaa, 0xa6,
+    0x80, 0x80, 0x81, 0x87, 0x92, 0x9f, 0xad, 0xb5,
+    0xa8, 0x9b, 0x88, 0x7a, 0x7a, 0x88, 0x9b, 0xa8,
+    0xb3, 0xaa, 0x9d, 0x8f, 0x85, 0x7f, 0x7e, 0x7e,
+    0x80, 0x7c, 0x7e, 0x8c, 0x9e, 0xa7, 0xa3, 0x9b,
+    0x95, 0x86, 0x79, 0x7b, 0x8a, 0x99, 0xa0, 0xa0,
+    0xa3, 0x98, 0x88, 0x7d, 0x7d, 0x88, 0x98, 0xa3,
+    0xa0, 0xa2, 0x9d, 0x8f, 0x7e, 0x7a, 0x83, 0x8f,
+    0x9a, 0xa3, 0xa7, 0x9e, 0x8d, 0x80, 0x7f, 0x83,
+    0x6f, 0x77, 0x84, 0x93, 0xa0, 0xa9, 0xae, 0xb0,
+    0x92, 0x8e, 0x8a, 0x8c, 0x91, 0x92, 0x8f, 0x8b,
+    0x7d, 0x7c, 0x7b, 0x7e, 0x84, 0x8c, 0x95, 0x9a,
+    0x94, 0x8c, 0x80, 0x78, 0x78, 0x80, 0x8c, 0x94,
+    0x9a, 0x94, 0x8c, 0x83, 0x7e, 0x7b, 0x7c, 0x7c,
+    0x81, 0x7d, 0x7d, 0x84, 0x8e, 0x92, 0x8d, 0x87,
+    0x88, 0x81, 0x7a, 0x7b, 0x84, 0x8a, 0x8a, 0x88,
+    0x90, 0x8b, 0x83, 0x7e, 0x7e, 0x83, 0x8b, 0x90,
+    0x89, 0x8d, 0x8f, 0x89, 0x7f, 0x7c, 0x80, 0x86,
+    0x87, 0x8d, 0x92, 0x8e, 0x84, 0x7e, 0x7e, 0x82,
+    0x80, 0x81, 0x81, 0x83, 0x85, 0x87, 0x89, 0x8a,
+    0x83, 0x7e, 0x7b, 0x7d, 0x81, 0x83, 0x80, 0x7b,
+    0x7e, 0x7d, 0x7b, 0x7b, 0x7c, 0x7f, 0x83, 0x85,
+    0x85, 0x82, 0x7e, 0x7b, 0x7b, 0x7e, 0x82, 0x85,
+    0x87, 0x85, 0x81, 0x7e, 0x7c, 0x7d, 0x7e, 0x80,
+    0x81, 0x7f, 0x7e, 0x80, 0x82, 0x82, 0x7f, 0x7b,
+    0x80, 0x7d, 0x7c, 0x7d, 0x7f, 0x80, 0x7d, 0x7a,
+    0x83, 0x82, 0x80, 0x7e, 0x7e, 0x80, 0x82, 0x83,
+    0x7c, 0x80, 0x83, 0x83, 0x80, 0x7e, 0x7e, 0x80,
+    0x7c, 0x7f, 0x82, 0x82, 0x7f, 0x7d, 0x7e, 0x80,
+    0x8b, 0x88, 0x83, 0x7d, 0x7a, 0x78, 0x78, 0x78,
+    0x81, 0x7d, 0x7a, 0x7c, 0x80, 0x82, 0x7f, 0x7a,
+    0x82, 0x81, 0x7f, 0x7e, 0x7d, 0x7c, 0x7c, 0x7c,
+    0x7f, 0x80, 0x81, 0x82, 0x82, 0x81, 0x80, 0x7f,
+    0x7f, 0x7f, 0x7f, 0x7f, 0x80, 0x82, 0x83, 0x84,
+    0x80, 0x80, 0x80, 0x7f, 0x7d, 0x7c, 0x7b, 0x7b,
+    0x7d, 0x7e, 0x7e, 0x7e, 0x7d, 0x7b, 0x7a, 0x79,
+    0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7e, 0x7e,
+    0x7a, 0x7b, 0x7c, 0x7e, 0x7f, 0x7f, 0x7e, 0x7d,
+    0x7b, 0x7b, 0x7c, 0x7d, 0x7e, 0x7e, 0x7e, 0x7e,
+    0x7f, 0x7e, 0x7d, 0x7d, 0x7d, 0x7f, 0x81, 0x82,
+    0x84, 0x80, 0x7d, 0x7f, 0x83, 0x85, 0x82, 0x7d,
+    0x82, 0x82, 0x83, 0x82, 0x81, 0x7f, 0x7d, 0x7b,
+    0x7f, 0x81, 0x83, 0x85, 0x85, 0x83, 0x81, 0x7f,
+    0x7d, 0x7e, 0x81, 0x83, 0x84, 0x84, 0x84, 0x84,
+    0x80, 0x82, 0x83, 0x81, 0x7d, 0x7c, 0x7e, 0x80,
+    0x7e, 0x80, 0x82, 0x80, 0x7d, 0x7b, 0x7d, 0x80,
+    0x7e, 0x7f, 0x7f, 0x80, 0x80, 0x7f, 0x7f, 0x7e,
+    0x7f, 0x7c, 0x7a, 0x7b, 0x7f, 0x81, 0x80, 0x7e,
+    0x81, 0x7e, 0x7c, 0x7c, 0x7f, 0x80, 0x7e, 0x7c,
+    0x77, 0x79, 0x7c, 0x7f, 0x82, 0x84, 0x84, 0x84,
+    0x83, 0x7f, 0x7c, 0x7d, 0x82, 0x84, 0x81, 0x7c,
+    0x7d, 0x7f, 0x81, 0x83, 0x83, 0x81, 0x7e, 0x7c,
+    0x7e, 0x7f, 0x81, 0x82, 0x82, 0x81, 0x7f, 0x7e,
+    0x7d, 0x7f, 0x81, 0x83, 0x83, 0x82, 0x7f, 0x7e,
+    0x7f, 0x82, 0x85, 0x83, 0x7e, 0x7d, 0x81, 0x86,
+    0x7f, 0x82, 0x83, 0x81, 0x7d, 0x7c, 0x81, 0x86,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x84, 0x7f, 0x79, 0x79, 0x7e, 0x82, 0x81, 0x7f,
+    0x87, 0x82, 0x7d, 0x7d, 0x80, 0x82, 0x7f, 0x7b,
+    0x7e, 0x80, 0x82, 0x83, 0x82, 0x7f, 0x7b, 0x78,
+    0x80, 0x7b, 0x78, 0x7a, 0x7f, 0x80, 0x7d, 0x79,
+    0x78, 0x7a, 0x7e, 0x82, 0x82, 0x81, 0x7e, 0x7c,
+    0x7d, 0x7d, 0x7e, 0x7f, 0x7f, 0x7e, 0x7d, 0x7d,
+    0x7c, 0x7e, 0x81, 0x82, 0x81, 0x7e, 0x7a, 0x77,
+};
+
+
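+/* A hedged reading of the table below, inferred only from the array name,
+ * size, and contents: it appears to hold the interleaved Cb/Cr chroma plane
+ * of the built-in logo (semi-planar, NV12-style layout assumed). 0x80 is
+ * the neutral 8-bit chroma value, so the all-0x80 rows render as gray. */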
+UWORD8 gau1_ihevcd_logo_uv[10240] =
+{
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+};
+
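+/* The 10240 UWORD16 entries below hold the decoder logo as RGB565 pixels,
+ * in the conventional RGB565 layout: bits [15:11] red, [10:5] green,
+ * [4:0] blue (so 0xFFFF is white). As a minimal illustrative sketch only
+ * (px, r, g, b are placeholder names, not decoder symbols), one entry
+ * expands to 8-bit-per-channel RGB via shifts plus bit replication:
+ *
+ *     UWORD16 px = gau2_ihevcd_logo_rgb565[i];
+ *     UWORD8  r  = (UWORD8)(((px >> 11) & 0x1F) << 3); r |= r >> 5;
+ *     UWORD8  g  = (UWORD8)(((px >>  5) & 0x3F) << 2); g |= g >> 6;
+ *     UWORD8  b  = (UWORD8)( (px        & 0x1F) << 3); b |= b >> 5;
+ */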
+const UWORD16 gau2_ihevcd_logo_rgb565[10240] =
+{
+
+    0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffd, 0xfffd, 0xffde, 0xffde, 0xffdd, 0xffdd,
+    0xffdd, 0xffdd, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffd, 0xfffd, 0xffdd, 0xffdd,
+    0xffdd, 0xffdd, 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xefff, 0xefff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xffdd, 0xffdd, 0xffde, 0xffde, 0xffdd, 0xffdd,
+    0xffdd, 0xffdd, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffd, 0xfffd, 0xffdd, 0xffdd,
+    0xffdd, 0xffdd, 0xfffe, 0xfffe, 0xffdc, 0xffdc, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+    0xf7df, 0xf7df, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xefff, 0xefff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffff, 0xffdf, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xfffe, 0xfffe,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffff, 0xffdf, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xfffe, 0xfffe,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xefff, 0xefff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xfffe, 0xfffe, 0xffde, 0xffde,
+    0xffde, 0xffde, 0xfffe, 0xfffe, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffde, 0xffde,
+    0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffde, 0xffde, 0xffde, 0xffdf, 0xffdf, 0xffde,
+    0xffde, 0xffbe, 0xffde, 0xffde, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf,
+    0xf7df, 0xf7df, 0xf7df, 0xf7df, 0xf7df, 0xf7df, 0xefff, 0xefff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xfffe, 0xfffe, 0xffde, 0xffde,
+    0xffde, 0xffde, 0xfffe, 0xfffe, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffde, 0xffde,
+    0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffde, 0xffde, 0xffbe, 0xffbe, 0xffbe, 0xffbe,
+    0xffbe, 0xffdf, 0xffbe, 0xffbe, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffdf, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffbd, 0xffbd,
+    0xffbd, 0xffbd, 0xffde, 0xffde, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde,
+    0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xffff, 0xffff, 0xffde, 0xffde, 0xff78, 0xff16, 0xfe73, 0xf612, 0xf5f2, 0xfe73,
+    0xff17, 0xff79, 0xffbb, 0xffbc, 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffdf, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffde, 0xffde,
+    0xffde, 0xffde, 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde,
+    0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xffff, 0xffff, 0xffde, 0xffde, 0xfef6, 0xee12, 0xcccd, 0xb3ea, 0xb3ea, 0xcccd,
+    0xee13, 0xfef6, 0xffbb, 0xffbb, 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffe, 0xfffe, 0xf7bd, 0xffff, 0xffff, 0xfffe, 0xfffc, 0xfffc, 0xffb9, 0xffb9,
+    0xff98, 0xffb8, 0xffb9, 0xffb9, 0xffda, 0xffda, 0xffdb, 0xffbb, 0xfffd, 0xffbc,
+    0xffde, 0xffff, 0xffdf, 0xffdf, 0xffde, 0xfffe, 0xfffe, 0xffde, 0xefdf, 0xefdf,
+    0xe7ff, 0xe7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xf7ff, 0xffdf, 0xffff,
+    0xffdf, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xefff, 0xf7ff,
+    0xffff, 0xf7df, 0xffdf, 0xffff, 0xffff, 0xffff, 0xefdf, 0xf7ff, 0xffff, 0xffdf,
+    0xff7b, 0xffdc, 0xff9a, 0xfef7, 0xdbc2, 0xe403, 0xf423, 0xf464, 0xf464, 0xec43,
+    0xd425, 0xcbe5, 0xf652, 0xff56, 0xfffc, 0xffdb, 0xfffe, 0xffff, 0xffff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffe, 0xfffe, 0xf7de, 0xffff, 0xffff, 0xfffe, 0xfffc, 0xf75a, 0xff17, 0xff17,
+    0xff16, 0xff36, 0xff37, 0xff37, 0xff38, 0xff38, 0xff18, 0xff7a, 0xfffd, 0xfffd,
+    0xfffe, 0xffff, 0xffde, 0xffff, 0xffde, 0xffde, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+    0xe7ff, 0xe7ff, 0xefff, 0xefff, 0xf7df, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7df, 0xefff, 0xefff, 0xefdf, 0xefff,
+    0xffff, 0xffff, 0xffff, 0xffdf, 0xf7df, 0xf7df, 0xf7ff, 0xf7ff, 0xffdf, 0xffdf,
+    0xffdc, 0xffdd, 0xfeb6, 0xcd10, 0xe403, 0xe424, 0xf443, 0xfc64, 0xf484, 0xf444,
+    0xdc46, 0xd425, 0xc4ac, 0xfe93, 0xfffc, 0xfffc, 0xf7fe, 0xf7de, 0xf7ff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffe, 0xfffe, 0xf7fe, 0xffff, 0xfffe, 0xfffe, 0xff99, 0xe674, 0xed4c, 0xed4c,
+    0xfd49, 0xfd6a, 0xfd6b, 0xfd6a, 0xed6c, 0xe56c, 0xdd2e, 0xfe32, 0xffdb, 0xffdb,
+    0xffde, 0xffff, 0xf7ff, 0xffff, 0xfffe, 0xfffe, 0xffdd, 0xfffd, 0xffff, 0xffff,
+    0xf7ff, 0xefff, 0xffff, 0xffde, 0xffff, 0xffff, 0xf7ff, 0xffff, 0xffff, 0xf7df,
+    0xf7ff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7fe, 0xfffe, 0xffff, 0xf7ff, 0xf7ff,
+    0xfffe, 0xffff, 0xfffe, 0xffdd, 0xffde, 0xffff, 0xf7ff, 0xf7ff, 0xf7de, 0xffff,
+    0xffdb, 0xffdb, 0xfdf1, 0xb3c8, 0xfc20, 0xfc20, 0xfc40, 0xfc40, 0xfc40, 0xfc40,
+    0xf442, 0xf442, 0xabc6, 0xf5ee, 0xfffb, 0xfffc, 0xf7fe, 0xf7dd, 0xf7fe, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xff99, 0xd5d2, 0xc407, 0xc427,
+    0xdc25, 0xdc45, 0xd446, 0xd446, 0xc447, 0xc427, 0xbc2a, 0xed8f, 0xff9a, 0xffdb,
+    0xffde, 0xfffe, 0xf7ff, 0xffff, 0xfffe, 0xffde, 0xffdd, 0xffbc, 0xf7de, 0xffde,
+    0xefff, 0xefff, 0xffff, 0xf7de, 0xffff, 0xffff, 0xf7df, 0xf7ff, 0xffff, 0xf7df,
+    0xffff, 0xf7df, 0xf7df, 0xf7df, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xffde, 0xffde, 0xffdd, 0xfffe, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffdb, 0xff9a, 0xfdf1, 0xbc09, 0xfc40, 0xfc41, 0xfc40, 0xfc40, 0xfc40, 0xfc40,
+    0xfc63, 0xfc63, 0xbc48, 0xf60f, 0xff9a, 0xffdb, 0xf7fe, 0xf7fe, 0xf7fe, 0xf7fe,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xf7ff, 0xffde, 0xffff, 0xff99, 0xdd90, 0xe405, 0xe425,
+    0xfc21, 0xfc22, 0xfc42, 0xfc22, 0xec23, 0xec03, 0xcc69, 0xfdcf, 0xffdb, 0xffdb,
+    0xf7ff, 0xf7ff, 0xefff, 0xf7ff, 0xffbd, 0xffde, 0xffbb, 0xffbb, 0xffba, 0xffba,
+    0xffdb, 0xfffb, 0xffba, 0xffba, 0xffdc, 0xfffd, 0xffde, 0xffde, 0xffff, 0xffff,
+    0xffff, 0xf7df, 0xffbd, 0xffde, 0xfffd, 0xffdc, 0xffbb, 0xffbb, 0xffdc, 0xfffd,
+    0xffdb, 0xff9a, 0xff9a, 0xffdb, 0xfffd, 0xffbc, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xffdc, 0xff9a, 0xfe11, 0xc44a, 0xfc42, 0xfc42, 0xfc40, 0xfc40, 0xfc40, 0xfc20,
+    0xfc62, 0xfc62, 0xcc89, 0xfe0f, 0xff7a, 0xffbb, 0xfffe, 0xffff, 0xffde, 0xffde,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffff, 0xffb9, 0xdd91, 0xec46, 0xec66,
+    0xfc62, 0xfc83, 0xfc83, 0xfc63, 0xf465, 0xf464, 0xd48a, 0xfdef, 0xffdb, 0xffdb,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffde, 0xffde, 0xffbb, 0xffbb, 0xffba, 0xffba,
+    0xffbb, 0xffba, 0xffba, 0xffdb, 0xff9b, 0xff9b, 0xfffe, 0xfffe, 0xf7df, 0xffff,
+    0xffff, 0xf7df, 0xffde, 0xfffe, 0xfffd, 0xffbc, 0xffba, 0xffdb, 0xffbc, 0xfffd,
+    0xffdb, 0xffdb, 0xffba, 0xffba, 0xffdd, 0xffdc, 0xf7ff, 0xefff, 0xf7ff, 0xf7ff,
+    0xffdc, 0xffbb, 0xf611, 0xb409, 0xfc22, 0xfc42, 0xfc20, 0xfc20, 0xfc20, 0xfc20,
+    0xfc42, 0xf442, 0xbc27, 0xf5ef, 0xff9a, 0xffdb, 0xffde, 0xffde, 0xffde, 0xfffe,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffdf, 0xf7ff, 0xf7ff, 0xffbf, 0xffff, 0xff99, 0xe571, 0xf404, 0xf424,
+    0xfc00, 0xfc00, 0xfc20, 0xfc00, 0xfc22, 0xfc01, 0xcc69, 0xf5cf, 0xffdb, 0xfffc,
+    0xefff, 0xefff, 0xefff, 0xefff, 0xfffe, 0xff7c, 0xfeb5, 0xfe53, 0xfe52, 0xfe72,
+    0xfe52, 0xfe31, 0xfe73, 0xfed4, 0xe634, 0xee54, 0xffbc, 0xfffd, 0xf79d, 0xffff,
+    0xffbe, 0xffde, 0xffdd, 0xffbc, 0xff17, 0xf675, 0xfe73, 0xfeb4, 0xf695, 0xf694,
+    0xfe93, 0xfeb4, 0xfe73, 0xf632, 0xeef8, 0xfffc, 0xf7ff, 0xefff, 0xefff, 0xefff,
+    0xfffd, 0xfffd, 0xf673, 0xbc8c, 0xe405, 0xec25, 0xfc22, 0xfc42, 0xfc21, 0xfc20,
+    0xf423, 0xec02, 0xc469, 0xfe31, 0xffbc, 0xffdc, 0xffde, 0xffbe, 0xffbe, 0xffdf,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7df, 0xf7ff, 0xf7ff, 0xffbf, 0xffff, 0xff99, 0xe570, 0xf424, 0xfc25,
+    0xfc00, 0xfc00, 0xfc20, 0xfc20, 0xfc22, 0xfc22, 0xcc8a, 0xfe10, 0xfffc, 0xfffc,
+    0xefff, 0xefff, 0xefff, 0xefff, 0xfffe, 0xe6da, 0xd54f, 0xbc8c, 0xcc8a, 0xd4cb,
+    0xd4cc, 0xccab, 0xbc8c, 0xcced, 0xac4c, 0xbcce, 0xff5b, 0xfffd, 0xf79d, 0xffff,
+    0xffde, 0xffff, 0xffdd, 0xff5b, 0xddf3, 0xbcce, 0xbc6b, 0xcced, 0xc52f, 0xb4ad,
+    0xc4ac, 0xcd0d, 0xbcac, 0xb44a, 0xc5d3, 0xffdb, 0xf7ff, 0xf7ff, 0xefff, 0xefff,
+    0xfffd, 0xfffd, 0xfed5, 0xd56f, 0xe404, 0xe425, 0xfc22, 0xfc43, 0xfc41, 0xfc20,
+    0xec03, 0xebe2, 0xe54d, 0xfeb3, 0xffdc, 0xffdc, 0xffdf, 0xffff, 0xffbe, 0xffbe,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xccc8, 0xee0c, 0xfffd, 0xfffd,
+    0xffdf, 0xffdf, 0xffde, 0xffde, 0xffd9, 0xf716, 0xfd26, 0xdc22, 0xfbe0, 0xfc20,
+    0xfc06, 0xfc06, 0xfc21, 0xfc21, 0xdc24, 0xe486, 0xff35, 0xffb7, 0xfffc, 0xfffd,
+    0xfffd, 0xfffd, 0xffd9, 0xff77, 0xfd8c, 0xdc67, 0xf402, 0xfc23, 0xfc22, 0xfc22,
+    0xf441, 0xf441, 0xdca4, 0xc3e1, 0xddef, 0xffb5, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+    0xffff, 0xfffe, 0xfffa, 0xffda, 0xdc2a, 0xdc4b, 0xe465, 0xe465, 0xe485, 0xdc65,
+    0xcc6d, 0xcc6c, 0xff9d, 0xff9d, 0xf7fc, 0xf7fd, 0xfffd, 0xfffd, 0xffdf, 0xffdf,
+    0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffff, 0xf7fe, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xffdd, 0xffdd, 0xf7fe, 0xffff, 0xf7fe, 0xf7de,
+    0xffff, 0xffff, 0xf7de, 0xffde, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffde,
+    0xffde, 0xffde, 0xffff, 0xffff, 0xffde, 0xffde, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffff, 0xffff, 0xffff, 0xffde, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+    0xffff, 0xffde, 0xffde, 0xffde, 0xffbd, 0xffbd, 0xff9d, 0xffde, 0xffff, 0xffff,
+    0xefff, 0xefff, 0xe7ff, 0xefff, 0xfffe, 0xffde, 0xffde, 0xfffe, 0xfffe, 0xfffe,
+    0xffde, 0xffbe, 0xffde, 0xffde, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xc4c7, 0xedec, 0xfffd, 0xffdd,
+    0xffdf, 0xffdf, 0xffde, 0xffde, 0xfffa, 0xff16, 0xfd26, 0xe442, 0xfbe0, 0xfc20,
+    0xfc06, 0xfc06, 0xfc21, 0xfc21, 0xdc24, 0xe466, 0xff15, 0xff97, 0xffdb, 0xffdc,
+    0xfffc, 0xffdc, 0xffb9, 0xff57, 0xfd8c, 0xdc47, 0xf3e2, 0xfc23, 0xfc42, 0xfc42,
+    0xfc61, 0xf441, 0xdca4, 0xcc01, 0xddef, 0xffb5, 0xfffd, 0xfffd, 0xf7ff, 0xf7ff,
+    0xfffe, 0xf7fe, 0xffda, 0xffda, 0xfe32, 0xfdb0, 0xf528, 0xe4a6, 0xe4a6, 0xf528,
+    0xf5d2, 0xfe54, 0xff9d, 0xffbd, 0xf7fd, 0xf7fd, 0xfffd, 0xfffd, 0xffdf, 0xffbf,
+    0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xf7df, 0xf7de, 0xf7ff, 0xf7ff,
+    0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffdd, 0xffdd, 0xffff, 0xffff, 0xf7ff, 0xf7de,
+    0xffff, 0xfffe, 0xf7de, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+    0xffde, 0xffde, 0xffbe, 0xffde, 0xffff, 0xffde, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xffdf, 0xffbe, 0xffbe, 0xffbe, 0xffff, 0xffff,
+    0xffde, 0xffde, 0xffde, 0xffde, 0xffbd, 0xffbd, 0xffbe, 0xffbe, 0xffdf, 0xffdf,
+    0xefff, 0xefff, 0xe7ff, 0xe7ff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+    0xfffe, 0xfffe, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xd4c6, 0xfdeb, 0xfffd, 0xfffd,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffc, 0xef59, 0xf549, 0xd465, 0xfc00, 0xfc40,
+    0xfc04, 0xfc04, 0xfc41, 0xfc41, 0xdc45, 0xe487, 0xff58, 0xffb9, 0xe7ff, 0xe7ff,
+    0xe7ff, 0xe7ff, 0xffdb, 0xff79, 0xfd8c, 0xdc68, 0xfc02, 0xfc43, 0xfbe2, 0xfbe2,
+    0xfc22, 0xfc01, 0xf465, 0xdba2, 0xf5b0, 0xff56, 0xffbf, 0xffbf, 0xffdf, 0xffdf,
+    0xffdf, 0xffdf, 0xffde, 0xffbd, 0xffde, 0xff7d, 0xf6d5, 0xee73, 0xee72, 0xfed3,
+    0xff7b, 0xffdd, 0xffdf, 0xffdf, 0xf7fe, 0xf7fe, 0xf7fd, 0xf7fd, 0xffbf, 0xffbf,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7fe, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xf7ff,
+    0xffff, 0xf7ff, 0xf7df, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7de, 0xf7fe, 0xfffe,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7df, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7df,
+    0xf7df, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xdcc6, 0xfdeb, 0xfffd, 0xfffd,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffc, 0xef39, 0xf569, 0xdc65, 0xfc00, 0xfc61,
+    0xfc24, 0xfc24, 0xfc61, 0xfc61, 0xdc66, 0xeca7, 0xff58, 0xffda, 0xe7ff, 0xefff,
+    0xefff, 0xe7ff, 0xfffb, 0xff9a, 0xfdac, 0xdc88, 0xfc22, 0xfc63, 0xfc02, 0xfc02,
+    0xfc22, 0xfc02, 0xf465, 0xdba2, 0xf5b0, 0xff56, 0xffbf, 0xffbf, 0xffdf, 0xffdf,
+    0xffdf, 0xffdf, 0xffde, 0xffde, 0xffbe, 0xffbe, 0xffb8, 0xffb8, 0xffb7, 0xffb7,
+    0xffbc, 0xffbc, 0xffdf, 0xffdf, 0xf7fe, 0xf7fe, 0xf7fd, 0xf7fd, 0xffbf, 0xffbf,
+    0xf7ff, 0xf7ff, 0xffff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7fe, 0xf7fe,
+    0xf7fe, 0xf7fe, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xf7df, 0xffff, 0xffff, 0xf7ff,
+    0xffff, 0xf7ff, 0xf7bf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7de, 0xf7de, 0xf7fe,
+    0xffff, 0xfffe, 0xf7de, 0xf7de, 0xf7df, 0xffff, 0xffff, 0xffff, 0xf7fe, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff,
+    0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xf7df, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xf7ff,
+    0xf7df, 0xf7df, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe484, 0xfda9, 0xfffd, 0xffdc,
+    0xf7ff, 0xefff, 0xf7ff, 0xf7ff, 0xfffd, 0xe75a, 0xed6b, 0xd487, 0xfc20, 0xfc61,
+    0xfc22, 0xfc02, 0xfc41, 0xfc20, 0xdc45, 0xe487, 0xff59, 0xffda, 0xe7ff, 0xe7ff,
+    0xe7ff, 0xdfff, 0xfffc, 0xff7a, 0xfd8c, 0xdc67, 0xfc01, 0xfc42, 0xfc42, 0xfc42,
+    0xfc63, 0xfc62, 0xe4a6, 0xcbe3, 0xe5f0, 0xff96, 0xfffe, 0xfffe, 0xefff, 0xefff,
+    0xefff, 0xefff, 0xf7fe, 0xf7fe, 0xe7ff, 0xe7ff, 0xf7fe, 0xf7fe, 0xfffb, 0xfffb,
+    0xffde, 0xffde, 0xefff, 0xefff, 0xefff, 0xefff, 0xf7fe, 0xf7fe, 0xffbf, 0xffbe,
+    0xf7ff, 0xf7ff, 0xffff, 0xf7ff, 0xffbd, 0xffbd, 0xffdc, 0xffdc, 0xffdb, 0xffdb,
+    0xffdc, 0xffdb, 0xffdc, 0xfffd, 0xfffd, 0xfffd, 0xffdf, 0xffff, 0xffbe, 0xffbe,
+    0xffdf, 0xffde, 0xffbe, 0xffdf, 0xffbe, 0xffbe, 0xffbe, 0xffde, 0xffbe, 0xffde,
+    0xffde, 0xffde, 0xf7ff, 0xefff, 0xefde, 0xf7fe, 0xfffd, 0xfffd, 0xffdb, 0xfffc,
+    0xffdc, 0xffdb, 0xffdc, 0xffdc, 0xffde, 0xffde, 0xf7df, 0xf7df, 0xf7fe, 0xf7fe,
+    0xf7ff, 0xf7ff, 0xefff, 0xefff, 0xefff, 0xefff, 0xf7ff, 0xf7ff, 0xfffd, 0xfffd,
+    0xffda, 0xffb9, 0xff98, 0xffd8, 0xf7de, 0xf7de, 0xf7de, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe484, 0xfda9, 0xfffd, 0xfffd,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffd, 0xef5a, 0xf56b, 0xd487, 0xfc20, 0xfc41,
+    0xfc02, 0xfbe2, 0xfc41, 0xfc20, 0xdc46, 0xe4a7, 0xff59, 0xffdb, 0xe7ff, 0xe7ff,
+    0xe7ff, 0xe7ff, 0xfffc, 0xff9a, 0xfdac, 0xdc67, 0xfc01, 0xfc42, 0xfc42, 0xfc42,
+    0xfc63, 0xfc62, 0xe4a6, 0xcbe3, 0xe5d0, 0xff76, 0xfffe, 0xfffe, 0xefff, 0xefff,
+    0xefff, 0xefff, 0xf7ff, 0xf7ff, 0xe7ff, 0xe7ff, 0xf7fe, 0xf7fe, 0xfffb, 0xfffc,
+    0xffff, 0xffff, 0xefff, 0xefff, 0xefff, 0xefff, 0xf7fe, 0xf7fe, 0xffbf, 0xffbf,
+    0xefff, 0xf7ff, 0xffff, 0xf7ff, 0xffdd, 0xfffe, 0xfffc, 0xffbc, 0xffdb, 0xffdb,
+    0xffdb, 0xffbb, 0xffdc, 0xffdc, 0xfffd, 0xfffd, 0xffdf, 0xffff, 0xffdf, 0xffbe,
+    0xffdf, 0xffde, 0xffde, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xfffe,
+    0xfffe, 0xfffe, 0xf7ff, 0xefff, 0xf7de, 0xf7ff, 0xfffd, 0xffdd, 0xffbb, 0xffbb,
+    0xff9a, 0xffbb, 0xffdc, 0xffdc, 0xffde, 0xffde, 0xffff, 0xffff, 0xf7fe, 0xf7fe,
+    0xf7ff, 0xf7ff, 0xefff, 0xefff, 0xefff, 0xefff, 0xffff, 0xf7ff, 0xffbc, 0xffdd,
+    0xffb9, 0xff99, 0xff77, 0xffb8, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe485, 0xfdaa, 0xfffd, 0xfffc,
+    0xffff, 0xffff, 0xffbb, 0xffbb, 0xffb8, 0xfed4, 0xfd29, 0xdc46, 0xfc21, 0xfc42,
+    0xfc22, 0xfc02, 0xfc41, 0xfc41, 0xec24, 0xf485, 0xff15, 0xff96, 0xffba, 0xffbb,
+    0xffdb, 0xffbb, 0xffb7, 0xff56, 0xfd8b, 0xec66, 0xfc21, 0xfc63, 0xfc62, 0xfc82,
+    0xfc82, 0xf462, 0xe4a5, 0xcbe2, 0xedcd, 0xff73, 0xffd9, 0xffd9, 0xfffb, 0xfffb,
+    0xfffb, 0xfffb, 0xfff9, 0xfff9, 0xffbd, 0xffdd, 0xffda, 0xffda, 0xffb9, 0xffb9,
+    0xffba, 0xffba, 0xffde, 0xffde, 0xefff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffdf, 0xffde,
+    0xefdf, 0xf7ff, 0xfffe, 0xffde, 0xffda, 0xffda, 0xff56, 0xfed4, 0xfe30, 0xfe2f,
+    0xfe2f, 0xfe0f, 0xfe30, 0xfe30, 0xf651, 0xf672, 0xfed5, 0xff58, 0xffb9, 0xffb9,
+    0xff99, 0xff37, 0xfef6, 0xff37, 0xff36, 0xff37, 0xff37, 0xff37, 0xff37, 0xff37,
+    0xff57, 0xff57, 0xfffb, 0xffdb, 0xffd9, 0xffd9, 0xff35, 0xfe72, 0xfdef, 0xfe0f,
+    0xfdef, 0xfe0f, 0xfe30, 0xfe31, 0xf673, 0xfed4, 0xff78, 0xffda, 0xfffd, 0xfffd,
+    0xfffd, 0xfffd, 0xffdc, 0xffdb, 0xff9a, 0xff99, 0xff16, 0xfe94, 0xfe10, 0xfe10,
+    0xfe0f, 0xfe0e, 0xfded, 0xfe0d, 0xf6b4, 0xff36, 0xffb9, 0xffda, 0xffdc, 0xffbb,
+    0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe485, 0xfdaa, 0xfffc, 0xffbc,
+    0xffdf, 0xf79e, 0xff59, 0xff59, 0xff56, 0xfeb4, 0xf508, 0xdc45, 0xfc21, 0xfc62,
+    0xfc43, 0xfc43, 0xfc41, 0xfc21, 0xe404, 0xec45, 0xfed4, 0xff55, 0xff79, 0xff7a,
+    0xff9a, 0xff9a, 0xff77, 0xff15, 0xfd6a, 0xec46, 0xfc01, 0xfc42, 0xfc62, 0xfc62,
+    0xfc82, 0xf462, 0xe4a5, 0xcbe2, 0xedcd, 0xff73, 0xff98, 0xff98, 0xf79a, 0xf79a,
+    0xf799, 0xff9a, 0xff98, 0xff98, 0xf79d, 0xf79c, 0xff79, 0xff59, 0xff57, 0xff57,
+    0xff79, 0xff7a, 0xffdd, 0xffde, 0xefff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffdf, 0xffdf,
+    0xf7ff, 0xf7ff, 0xffde, 0xffbd, 0xff99, 0xff79, 0xfe72, 0xed8f, 0xcc28, 0xc428,
+    0xc427, 0xc407, 0xbc28, 0xbc28, 0xb44a, 0xb46a, 0xd570, 0xf674, 0xff58, 0xff99,
+    0xff57, 0xfe95, 0xee12, 0xf633, 0xee12, 0xee12, 0xf612, 0xf633, 0xf632, 0xf633,
+    0xf633, 0xf633, 0xffbb, 0xffdb, 0xffb9, 0xff58, 0xf631, 0xc4ec, 0xc428, 0xc428,
+    0xcc48, 0xc428, 0xbc28, 0xbc08, 0xbc6b, 0xd54f, 0xf695, 0xff99, 0xfffd, 0xfffd,
+    0xfffd, 0xfffd, 0xffdc, 0xffdb, 0xff99, 0xff99, 0xe5d1, 0xc4cd, 0xb3e8, 0xbc08,
+    0xcc27, 0xcc27, 0xcc06, 0xcc06, 0xb48c, 0xd5b0, 0xf717, 0xffba, 0xffdc, 0xffbb,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+    0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xd487, 0xfdcc, 0xf79b, 0xffdc,
+    0xff9a, 0xbd10, 0xd383, 0xec46, 0xf403, 0xf424, 0xf444, 0xfc64, 0xfc42, 0xfc42,
+    0xfc44, 0xfc43, 0xfc43, 0xfc63, 0xfc64, 0xfc43, 0xf444, 0xf424, 0xe445, 0xec45,
+    0xe445, 0xe445, 0xec45, 0xec45, 0xfc64, 0xfc64, 0xfc63, 0xfc63, 0xfc22, 0xfc01,
+    0xfc02, 0xfc22, 0xfc23, 0xfc03, 0xfc05, 0xfbe4, 0xec26, 0xec26, 0xe427, 0xe427,
+    0xec06, 0xec06, 0xf3e5, 0xf3e5, 0xe426, 0xe446, 0xec25, 0xec25, 0xec24, 0xec24,
+    0xd446, 0xd426, 0xa42b, 0xf6b4, 0xfffe, 0xfffe, 0xf7df, 0xf7ff, 0xffff, 0xfffe,
+    0xffdd, 0xff9c, 0xff79, 0xff58, 0xfdcf, 0xcc28, 0xdba2, 0xf445, 0xfc43, 0xfc22,
+    0xfc21, 0xfc42, 0xfc42, 0xf422, 0xf423, 0xf463, 0xec86, 0xdc04, 0xdc25, 0xfe0c,
+    0xff10, 0xfd49, 0xd3c3, 0xec86, 0xec65, 0xe424, 0xe424, 0xe445, 0xe424, 0xe424,
+    0xec85, 0xfd07, 0xff94, 0xfef2, 0xfd8b, 0xd446, 0xdbe3, 0xec65, 0xfc63, 0xfc43,
+    0xfc22, 0xfc22, 0xfc43, 0xfc43, 0xec44, 0xec24, 0xdc04, 0xdc04, 0xf717, 0xfffb,
+    0xffb8, 0xff97, 0xffb5, 0xfeb1, 0xf4c7, 0xe465, 0xec23, 0xf443, 0xf423, 0xf402,
+    0xec23, 0xf444, 0xf465, 0xec44, 0xf4c6, 0xdbe2, 0xcc05, 0xfd4b, 0xfef4, 0xff97,
+    0xffdb, 0xfffb, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+    0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xd467, 0xfdac, 0xef5a, 0xffbb,
+    0xff59, 0xbcef, 0xd383, 0xec46, 0xf424, 0xf444, 0xfc44, 0xfc64, 0xfc42, 0xfc42,
+    0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xfc43, 0xfc23, 0xec24, 0xec04, 0xe425, 0xe425,
+    0xe425, 0xe425, 0xec04, 0xec24, 0xf423, 0xf443, 0xfc22, 0xfc22, 0xfc22, 0xfc22,
+    0xfc02, 0xfc02, 0xfc23, 0xfc23, 0xfc25, 0xfc05, 0xec06, 0xec06, 0xe407, 0xe407,
+    0xebe6, 0xebe6, 0xf3c5, 0xf3c5, 0xdc05, 0xdc25, 0xec05, 0xec05, 0xec04, 0xe404,
+    0xd426, 0xcc06, 0xac4b, 0xf694, 0xffde, 0xfffe, 0xf7df, 0xf7ff, 0xffff, 0xf7de,
+    0xff7b, 0xffdd, 0xff38, 0xd591, 0xcc28, 0xd469, 0xf465, 0xec45, 0xfc22, 0xfc22,
+    0xfc21, 0xfc42, 0xfc63, 0xfc42, 0xf463, 0xfc84, 0xdc25, 0xdc04, 0xd404, 0xf4e8,
+    0xfdcb, 0xf4c7, 0xd3e3, 0xe465, 0xec65, 0xe445, 0xe445, 0xec65, 0xec65, 0xe444,
+    0xf4a6, 0xfd28, 0xfef2, 0xf5ad, 0xdc67, 0xd446, 0xec65, 0xec65, 0xfc43, 0xfc43,
+    0xfc22, 0xfc22, 0xfc43, 0xfc43, 0xec45, 0xec44, 0xe445, 0xe425, 0xbd30, 0xde74,
+    0xff77, 0xffb8, 0xfeb1, 0xed6c, 0xec86, 0xec66, 0xf423, 0xf443, 0xfc43, 0xfc43,
+    0xf444, 0xf464, 0xf444, 0xec24, 0xec64, 0xe403, 0xd426, 0xe4a8, 0xe5cf, 0xfef4,
+    0xffdb, 0xffdb, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+    0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xdca9, 0xfded, 0xf7bb, 0xfffc,
+    0xff97, 0xd52e, 0xfba0, 0xfc62, 0xfc00, 0xfc00, 0xfc43, 0xfc43, 0xfc43, 0xfc43,
+    0xfc64, 0xfc64, 0xfc43, 0xfc63, 0xfc63, 0xfc63, 0xfc41, 0xfc41, 0xfc41, 0xfc41,
+    0xfc41, 0xfc41, 0xfc41, 0xfc41, 0xfc63, 0xfc63, 0xfc83, 0xfc63, 0xfc42, 0xfc42,
+    0xfc22, 0xfc22, 0xfc23, 0xfc23, 0xfc23, 0xfc22, 0xfc63, 0xfc63, 0xfc62, 0xfc62,
+    0xfc42, 0xfc42, 0xfc21, 0xfc21, 0xfc40, 0xfc40, 0xfc20, 0xfc21, 0xfc41, 0xfc21,
+    0xfc62, 0xfc62, 0xcc68, 0xfe90, 0xff9c, 0xffdd, 0xffdf, 0xf7bf, 0xfffd, 0xf7dd,
+    0xff97, 0xfe93, 0xed0b, 0xd427, 0xebe4, 0xf465, 0xfc42, 0xfc22, 0xfc20, 0xfc00,
+    0xfc00, 0xfc00, 0xfc20, 0xfc00, 0xfc20, 0xfc40, 0xfc41, 0xfc82, 0xfc41, 0xfc41,
+    0xfcc3, 0xfca3, 0xfc41, 0xfc82, 0xfc41, 0xfc20, 0xfc21, 0xfc61, 0xfc41, 0xfc40,
+    0xfc61, 0xfcc3, 0xfd48, 0xec64, 0xe3e1, 0xf443, 0xfc83, 0xfc42, 0xfc00, 0xfc41,
+    0xfc20, 0xfc20, 0xfc20, 0xfc20, 0xfc20, 0xfc20, 0xfc21, 0xfc21, 0xbbe6, 0xd4a9,
+    0xfe6f, 0xfed0, 0xfd07, 0xe403, 0xfc21, 0xfc41, 0xfc20, 0xfc20, 0xfc20, 0xfc41,
+    0xfc62, 0xfc42, 0xfc43, 0xfc43, 0xfc00, 0xfc00, 0xfc42, 0xf401, 0xd468, 0xfe0e,
+    0xffda, 0xffba, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+    0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xd488, 0xfdad, 0xef7a, 0xffdc,
+    0xff77, 0xd50e, 0xfb80, 0xfc41, 0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc43, 0xfc43,
+    0xfc44, 0xfc44, 0xfc22, 0xfc23, 0xfc42, 0xfc42, 0xfc21, 0xfc21, 0xfc20, 0xfc20,
+    0xfc41, 0xfc41, 0xfc41, 0xfc41, 0xfc62, 0xfc62, 0xfc63, 0xfc43, 0xfc62, 0xfc42,
+    0xfc22, 0xfc22, 0xfc23, 0xfc23, 0xfc43, 0xfc23, 0xfc63, 0xfc63, 0xfc62, 0xfc62,
+    0xfc42, 0xfc42, 0xfc21, 0xfc21, 0xfc40, 0xfc40, 0xfc20, 0xfc20, 0xfc21, 0xfc20,
+    0xfc62, 0xfc62, 0xcca8, 0xfe90, 0xffbc, 0xffdd, 0xffdf, 0xf79f, 0xfffd, 0xffdd,
+    0xff35, 0xe52d, 0xc3c6, 0xd448, 0xfca6, 0xf465, 0xfc01, 0xfc62, 0xfc20, 0xfc00,
+    0xfc20, 0xfc20, 0xfc40, 0xfc40, 0xfc40, 0xfc40, 0xfc41, 0xfca3, 0xfc62, 0xfc00,
+    0xfc41, 0xfc61, 0xfc41, 0xfc62, 0xfc62, 0xfc61, 0xfc61, 0xfc82, 0xfc61, 0xfc41,
+    0xfc41, 0xfc82, 0xe444, 0xec64, 0xf484, 0xf484, 0xfc42, 0xfc42, 0xfc41, 0xfc41,
+    0xfc20, 0xfc20, 0xfc20, 0xfc20, 0xfc21, 0xfc21, 0xfc41, 0xfc41, 0xcc89, 0xcc68,
+    0xf50a, 0xfd6b, 0xf485, 0xe403, 0xfc42, 0xfc42, 0xfc61, 0xfc41, 0xfc21, 0xfc41,
+    0xfc62, 0xfc42, 0xfc43, 0xfc64, 0xfc40, 0xfc40, 0xfca4, 0xf401, 0xcbe6, 0xfd6c,
+    0xff99, 0xffda, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+    0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xe467, 0xfd8b, 0xef7a, 0xffdc,
+    0xff77, 0xcd2e, 0xfb80, 0xfc61, 0xfc21, 0xfc21, 0xfc45, 0xfc45, 0xfc43, 0xfc43,
+    0xfc42, 0xfc42, 0xfc22, 0xfc22, 0xfc22, 0xfc22, 0xfc21, 0xfc21, 0xfc21, 0xfc41,
+    0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfca2, 0xf482,
+    0xfc62, 0xfc62, 0xfc63, 0xfc63, 0xfc63, 0xfc63, 0xf482, 0xf482, 0xf4a1, 0xf4a1,
+    0xfc82, 0xfc82, 0xfc62, 0xfc62, 0xfc40, 0xfc40, 0xfc02, 0xfc02, 0xfc01, 0xfbe1,
+    0xf441, 0xf441, 0xcc87, 0xfe8f, 0xffbc, 0xffdd, 0xffbf, 0xff9e, 0xfffc, 0xffdb,
+    0xfca6, 0xfc85, 0xfc64, 0xf443, 0xfc22, 0xfc42, 0xfc42, 0xfc41, 0xfc20, 0xfc21,
+    0xfc41, 0xfc42, 0xfc63, 0xfc83, 0xfc84, 0xfc63, 0xfc22, 0xfc62, 0xfc62, 0xfc42,
+    0xfc42, 0xfc42, 0xfc22, 0xfc42, 0xfc62, 0xfc42, 0xfc42, 0xfc62, 0xfc42, 0xfc21,
+    0xfc21, 0xfc41, 0xe465, 0xec85, 0xec85, 0xec44, 0xec03, 0xec24, 0xfc44, 0xfc64,
+    0xfc63, 0xfc63, 0xfc42, 0xfc42, 0xfc21, 0xfc21, 0xfc41, 0xfc41, 0xfca4, 0xfc43,
+    0xf423, 0xfc64, 0xfcc6, 0xf485, 0xec64, 0xec64, 0xf485, 0xf465, 0xfc64, 0xfc84,
+    0xfc63, 0xfc43, 0xfc22, 0xfc22, 0xfc40, 0xfc20, 0xfc62, 0xfc22, 0xcc06, 0xe4c9,
+    0xf6d6, 0xffba, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+    0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xe487, 0xfdcc, 0xf7bb, 0xfffc,
+    0xffb8, 0xcd4e, 0xfba0, 0xfc62, 0xfc00, 0xfc00, 0xfc45, 0xfc45, 0xfc43, 0xfc43,
+    0xfc42, 0xfc42, 0xfc43, 0xfc43, 0xfc63, 0xfc43, 0xfc42, 0xfc42, 0xfc42, 0xfc42,
+    0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfc62, 0xfc62, 0xfc63, 0xfc62, 0xf4a2, 0xf482,
+    0xfc62, 0xfc62, 0xfc63, 0xfc63, 0xfc63, 0xfc42, 0xf482, 0xf482, 0xf4a1, 0xf4a1,
+    0xfc82, 0xfc82, 0xfc62, 0xfc62, 0xfc60, 0xfc60, 0xfc22, 0xfc42, 0xfc22, 0xfc22,
+    0xfc62, 0xfc61, 0xcc46, 0xfe8f, 0xff9c, 0xffbc, 0xffdf, 0xffdf, 0xffdb, 0xef38,
+    0xe3e3, 0xec44, 0xfc64, 0xf443, 0xfc22, 0xfc63, 0xfc42, 0xfc21, 0xfc41, 0xfc41,
+    0xfc21, 0xfc21, 0xfc22, 0xfc42, 0xfc23, 0xf402, 0xfc42, 0xfc21, 0xfc22, 0xfc83,
+    0xfc83, 0xfc42, 0xfc42, 0xfc63, 0xfc21, 0xfc21, 0xfc41, 0xfc42, 0xfc42, 0xfc41,
+    0xfc41, 0xfc21, 0xf4c6, 0xe444, 0xdc03, 0xe424, 0xec44, 0xec24, 0xf403, 0xf423,
+    0xfc43, 0xfc43, 0xfc42, 0xfc42, 0xfc41, 0xfc21, 0xfc21, 0xfc21, 0xfc84, 0xfc84,
+    0xf443, 0xf443, 0xf485, 0xec64, 0xdc03, 0xdbe3, 0xe3e3, 0xe423, 0xf423, 0xf443,
+    0xfc22, 0xfc43, 0xfc22, 0xfc22, 0xfc40, 0xfc00, 0xfc42, 0xfc62, 0xdc68, 0xdc67,
+    0xddf2, 0xffb9, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+    0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xec24, 0xfd69, 0xef7a, 0xf7bb,
+    0xff78, 0xb50e, 0xe360, 0xfc22, 0xfc03, 0xfc03, 0xf447, 0xf467, 0xfc43, 0xfc44,
+    0xfc40, 0xfc40, 0xfc01, 0xfc01, 0xfc22, 0xfc02, 0xfc03, 0xf403, 0xec04, 0xec24,
+    0xec04, 0xec03, 0xf402, 0xf403, 0xfc01, 0xfc22, 0xfc21, 0xfc01, 0xfc42, 0xfc42,
+    0xfc03, 0xfc23, 0xfc04, 0xfc04, 0xfc03, 0xfbe3, 0xf3e1, 0xfbe1, 0xf401, 0xf401,
+    0xf3e3, 0xf3e3, 0xfbe4, 0xfbe4, 0xec84, 0xec84, 0xf447, 0xf447, 0xfc25, 0xfc25,
+    0xec63, 0xe462, 0xc487, 0xfeb0, 0xff9c, 0xff9c, 0xffde, 0xffde, 0xff37, 0xcdd1,
+    0xfc21, 0xfbc0, 0xfbc0, 0xfc41, 0xfc82, 0xfc41, 0xfc22, 0xfc42, 0xfc84, 0xfc64,
+    0xec45, 0xec24, 0xe466, 0xeca8, 0xecc9, 0xeca9, 0xfc65, 0xf3c3, 0xf3c2, 0xfc24,
+    0xfc24, 0xfc03, 0xfc03, 0xfc23, 0xfc23, 0xfc24, 0xfc44, 0xfc44, 0xfc44, 0xfc44,
+    0xfc44, 0xfc23, 0xdcc8, 0xe4c8, 0xed0a, 0xfd8c, 0xfdcd, 0xfdad, 0xf50a, 0xeca9,
+    0xec67, 0xec67, 0xf466, 0xf466, 0xfc64, 0xfc64, 0xfc43, 0xfc42, 0xfc20, 0xfc00,
+    0xfc83, 0xfc42, 0xd425, 0xe4a7, 0xe56d, 0xdd4c, 0xe54d, 0xe56d, 0xece9, 0xdc67,
+    0xf3e3, 0xfc44, 0xfc42, 0xfc21, 0xfc40, 0xfc40, 0xfc43, 0xfc84, 0xdcca, 0xc428,
+    0xc571, 0xffdb, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+    0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xec45, 0xfd8a, 0xf79b, 0xfffc,
+    0xffb9, 0xbd2f, 0xe380, 0xfc22, 0xfbe2, 0xfc02, 0xec46, 0xf467, 0xfc44, 0xfc44,
+    0xfc40, 0xfc40, 0xfc22, 0xfc42, 0xfc43, 0xfc43, 0xfc24, 0xfc23, 0xf424, 0xf424,
+    0xf424, 0xec24, 0xf423, 0xfc43, 0xfc42, 0xfc63, 0xfc62, 0xfc42, 0xfc42, 0xfc41,
+    0xfc23, 0xfc23, 0xfc04, 0xfbe4, 0xfbe3, 0xfbc3, 0xfc22, 0xfc22, 0xfc42, 0xfc42,
+    0xfc23, 0xfc23, 0xfc04, 0xfc04, 0xeca5, 0xeca5, 0xfc67, 0xfc67, 0xfc46, 0xfc45,
+    0xec83, 0xec83, 0xcca8, 0xfed0, 0xff9c, 0xff9c, 0xffde, 0xffbe, 0xe694, 0xa48c,
+    0xfc00, 0xfc20, 0xfc41, 0xfc62, 0xfc41, 0xfc01, 0xfc22, 0xfc63, 0xfc43, 0xf423,
+    0xec24, 0xec45, 0xf4c8, 0xfd8b, 0xfe0e, 0xfe0e, 0xfd28, 0xfc65, 0xfc24, 0xfc44,
+    0xfc44, 0xfc44, 0xfc65, 0xfc44, 0xfc44, 0xfc44, 0xfc44, 0xfc44, 0xfc44, 0xfc44,
+    0xfc23, 0xfc03, 0xd446, 0xfdcc, 0xff12, 0xff73, 0xff74, 0xff74, 0xfe90, 0xfd6c,
+    0xe446, 0xe467, 0xf466, 0xf466, 0xfc84, 0xfc64, 0xfc63, 0xfc63, 0xfc40, 0xfba0,
+    0xfc83, 0xfc62, 0xcbc4, 0xfd6a, 0xffb6, 0xff75, 0xffb6, 0xff75, 0xfe2e, 0xe4a8,
+    0xf3c2, 0xfc23, 0xfc41, 0xfc01, 0xfc00, 0xfc60, 0xfc43, 0xfc43, 0xd4aa, 0xbbe7,
+    0xbd10, 0xffdb, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xdc46, 0xfd6a, 0xf7fe, 0xefbe,
+    0xffde, 0xf79d, 0xf7bd, 0xf79d, 0xff99, 0xeed7, 0xfce5, 0xec22, 0xfc22, 0xfc63,
+    0xfc23, 0xfc22, 0xfc22, 0xfc21, 0xe424, 0xec65, 0xe754, 0xf7b5, 0xff5b, 0xff7b,
+    0xff9d, 0xff7d, 0xf7f9, 0xef98, 0xfd6b, 0xe447, 0xfc00, 0xfc41, 0xfc41, 0xfc62,
+    0xfc62, 0xfc42, 0xfc45, 0xe382, 0xd5ef, 0xff95, 0xffbc, 0xf79b, 0xff9d, 0xff9d,
+    0xe7f8, 0xeff9, 0xfe30, 0xd449, 0xfc21, 0xfc21, 0xfc42, 0xfc42, 0xfc41, 0xfc21,
+    0xfc42, 0xfc22, 0xc449, 0xfe91, 0xfffd, 0xffdc, 0xffda, 0xff79, 0xfdac, 0xdc67,
+    0xfc02, 0xfc02, 0xfbe1, 0xfc02, 0xfc02, 0xfc02, 0xfc44, 0xfc24, 0xdcea, 0xc427,
+    0xc58f, 0xff77, 0xfffc, 0xffdb, 0xffdc, 0xfffd, 0xff99, 0xff79, 0xfed4, 0xd52e,
+    0xd426, 0xe4a8, 0xfc64, 0xfc85, 0xfc22, 0xfc42, 0xfc02, 0xfc01, 0xfc43, 0xfc23,
+    0xfbe3, 0xfbe3, 0xd6b5, 0xf799, 0xffbd, 0xffde, 0xffbf, 0xff9e, 0xfffc, 0xffba,
+    0xdd2a, 0xcca8, 0xfc02, 0xfc63, 0xfc21, 0xfc01, 0xfc62, 0xfc62, 0xfc41, 0xfc20,
+    0xd447, 0xd447, 0xeef7, 0xffbb, 0xfffe, 0xffdd, 0xfffe, 0xffbc, 0xfffb, 0xf739,
+    0xe4ea, 0xcc27, 0xfc20, 0xfc40, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe487, 0xfdab, 0xffff, 0xf7ff,
+    0xfffe, 0xffde, 0xfffe, 0xfffe, 0xffda, 0xeef7, 0xfd06, 0xec22, 0xfc22, 0xfc63,
+    0xfc23, 0xfc22, 0xfc42, 0xfc22, 0xe445, 0xec86, 0xef95, 0xfff6, 0xff9c, 0xffdd,
+    0xffbe, 0xff9e, 0xfffa, 0xf7b9, 0xfd8c, 0xe467, 0xfc21, 0xfc62, 0xfc41, 0xfc62,
+    0xfc62, 0xfc42, 0xfc45, 0xe382, 0xd60f, 0xff95, 0xfffd, 0xffdc, 0xffde, 0xffde,
+    0xeff9, 0xf7fa, 0xfe71, 0xdc8a, 0xfc21, 0xfc21, 0xfc42, 0xfc42, 0xfc41, 0xfc21,
+    0xfc42, 0xfc22, 0xc449, 0xfe71, 0xfffc, 0xffdc, 0xffba, 0xff17, 0xfd4a, 0xdc67,
+    0xfc02, 0xfc02, 0xfbe1, 0xfc02, 0xfc02, 0xfc02, 0xfc24, 0xfc24, 0xcc48, 0xed6c,
+    0xf736, 0xffb8, 0xffdb, 0xfffb, 0xfffd, 0xf7bc, 0xfffb, 0xffba, 0xff56, 0xf632,
+    0xed0a, 0xdc88, 0xebe2, 0xfc43, 0xfc22, 0xfc42, 0xfc22, 0xfc01, 0xfc44, 0xfc23,
+    0xfbe3, 0xfc03, 0xe717, 0xffda, 0xffbd, 0xffde, 0xffbf, 0xff9e, 0xfffc, 0xffbb,
+    0xedac, 0xd4e9, 0xfc02, 0xfc43, 0xfc21, 0xfc01, 0xfc62, 0xfc62, 0xfc40, 0xfc00,
+    0xd468, 0xdcc9, 0xf718, 0xffbb, 0xfffd, 0xfffd, 0xfffd, 0xffbc, 0xfffc, 0xff59,
+    0xed0a, 0xcc27, 0xfc20, 0xfc40, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe486, 0xfdab, 0xffff, 0xf7ff,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xfffb, 0xf738, 0xfd26, 0xf423, 0xfc22, 0xfc62,
+    0xfc23, 0xfc23, 0xfc22, 0xfc02, 0xe425, 0xec86, 0xef95, 0xfff7, 0xffbd, 0xffdd,
+    0xffbf, 0xff9e, 0xfffb, 0xf7b9, 0xfd8c, 0xe467, 0xfc01, 0xfc42, 0xfc42, 0xfc62,
+    0xfc42, 0xfc42, 0xfc45, 0xe382, 0xd60f, 0xff96, 0xfffd, 0xffbd, 0xffdf, 0xffde,
+    0xeffa, 0xf7fa, 0xfe51, 0xdc6a, 0xfc21, 0xfc21, 0xfc42, 0xfc42, 0xfc42, 0xfc42,
+    0xfc43, 0xf423, 0xc469, 0xfe71, 0xffdc, 0xffdc, 0xff98, 0xf694, 0xf4c7, 0xec65,
+    0xfc42, 0xfc42, 0xfc21, 0xfc42, 0xfc22, 0xfc22, 0xf445, 0xf424, 0xd4cb, 0xfeb2,
+    0xfffb, 0xffda, 0xf7bd, 0xfffd, 0xffff, 0xf7be, 0xfffd, 0xf7bc, 0xffda, 0xff78,
+    0xfe50, 0xe4ea, 0xe3c2, 0xfc65, 0xfc22, 0xfc42, 0xfc22, 0xfc21, 0xfc64, 0xfc23,
+    0xf403, 0xfc04, 0xef99, 0xfffb, 0xffbe, 0xffbe, 0xffbf, 0xff9f, 0xfffc, 0xffdb,
+    0xfe4f, 0xdd4b, 0xfc02, 0xfc23, 0xfc01, 0xfc01, 0xfc62, 0xfc83, 0xfc41, 0xfc00,
+    0xdca9, 0xf56c, 0xff7a, 0xffbb, 0xfffe, 0xfffe, 0xfffe, 0xf7bd, 0xfffc, 0xff9a,
+    0xf56c, 0xd468, 0xfc20, 0xfc20, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe466, 0xfd8b, 0xffff, 0xf7ff,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xfffb, 0xf738, 0xfd26, 0xec22, 0xfc21, 0xfc62,
+    0xfc23, 0xfc23, 0xfc02, 0xfc01, 0xe424, 0xec66, 0xef95, 0xfff7, 0xffbd, 0xffdd,
+    0xffbf, 0xff9e, 0xfffb, 0xf7b9, 0xfd8c, 0xe447, 0xfbe1, 0xfc22, 0xfc42, 0xfc62,
+    0xfc42, 0xfc42, 0xfc45, 0xe382, 0xd60f, 0xffb6, 0xffdd, 0xffbc, 0xffde, 0xffbe,
+    0xeff9, 0xeffa, 0xfe51, 0xdc49, 0xfc21, 0xfc21, 0xfc42, 0xfc42, 0xfc42, 0xfc42,
+    0xfc43, 0xf423, 0xcc8a, 0xfe92, 0xffdc, 0xfffc, 0xff98, 0xe633, 0xec65, 0xec86,
+    0xfc62, 0xfc62, 0xfc42, 0xfc42, 0xfc22, 0xfc22, 0xf425, 0xec04, 0xfe51, 0xff55,
+    0xffda, 0xffba, 0xfffe, 0xfffe, 0xf7be, 0xffff, 0xffdd, 0xf7bc, 0xffda, 0xffda,
+    0xff13, 0xf56c, 0xebe2, 0xfc85, 0xfc22, 0xfc42, 0xfc22, 0xfc22, 0xfc64, 0xfc23,
+    0xfc04, 0xfc24, 0xf7ba, 0xfffb, 0xff9d, 0xffbe, 0xffbf, 0xffbf, 0xfffc, 0xffdb,
+    0xfed1, 0xe56b, 0xfc02, 0xfc02, 0xfc01, 0xfc01, 0xfc62, 0xfc62, 0xfc41, 0xfbe0,
+    0xdcca, 0xfe2f, 0xffbb, 0xffdb, 0xffdd, 0xfffe, 0xffdd, 0xffdd, 0xfffc, 0xffbb,
+    0xfdcd, 0xd489, 0xfc00, 0xfc20, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfdab, 0xffff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf738, 0xfd06, 0xec02, 0xfc02, 0xfc63,
+    0xfc23, 0xfc23, 0xfc43, 0xfc22, 0xe425, 0xec86, 0xefb6, 0xfff8, 0xffde, 0xffde,
+    0xffdf, 0xffbf, 0xfffb, 0xf7ba, 0xfd8c, 0xe468, 0xfc02, 0xfc42, 0xfc42, 0xfc62,
+    0xfc42, 0xfc22, 0xfc45, 0xe382, 0xd610, 0xffb7, 0xfffe, 0xffdd, 0xffdf, 0xffdf,
+    0xeffb, 0xf7fb, 0xfe52, 0xdc6a, 0xfc01, 0xfc21, 0xfc42, 0xfc62, 0xfc62, 0xfc42,
+    0xf444, 0xf424, 0xccaa, 0xfe92, 0xffdb, 0xfffc, 0xff97, 0xe5d0, 0xec02, 0xfc85,
+    0xfc82, 0xfc82, 0xfc62, 0xfc62, 0xfc43, 0xfc42, 0xec46, 0xe425, 0xff56, 0xff77,
+    0xff9b, 0xffbc, 0xffff, 0xffff, 0xefbf, 0xffff, 0xefff, 0xefff, 0xfffd, 0xffdc,
+    0xff55, 0xf5af, 0xe3c2, 0xf444, 0xfc42, 0xfc42, 0xfc22, 0xfc42, 0xfc64, 0xfc43,
+    0xf424, 0xfc65, 0xf7ba, 0xfffb, 0xff9e, 0xffdf, 0xffdf, 0xffbf, 0xfffd, 0xf7db,
+    0xff12, 0xdd6c, 0xf402, 0xfc23, 0xfc22, 0xfc22, 0xfc63, 0xfc63, 0xfc41, 0xfbe0,
+    0xdceb, 0xfe91, 0xffdc, 0xfffc, 0xf7de, 0xffff, 0xf7fe, 0xf7de, 0xfffd, 0xffbc,
+    0xfe0f, 0xd4aa, 0xfc00, 0xfc21, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfdab, 0xffff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd06, 0xec02, 0xfc02, 0xfc63,
+    0xfc23, 0xfc23, 0xfc23, 0xfc02, 0xe425, 0xec66, 0xef96, 0xfff8, 0xffbd, 0xffbd,
+    0xffbf, 0xff9f, 0xfffb, 0xf7b9, 0xfd8c, 0xe448, 0xfc01, 0xfc42, 0xfc42, 0xfc62,
+    0xfc42, 0xfc22, 0xfc45, 0xe382, 0xd610, 0xffb7, 0xfffe, 0xffdd, 0xffdf, 0xffdf,
+    0xf7fb, 0xf7fb, 0xfe52, 0xdc6a, 0xfc01, 0xfc21, 0xfc42, 0xfc62, 0xfc62, 0xfc42,
+    0xf444, 0xf424, 0xc46a, 0xfe92, 0xffbb, 0xffdb, 0xff97, 0xe5b0, 0xe3c1, 0xfc84,
+    0xfc82, 0xfc82, 0xfc62, 0xfc62, 0xfc43, 0xfc42, 0xec46, 0xe425, 0xff15, 0xff97,
+    0xfffd, 0xffbc, 0xf7ff, 0xffff, 0xf7df, 0xf7bf, 0xefff, 0xefff, 0xffdc, 0xff9b,
+    0xff55, 0xf5cf, 0xe3c2, 0xf444, 0xfc42, 0xfc42, 0xfc21, 0xfc22, 0xfc64, 0xfc23,
+    0xf444, 0xfca6, 0xf79a, 0xfffb, 0xff9e, 0xffdf, 0xffdf, 0xffbf, 0xfffd, 0xffdc,
+    0xff12, 0xdd6c, 0xf3e2, 0xfc23, 0xfc22, 0xfc22, 0xfc63, 0xfc63, 0xfc41, 0xfbe0,
+    0xdceb, 0xfed2, 0xffbb, 0xfffc, 0xf7de, 0xfffe, 0xf7fe, 0xf7fe, 0xfffc, 0xffbb,
+    0xfe30, 0xdcca, 0xfbe0, 0xfc41, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe466, 0xfd8b, 0xffff, 0xf7ff,
+    0xffff, 0xffde, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd06, 0xec23, 0xfc22, 0xfc63,
+    0xfc23, 0xfc23, 0xfc03, 0xfbe2, 0xdc05, 0xe446, 0xef96, 0xfff8, 0xff9e, 0xffbe,
+    0xffbf, 0xff9f, 0xfffb, 0xef9a, 0xfd6c, 0xe448, 0xfbe2, 0xfc23, 0xfc42, 0xfc62,
+    0xfc43, 0xfc22, 0xfc46, 0xe383, 0xd610, 0xffb7, 0xffde, 0xffbe, 0xffdf, 0xffdf,
+    0xeffb, 0xeffb, 0xfe32, 0xd44b, 0xfc01, 0xfc21, 0xfc42, 0xfc43, 0xfc63, 0xfc43,
+    0xf444, 0xec44, 0xbc6a, 0xfeb3, 0xffbb, 0xffdb, 0xff97, 0xe5d0, 0xe3a1, 0xfc84,
+    0xfc62, 0xfc62, 0xfc42, 0xfc62, 0xfc43, 0xfc23, 0xec46, 0xe426, 0xff36, 0xff97,
+    0xfffd, 0xfffd, 0xe7df, 0xf7ff, 0xefff, 0xe7df, 0xe7ff, 0xe7ff, 0xffbd, 0xffbd,
+    0xff76, 0xf5d0, 0xe3c2, 0xfc85, 0xfc42, 0xfc42, 0xfc21, 0xfc42, 0xfc84, 0xfc43,
+    0xf465, 0xfce7, 0xf7bb, 0xfffc, 0xffbe, 0xffff, 0xffff, 0xf7bf, 0xfffd, 0xfffd,
+    0xff53, 0xdd8d, 0xf3e3, 0xfc24, 0xfc23, 0xfc22, 0xfc63, 0xfc63, 0xfc42, 0xfbe0,
+    0xdceb, 0xfed3, 0xf79b, 0xfffd, 0xf7ff, 0xf7fe, 0xf7ff, 0xf7ff, 0xffdd, 0xff9c,
+    0xfe50, 0xdceb, 0xfbe0, 0xfc62, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfdab, 0xffff, 0xf7ff,
+    0xffff, 0xfffe, 0xffff, 0xffff, 0xfffb, 0xf738, 0xfd06, 0xf423, 0xfc22, 0xfc63,
+    0xfc23, 0xfc23, 0xfc23, 0xfc02, 0xdc05, 0xec67, 0xf7b7, 0xfff9, 0xffde, 0xffde,
+    0xffbf, 0xff9f, 0xfffc, 0xf7ba, 0xfd8d, 0xe468, 0xfc02, 0xfc43, 0xfc42, 0xfc62,
+    0xfc43, 0xfc22, 0xfc46, 0xe383, 0xd610, 0xffb7, 0xffde, 0xffde, 0xffdf, 0xffdf,
+    0xeffb, 0xf7fb, 0xfe52, 0xd44b, 0xfc01, 0xfc21, 0xfc42, 0xfc43, 0xfc63, 0xfc43,
+    0xf444, 0xec44, 0xbc6a, 0xfed3, 0xffdb, 0xffdb, 0xffb7, 0xedf0, 0xebc1, 0xfc84,
+    0xfc62, 0xfc62, 0xfc42, 0xfc62, 0xfc43, 0xfc43, 0xec46, 0xe446, 0xffb8, 0xff56,
+    0xfffd, 0xfffd, 0xe7df, 0xefff, 0xefff, 0xefff, 0xe7ff, 0xe7ff, 0xffdd, 0xfffe,
+    0xff97, 0xed8f, 0xd361, 0xfc86, 0xfc42, 0xfc42, 0xfc21, 0xfc21, 0xfc84, 0xfc43,
+    0xf485, 0xfd08, 0xf7db, 0xfffc, 0xffbe, 0xffff, 0xffdf, 0xf79f, 0xfffd, 0xfffd,
+    0xff74, 0xddad, 0xf403, 0xfc24, 0xfc22, 0xfc22, 0xfc63, 0xfc84, 0xfc42, 0xfbe0,
+    0xdceb, 0xfed2, 0xf75a, 0xfffd, 0xf7ff, 0xf7de, 0xffff, 0xffff, 0xffdc, 0xf79b,
+    0xfe51, 0xdceb, 0xfbe0, 0xfc62, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8b, 0xffff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd07, 0xec23, 0xfc22, 0xfc43,
+    0xfc23, 0xfc44, 0xfc23, 0xfc03, 0xdc26, 0xec87, 0xef97, 0xfff8, 0xffbe, 0xffde,
+    0xffbf, 0xffbf, 0xfffc, 0xefba, 0xfd8d, 0xe469, 0xfc02, 0xfc43, 0xfc42, 0xfc62,
+    0xfc43, 0xfc22, 0xfc46, 0xe383, 0xd610, 0xffb7, 0xffff, 0xffde, 0xffdf, 0xffdf,
+    0xeffc, 0xf7fc, 0xfe53, 0xd44b, 0xfc21, 0xfc21, 0xfc43, 0xfc63, 0xfc63, 0xfc43,
+    0xf444, 0xf444, 0xbc49, 0xfef4, 0xffbb, 0xffdc, 0xffd9, 0xee52, 0xec44, 0xf485,
+    0xfc43, 0xfc22, 0xfc02, 0xfc42, 0xfc43, 0xfc03, 0xec25, 0xec05, 0xfe71, 0xff34,
+    0xffda, 0xfffb, 0xf7ff, 0xeffe, 0xe7df, 0xf7ff, 0xf7ff, 0xefbe, 0xffbb, 0xfffc,
+    0xff14, 0xed4d, 0xec03, 0xf424, 0xfc42, 0xfc62, 0xfc42, 0xfc22, 0xfc64, 0xfc43,
+    0xf485, 0xfce7, 0xf7bb, 0xfffc, 0xffdf, 0xffbf, 0xffdf, 0xffdf, 0xfffd, 0xf7fd,
+    0xff74, 0xd58d, 0xebe3, 0xf444, 0xfc23, 0xfc22, 0xfc63, 0xfc63, 0xfc42, 0xf3c0,
+    0xdceb, 0xfe92, 0xffbd, 0xfffd, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffd, 0xffbc,
+    0xfe30, 0xd4cb, 0xfc01, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8b, 0xffff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd07, 0xec23, 0xfc22, 0xfc43,
+    0xfc23, 0xfc44, 0xfc23, 0xfc03, 0xdc26, 0xec87, 0xef97, 0xfff8, 0xffbe, 0xffde,
+    0xffbf, 0xffbf, 0xfffc, 0xefba, 0xfd8d, 0xe469, 0xfc02, 0xfc43, 0xfc42, 0xfc62,
+    0xfc43, 0xfc22, 0xfc46, 0xe383, 0xd610, 0xffb7, 0xffff, 0xffde, 0xffdf, 0xffdf,
+    0xeffc, 0xf7fc, 0xfe53, 0xd44b, 0xfc21, 0xfc21, 0xfc43, 0xfc63, 0xfc63, 0xfc43,
+    0xf444, 0xf444, 0xc46a, 0xfef4, 0xffbb, 0xffbb, 0xffd8, 0xf673, 0xfca6, 0xf465,
+    0xfc43, 0xfc42, 0xfc22, 0xfc22, 0xfc23, 0xfc02, 0xf425, 0xf425, 0xc4aa, 0xfed3,
+    0xfffb, 0xffda, 0xe7bd, 0xf7ff, 0xf7ff, 0xe79e, 0xefbe, 0xf7ff, 0xfffc, 0xff9a,
+    0xfe0f, 0xdcca, 0xf424, 0xfc85, 0xfc42, 0xfc62, 0xfc42, 0xfc22, 0xfc64, 0xfc43,
+    0xf485, 0xfce7, 0xf7bb, 0xfffc, 0xffdf, 0xffbf, 0xffdf, 0xffdf, 0xfffd, 0xf7fd,
+    0xff74, 0xd58d, 0xebe3, 0xf444, 0xfc23, 0xfc22, 0xfc63, 0xfc63, 0xfc42, 0xf3c0,
+    0xdceb, 0xfe92, 0xffbd, 0xfffd, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffd, 0xffbc,
+    0xfe30, 0xd4cb, 0xfc01, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8b, 0xffff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+    0xfc23, 0xfc44, 0xfc23, 0xfc02, 0xdc26, 0xec87, 0xefb6, 0xfff8, 0xffbe, 0xffde,
+    0xffbf, 0xffbf, 0xfffc, 0xefba, 0xfd8d, 0xe468, 0xfc02, 0xfc43, 0xfc62, 0xfc62,
+    0xfc42, 0xfc22, 0xfc46, 0xe3a3, 0xd610, 0xffb7, 0xffff, 0xffde, 0xffdf, 0xffdf,
+    0xeffc, 0xeffc, 0xfe53, 0xd44b, 0xfc22, 0xfc42, 0xfc43, 0xfc63, 0xfc42, 0xfc42,
+    0xfc43, 0xfc22, 0xc449, 0xfeb2, 0xffdc, 0xffdc, 0xffdb, 0xff58, 0xfdcd, 0xdc88,
+    0xfc23, 0xfc23, 0xfc02, 0xfc23, 0xfc02, 0xfbe2, 0xfc24, 0xfc24, 0xcc06, 0xf56b,
+    0xff34, 0xffd7, 0xffd9, 0xfff9, 0xfffa, 0xffb9, 0xffb9, 0xffd9, 0xff96, 0xfe51,
+    0xecc8, 0xdc26, 0xf423, 0xfc84, 0xfc42, 0xfc62, 0xfc42, 0xfc22, 0xfc64, 0xfc43,
+    0xf465, 0xfce7, 0xf7bb, 0xfffc, 0xffdf, 0xffbf, 0xffdf, 0xffdf, 0xf7fd, 0xf7fd,
+    0xff74, 0xd5ad, 0xebe2, 0xf444, 0xfc23, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xf3c0,
+    0xdceb, 0xfe92, 0xffbd, 0xffdd, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffd, 0xffbc,
+    0xfe30, 0xd4cb, 0xfc01, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8b, 0xffff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+    0xfc23, 0xfc44, 0xfc23, 0xfc02, 0xdc26, 0xec87, 0xefb6, 0xfff8, 0xffbe, 0xffde,
+    0xffbf, 0xffbf, 0xfffc, 0xefba, 0xfd8d, 0xe468, 0xfc02, 0xfc43, 0xfc62, 0xfc62,
+    0xfc42, 0xfc22, 0xfc46, 0xe3a3, 0xd610, 0xffb7, 0xffff, 0xffde, 0xffdf, 0xffdf,
+    0xeffc, 0xeffc, 0xfe53, 0xd44b, 0xfc22, 0xfc42, 0xfc43, 0xfc63, 0xfc42, 0xfc42,
+    0xfc43, 0xfc22, 0xbc28, 0xfe91, 0xfffd, 0xfffd, 0xfffb, 0xffba, 0xfe2e, 0xcc26,
+    0xfc03, 0xfc03, 0xfc02, 0xfc23, 0xfc22, 0xfbe2, 0xfc24, 0xfc44, 0xe4e9, 0xcc26,
+    0xcd6d, 0xff14, 0xff98, 0xff57, 0xff99, 0xffda, 0xff98, 0xff57, 0xfe31, 0xd50c,
+    0xdc46, 0xe467, 0xfc64, 0xfc64, 0xfc42, 0xfc62, 0xfc42, 0xfc22, 0xfc64, 0xfc43,
+    0xf465, 0xfce7, 0xf7bb, 0xfffc, 0xffdf, 0xffbf, 0xffdf, 0xffdf, 0xf7fd, 0xf7fd,
+    0xff74, 0xd5ad, 0xebe2, 0xf444, 0xfc23, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xf3c0,
+    0xdceb, 0xfe92, 0xffbd, 0xffdd, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffd, 0xffbc,
+    0xfe30, 0xd4cb, 0xfc01, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8c, 0xffff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffc, 0xf719, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+    0xfc24, 0xfc44, 0xfc22, 0xfc22, 0xdc25, 0xec87, 0xefb6, 0xfff8, 0xffbd, 0xffde,
+    0xffdf, 0xffbf, 0xfffb, 0xefda, 0xfd8d, 0xe468, 0xfc01, 0xfc42, 0xfc61, 0xfc61,
+    0xfc42, 0xfc41, 0xfc45, 0xe3a2, 0xd610, 0xffd6, 0xfffe, 0xffde, 0xffdf, 0xffdf,
+    0xeffb, 0xeffc, 0xfe53, 0xd44b, 0xfc22, 0xfc43, 0xfc43, 0xfc63, 0xfc41, 0xfc41,
+    0xfc21, 0xfc21, 0xcc48, 0xfe91, 0xfffe, 0xfffd, 0xf7dd, 0xf7dc, 0xfe93, 0xb46a,
+    0xec03, 0xf423, 0xfc22, 0xfc63, 0xfc42, 0xfc21, 0xfc22, 0xfc43, 0xfca5, 0xec03,
+    0xdc45, 0xece8, 0xf54b, 0xfe0d, 0xfe4f, 0xfdcd, 0xfdcb, 0xfd29, 0xec45, 0xe424,
+    0xf443, 0xfc84, 0xfc83, 0xfc63, 0xfc42, 0xfc63, 0xfc43, 0xfc22, 0xfc64, 0xfc43,
+    0xfc64, 0xfcc6, 0xffba, 0xfffb, 0xffdf, 0xffbe, 0xffdf, 0xffdf, 0xfffd, 0xf7fc,
+    0xff74, 0xd5ac, 0xebe2, 0xfc43, 0xfc22, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xfbc0,
+    0xdceb, 0xfe72, 0xffbc, 0xffdd, 0xffff, 0xf7ff, 0xf7ff, 0xf7df, 0xfffd, 0xffbc,
+    0xfe30, 0xdccb, 0xfbe1, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8c, 0xffff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffc, 0xf719, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+    0xfc24, 0xfc44, 0xfc22, 0xfc22, 0xdc25, 0xec87, 0xefb6, 0xfff8, 0xffbd, 0xffde,
+    0xffdf, 0xffbf, 0xfffb, 0xefda, 0xfd8d, 0xe468, 0xfc01, 0xfc42, 0xfc61, 0xfc61,
+    0xfc42, 0xfc41, 0xfc45, 0xe3a2, 0xd610, 0xffd6, 0xfffe, 0xffde, 0xffdf, 0xffdf,
+    0xeffb, 0xeffc, 0xfe53, 0xd44b, 0xfc22, 0xfc43, 0xfc43, 0xfc63, 0xfc41, 0xfc41,
+    0xfc21, 0xfc21, 0xcc89, 0xfe91, 0xf7fd, 0xf7fd, 0xf7dc, 0xfffd, 0xff56, 0xe5f0,
+    0xec23, 0xf423, 0xfc22, 0xfc63, 0xfc42, 0xfc22, 0xfc42, 0xfc43, 0xf464, 0xf464,
+    0xdc86, 0xd445, 0xd426, 0xe4a8, 0xf4c9, 0xe488, 0xe486, 0xdc45, 0xe424, 0xec65,
+    0xfc85, 0xfc84, 0xfc42, 0xfc63, 0xfc42, 0xfc63, 0xfc43, 0xfc22, 0xfc64, 0xfc43,
+    0xfc64, 0xfcc6, 0xffba, 0xfffb, 0xffdf, 0xffbe, 0xffdf, 0xffdf, 0xfffd, 0xf7fc,
+    0xff74, 0xd5ac, 0xebe2, 0xfc43, 0xfc22, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xfbc0,
+    0xdceb, 0xfe72, 0xffbc, 0xffdd, 0xffff, 0xf7ff, 0xf7ff, 0xf7df, 0xfffd, 0xffbc,
+    0xfe30, 0xdccb, 0xfbe1, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8c, 0xffff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffc, 0xf719, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+    0xfc24, 0xfc44, 0xfc22, 0xfc22, 0xdc25, 0xec86, 0xefb6, 0xfff8, 0xffbd, 0xffde,
+    0xffdf, 0xffbf, 0xfffb, 0xefda, 0xfd8c, 0xe468, 0xfc01, 0xfc42, 0xfc61, 0xfc61,
+    0xfc61, 0xfc41, 0xfc65, 0xe3a2, 0xd630, 0xffd6, 0xfffe, 0xffdd, 0xffdf, 0xffdf,
+    0xeffb, 0xeffc, 0xfe53, 0xd44b, 0xf443, 0xfc43, 0xfc43, 0xfc63, 0xfc40, 0xfc20,
+    0xfc20, 0xfc00, 0xcc68, 0xfe71, 0xf7fd, 0xf7fe, 0xf7ff, 0xf7ff, 0xfffa, 0xff99,
+    0xeca5, 0xe484, 0xfc42, 0xfc63, 0xfc41, 0xfc41, 0xfc21, 0xfc41, 0xfc42, 0xfc62,
+    0xfc43, 0xfc23, 0xfc03, 0xf3c2, 0xfba2, 0xfc04, 0xfc41, 0xfc41, 0xfc62, 0xfc82,
+    0xfc82, 0xfc41, 0xfc21, 0xfc41, 0xfc42, 0xfc63, 0xfc43, 0xfc23, 0xfc44, 0xfc43,
+    0xfc44, 0xfcc6, 0xffba, 0xffdb, 0xffde, 0xffbe, 0xffdf, 0xffdf, 0xfffc, 0xf7fc,
+    0xff73, 0xd5ac, 0xf3e2, 0xfc23, 0xfc22, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xfbc0,
+    0xdceb, 0xfe71, 0xffbc, 0xffdd, 0xffff, 0xffff, 0xffff, 0xffde, 0xfffd, 0xffbc,
+    0xfe30, 0xdccb, 0xfbe1, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+    0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8c, 0xffff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffc, 0xf719, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+    0xfc24, 0xfc44, 0xfc22, 0xfc22, 0xdc25, 0xec86, 0xefb6, 0xfff8, 0xffbd, 0xffde,
+    0xffdf, 0xffbf, 0xfffb, 0xefda, 0xfd8c, 0xe468, 0xfc01, 0xfc42, 0xfc61, 0xfc61,
+    0xfc61, 0xfc41, 0xfc65, 0xe3a2, 0xd630, 0xffd6, 0xfffe, 0xffdd, 0xffdf, 0xffdf,
+    0xeffb, 0xeffc, 0xfe53, 0xd44b, 0xf443, 0xfc43, 0xfc43, 0xfc63, 0xfc40, 0xfc20,
+    0xfc20, 0xfc00, 0xcc48, 0xfe91, 0xf7fe, 0xfffe, 0xf7ff, 0xefde, 0xffb9, 0xffba,
+    0xf4e6, 0xeca5, 0xf442, 0xfc42, 0xfc21, 0xfc21, 0xfc21, 0xfc21, 0xfc01, 0xfc83,
+    0xfc43, 0xf402, 0xfc44, 0xfc45, 0xfbe3, 0xfc24, 0xfc41, 0xfc61, 0xfc41, 0xfc21,
+    0xfc21, 0xfc42, 0xfc62, 0xfc42, 0xfc42, 0xfc63, 0xfc43, 0xfc23, 0xfc44, 0xfc43,
+    0xfc44, 0xfcc6, 0xffba, 0xffdb, 0xffde, 0xffbe, 0xffdf, 0xffdf, 0xfffc, 0xf7fc,
+    0xff73, 0xd5ac, 0xf3e2, 0xfc23, 0xfc22, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xfbc0,
+    0xdceb, 0xfe71, 0xffbc, 0xffdd, 0xffff, 0xffff, 0xffff, 0xffde, 0xfffd, 0xffbc,
+    0xfe30, 0xdccb, 0xfbe1, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+    0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xf7de, 0xf7ff, 0xf7ff, 0xffbd, 0xfffe, 0xffb8, 0xe58f, 0xf424, 0xf424,
+    0xfc00, 0xfc00, 0xfc20, 0xfc20, 0xfc22, 0xfc22, 0xe469, 0xfdcf, 0xffbb, 0xffbb,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffda, 0xfef6, 0xfd08, 0xe404, 0xfc00, 0xfc41,
+    0xfc20, 0xfc20, 0xfc20, 0xfc60, 0xdc23, 0xeca5, 0xff36, 0xffd8, 0xefbc, 0xf7fe,
+    0xf7fe, 0xf7fe, 0xffda, 0xff58, 0xfd8b, 0xe446, 0xfbe1, 0xfc42, 0xfc20, 0xfc00,
+    0xfc41, 0xfbe0, 0xec44, 0xe3e3, 0xe5d0, 0xff97, 0xf7fe, 0xf7de, 0xefff, 0xefff,
+    0xfffc, 0xffdb, 0xfe51, 0xd4ab, 0xfc01, 0xfc21, 0xfc20, 0xfc20, 0xfc20, 0xfc00,
+    0xf423, 0xf423, 0xccab, 0xfe72, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffdd, 0xffdd,
+    0xff37, 0xe50e, 0xcba6, 0xe429, 0xfc86, 0xfc04, 0xfbc0, 0xfc62, 0xfc00, 0xfc00,
+    0xfc20, 0xfc40, 0xfc62, 0xfc41, 0xf483, 0xfc83, 0xfc41, 0xfc00, 0xfc42, 0xfc83,
+    0xfc42, 0xf3e1, 0xfc22, 0xfc63, 0xfc41, 0xfc41, 0xfc20, 0xfc20, 0xfc41, 0xfc00,
+    0xfc42, 0xfd05, 0xffbb, 0xffdb, 0xffbe, 0xffde, 0xffbf, 0xff9f, 0xffbb, 0xffbb,
+    0xff52, 0xedac, 0xe462, 0xe462, 0xfc21, 0xfc41, 0xfc23, 0xfc03, 0xf462, 0xe3e0,
+    0xe4e8, 0xfe8f, 0xff99, 0xff99, 0xf7df, 0xffff, 0xffff, 0xf7bf, 0xff99, 0xff79,
+    0xfe4e, 0xe4e8, 0xec21, 0xf462, 0xfc20, 0xfcc0, 0xfc61, 0xfc20, 0xe487, 0xdc26,
+    0xdd2f, 0xff78, 0xffbd, 0xffbd, 0xffde, 0xffde, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffff, 0xffde, 0xf7ff, 0xf7ff, 0xffde, 0xfffe, 0xffb8, 0xe590, 0xf424, 0xf424,
+    0xfc00, 0xfc20, 0xfc21, 0xfc20, 0xfc22, 0xfc22, 0xdc49, 0xfd8e, 0xff9a, 0xffbb,
+    0xefdf, 0xf7ff, 0xf7ff, 0xf7ff, 0xffda, 0xff17, 0xfd29, 0xec45, 0xfc21, 0xfc61,
+    0xfc20, 0xfc00, 0xfc20, 0xfc40, 0xdc03, 0xec85, 0xff15, 0xffd8, 0xefbc, 0xf7fe,
+    0xf7fe, 0xf7fe, 0xffb9, 0xff58, 0xfd6b, 0xe446, 0xfbe0, 0xfc22, 0xfc40, 0xfc20,
+    0xfc82, 0xfc41, 0xf465, 0xe3e3, 0xddb0, 0xff77, 0xffff, 0xf7fe, 0xefff, 0xefdf,
+    0xffbb, 0xffbb, 0xfe51, 0xcc8a, 0xfc21, 0xfc41, 0xfc20, 0xfc40, 0xfc20, 0xfc20,
+    0xfc43, 0xf423, 0xc48a, 0xfe51, 0xffdc, 0xffbc, 0xf7bd, 0xfffe, 0xfffd, 0xfffe,
+    0xff78, 0xfeb5, 0xfd0c, 0xd3c7, 0xf3a2, 0xfc65, 0xfc41, 0xfba0, 0xfc20, 0xfc00,
+    0xfc40, 0xfc60, 0xfc82, 0xfc61, 0xf483, 0xfc83, 0xfc21, 0xfc41, 0xfc21, 0xfbe0,
+    0xfc83, 0xfd05, 0xfca4, 0xf3c0, 0xfc21, 0xfc41, 0xfc00, 0xfc20, 0xfc41, 0xfc00,
+    0xfc42, 0xfce5, 0xffbb, 0xffdb, 0xffde, 0xffde, 0xffbf, 0xffbf, 0xffbb, 0xffbb,
+    0xff32, 0xe56b, 0xdc41, 0xe462, 0xfc41, 0xfc42, 0xfc02, 0xfbe2, 0xfca3, 0xec00,
+    0xe4e9, 0xfe8f, 0xffba, 0xffba, 0xf7df, 0xf7bf, 0xffff, 0xf7df, 0xffda, 0xff9a,
+    0xfe2e, 0xe4c8, 0xec21, 0xfc82, 0xfc40, 0xfc80, 0xfc41, 0xfc41, 0xe487, 0xcbc4,
+    0xd4ee, 0xffb9, 0xffdd, 0xffdd, 0xffde, 0xffde, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffff, 0xffb9, 0xd591, 0xe426, 0xe446,
+    0xfc23, 0xfc43, 0xfc44, 0xfc43, 0xec45, 0xec25, 0xcc6a, 0xfdd0, 0xffbc, 0xffdc,
+    0xefdf, 0xf7ff, 0xf7ff, 0xf7ff, 0xffba, 0xfef7, 0xf50a, 0xd427, 0xf424, 0xfc85,
+    0xfc63, 0xfc43, 0xfc84, 0xfc84, 0xcc26, 0xe4c8, 0xff37, 0xfffa, 0xefdd, 0xf7fe,
+    0xf7fe, 0xf7fe, 0xffdb, 0xff79, 0xfdae, 0xdc89, 0xf425, 0xfc86, 0xfc64, 0xf443,
+    0xfc85, 0xf444, 0xe467, 0xcba4, 0xd570, 0xff78, 0xffff, 0xf7fe, 0xefff, 0xefff,
+    0xffdc, 0xfffd, 0xfe94, 0xc4ac, 0xe445, 0xec45, 0xfc43, 0xfc43, 0xfc43, 0xfc22,
+    0xe446, 0xe445, 0xbc8c, 0xfe94, 0xfffe, 0xffdd, 0xf7be, 0xfffe, 0xfffe, 0xffdd,
+    0xfffc, 0xf7fc, 0xf736, 0xcdf1, 0xcc89, 0xbc28, 0xe425, 0xec66, 0xfc64, 0xfc44,
+    0xfc44, 0xfc44, 0xfc45, 0xfc24, 0xfc05, 0xfc25, 0xec87, 0xdc05, 0xcbe5, 0xf50a,
+    0xfe0e, 0xf54b, 0xcc06, 0xd426, 0xe466, 0xe487, 0xe446, 0xe446, 0xe487, 0xdc67,
+    0xdc88, 0xed0a, 0xf7dc, 0xffdd, 0xffdf, 0xffdf, 0xffdf, 0xffbf, 0xffdd, 0xffdd,
+    0xff75, 0xd58e, 0xc445, 0xcc86, 0xec87, 0xec86, 0xf427, 0xf427, 0xdc66, 0xcbe4,
+    0xcccb, 0xfe72, 0xffbc, 0xfffd, 0xf7ff, 0xefdf, 0xefdf, 0xf7ff, 0xfffd, 0xff9b,
+    0xf610, 0xccab, 0xcc05, 0xdc87, 0xf484, 0xf464, 0xdc46, 0xe4a7, 0xccab, 0xb3c7,
+    0xbcef, 0xffdb, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xf7ff, 0xffde, 0xffff, 0xff79, 0xd570, 0xe447, 0xec67,
+    0xfc44, 0xfc64, 0xfc85, 0xfc64, 0xf466, 0xec66, 0xcc4a, 0xf5b0, 0xffdc, 0xffdc,
+    0xf7ff, 0xf7ff, 0xefdf, 0xf7ff, 0xffdb, 0xff38, 0xf54b, 0xd447, 0xf424, 0xfc85,
+    0xfc84, 0xfca4, 0xf463, 0xf443, 0xcc05, 0xe4c8, 0xf716, 0xfffa, 0xf7fe, 0xf7fe,
+    0xf7fe, 0xf7fe, 0xffdb, 0xff59, 0xfd8d, 0xdc69, 0xec04, 0xf425, 0xfca5, 0xfc64,
+    0xfc85, 0xf444, 0xe467, 0xd3e5, 0xe5d1, 0xffd9, 0xf7de, 0xf7fe, 0xf7ff, 0xefff,
+    0xfffd, 0xfffd, 0xfe73, 0xbc6b, 0xec65, 0xec66, 0xfc64, 0xfc64, 0xfc63, 0xfc43,
+    0xec66, 0xec66, 0xb44b, 0xfe74, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffdd, 0xffbd,
+    0xf7fb, 0xf7db, 0xfff9, 0xffd9, 0xfe50, 0xccaa, 0xe425, 0xfcc8, 0xfc65, 0xfc44,
+    0xfc23, 0xfc44, 0xfc24, 0xfc04, 0xfc04, 0xfc25, 0xec87, 0xdc25, 0xecc9, 0xfed1,
+    0xff94, 0xed0a, 0xc3a4, 0xece9, 0xe446, 0xe467, 0xe446, 0xe446, 0xe487, 0xdc67,
+    0xdc88, 0xecea, 0xf7dd, 0xfffd, 0xffdf, 0xffff, 0xffdf, 0xffbf, 0xffdd, 0xffdd,
+    0xff75, 0xd58e, 0xc465, 0xcc86, 0xe466, 0xe466, 0xf427, 0xf448, 0xdc87, 0xd425,
+    0xd4ec, 0xfe72, 0xff9b, 0xfffd, 0xf7ff, 0xf7ff, 0xefdf, 0xf7ff, 0xffdc, 0xff7b,
+    0xf610, 0xd4ec, 0xdc66, 0xe4a7, 0xf464, 0xec43, 0xdc45, 0xe487, 0xccab, 0xb3e8,
+    0xbd10, 0xff7a, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7df, 0xffff, 0xffbb, 0xd5f4, 0xc44a, 0xc44a,
+    0xdc48, 0xdc68, 0xd469, 0xd469, 0xc46a, 0xc46a, 0xc4ae, 0xe5d3, 0xffbd, 0xfffe,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xff9b, 0xeed8, 0xdd4e, 0xbc4a, 0xc449, 0xcc69,
+    0xcc69, 0xcc69, 0xd489, 0xd488, 0xbc6a, 0xd52d, 0xeef8, 0xffdc, 0xf7fe, 0xffff,
+    0xf7ff, 0xf7df, 0xffdc, 0xff5a, 0xedd1, 0xcccd, 0xd46a, 0xd46a, 0xd4aa, 0xcc69,
+    0xd469, 0xcc28, 0xcc6a, 0xbc09, 0xcd92, 0xff39, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff,
+    0xffdd, 0xffdd, 0xee76, 0xb4cf, 0xbc4a, 0xbc6a, 0xcc49, 0xd449, 0xd448, 0xd448,
+    0xc44a, 0xc44a, 0xb4af, 0xee96, 0xffde, 0xffde, 0xffde, 0xffff, 0xfffe, 0xffde,
+    0xdfff, 0xe7ff, 0xf7ff, 0xeffe, 0xffdb, 0xf77a, 0xee53, 0xd570, 0xcc4a, 0xc409,
+    0xcbc7, 0xcbc7, 0xd3a6, 0xd3a6, 0xdba6, 0xdbc7, 0xcd50, 0xee54, 0xff18, 0xffba,
+    0xfffc, 0xeeb7, 0xcdb3, 0xe655, 0xe613, 0xe634, 0xe613, 0xe613, 0xe634, 0xde14,
+    0xde35, 0xe696, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7df, 0xffff, 0xffff,
+    0xfffb, 0xded6, 0xd631, 0xde52, 0xee11, 0xe5f1, 0xf5f3, 0xfe13, 0xee34, 0xe5f3,
+    0xe676, 0xff5a, 0xffde, 0xfffe, 0xf7ff, 0xefff, 0xf7ff, 0xf7ff, 0xffff, 0xffde,
+    0xff19, 0xe656, 0xe5f3, 0xee13, 0xf5f2, 0xf612, 0xe5f2, 0xee13, 0xe655, 0xd5f4,
+    0xd6b8, 0xfffd, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xfffe, 0xffff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffdc, 0xf6f8, 0xfe31, 0xfe52,
+    0xfe2f, 0xfe50, 0xfe51, 0xfe51, 0xfe72, 0xfe52, 0xf634, 0xfeb6, 0xffdd, 0xffdd,
+    0xefff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffdc, 0xffbc, 0xfed4, 0xfe52, 0xfe51, 0xfe51,
+    0xfe30, 0xfe30, 0xfe2f, 0xfe0e, 0xee11, 0xfe72, 0xff7a, 0xfffc, 0xffff, 0xf7fe,
+    0xf7ff, 0xf7df, 0xfffd, 0xffbc, 0xfeb5, 0xf633, 0xfdf0, 0xfe10, 0xfe51, 0xfe30,
+    0xfe71, 0xfe30, 0xfe93, 0xfe11, 0xfef7, 0xffdb, 0xffff, 0xf7de, 0xefff, 0xefdf,
+    0xffbd, 0xffde, 0xff59, 0xee55, 0xfe52, 0xfe52, 0xfe30, 0xfe50, 0xfe2f, 0xfe2f,
+    0xfe51, 0xfe31, 0xe655, 0xff59, 0xffff, 0xffde, 0xf7de, 0xffff, 0xffde, 0xfffe,
+    0xe7ff, 0xe7ff, 0xeffe, 0xe7dd, 0xffdb, 0xfffc, 0xff98, 0xfed5, 0xfe52, 0xfe11,
+    0xfdcf, 0xfdcf, 0xfdae, 0xfd8e, 0xfdae, 0xfdcf, 0xfed6, 0xffb9, 0xffdb, 0xff9a,
+    0xffdc, 0xffdc, 0xff9a, 0xffba, 0xffba, 0xffda, 0xffd9, 0xffb9, 0xffdb, 0xffda,
+    0xffbb, 0xffdc, 0xf7fe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7df, 0xffff, 0xffff,
+    0xfffc, 0xffdb, 0xfff8, 0xfff9, 0xffb8, 0xff98, 0xff99, 0xff99, 0xff99, 0xff58,
+    0xffbb, 0xffdc, 0xfffe, 0xffde, 0xf7ff, 0xefff, 0xefff, 0xefff, 0xfffe, 0xfffe,
+    0xffbb, 0xffbb, 0xff79, 0xff79, 0xff98, 0xffb9, 0xffb9, 0xff99, 0xffdb, 0xffdb,
+    0xfffd, 0xfffd, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xffff, 0xffff, 0xf7df, 0xffff, 0xffff, 0xffff, 0xfffe, 0xffbd, 0xff9b, 0xff9b,
+    0xff99, 0xff99, 0xffbb, 0xffbb, 0xffdc, 0xffdc, 0xffdd, 0xffbc, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffdd, 0xfffe, 0xffbb, 0xff9a, 0xffdb, 0xffdb,
+    0xffbb, 0xffbc, 0xffba, 0xffba, 0xffbb, 0xffbb, 0xfffe, 0xfffe, 0xffff, 0xf7df,
+    0xf7ff, 0xf7df, 0xfffe, 0xfffe, 0xffbc, 0xff9c, 0xffbb, 0xffbb, 0xff9b, 0xffbb,
+    0xffbb, 0xff7a, 0xffdb, 0xff7a, 0xffbc, 0xffdc, 0xffff, 0xf7be, 0xf7df, 0xf7ff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xffbc, 0xffdd, 0xffbb, 0xffbb, 0xffba, 0xffba,
+    0xffbb, 0xffbb, 0xffdd, 0xfffe, 0xffff, 0xf7ff, 0xffff, 0xffff, 0xffde, 0xffff,
+    0xffff, 0xffbf, 0xffbf, 0xffdf, 0xffdf, 0xf79e, 0xffbd, 0xfffe, 0xfffd, 0xffdc,
+    0xffbb, 0xffdb, 0xffda, 0xffb9, 0xffd9, 0xfff9, 0xfffe, 0xffde, 0xf7be, 0xf7df,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xf7fe, 0xf7fe, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffe, 0xffde, 0xffdf, 0xffde,
+    0xffff, 0xffff, 0xf7fe, 0xf7de, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7fe, 0xf7ff,
+    0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+    0xf7fe, 0xf7fe, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffdf, 0xffdf,
+    0xffff, 0xffff, 0xf7df, 0xffff, 0xffff, 0xf7df, 0xfffe, 0xfffe, 0xffbb, 0xffbb,
+    0xffba, 0xffba, 0xffdb, 0xffdb, 0xffdc, 0xfffc, 0xffdd, 0xff9c, 0xffff, 0xffde,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffdd, 0xfffe, 0xfffc, 0xffbb, 0xffdb, 0xffdb,
+    0xffbc, 0xfffc, 0xff9a, 0xffba, 0xffdc, 0xffdc, 0xfffe, 0xffdd, 0xf7df, 0xf7df,
+    0xffff, 0xf7df, 0xffde, 0xfffe, 0xffbc, 0xffbc, 0xffbb, 0xff9b, 0xfffc, 0xfffc,
+    0xffdb, 0xff7a, 0xffdb, 0xffbb, 0xffdc, 0xff9c, 0xffff, 0xf7de, 0xf7ff, 0xffff,
+    0xffdf, 0xf7be, 0xffde, 0xfffe, 0xfffd, 0xfffd, 0xffdc, 0xffdc, 0xffbb, 0xffbb,
+    0xffdb, 0xffdb, 0xfffe, 0xffdd, 0xf7bf, 0xf7df, 0xffff, 0xffff, 0xffde, 0xffff,
+    0xffff, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffdf, 0xffbd, 0xffde, 0xffdc, 0xffdc,
+    0xffdb, 0xfffc, 0xfffa, 0xfffa, 0xffd9, 0xfffa, 0xffbd, 0xffbd, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xffff, 0xf7ff, 0xfffe, 0xffff, 0xfffe, 0xffde, 0xffff, 0xffff,
+    0xf7df, 0xf7ff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xf7fe, 0xf7fe, 0xfffd, 0xfffc, 0xffbc, 0xfffd, 0xfffe, 0xffdd, 0xffdf, 0xffbe,
+    0xffff, 0xffff, 0xf7de, 0xf7fe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xf7fe,
+    0xffde, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffbf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+    0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xf7df, 0xf7df, 0xffdf, 0xffdf,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xf7ff, 0xf7ff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffde, 0xffde, 0xf7df, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffbe, 0xffbe, 0xffbe, 0xffbe, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7fe, 0xf7fe, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffdf, 0xffdf, 0xffdf, 0xffdf,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xf7ff, 0xf7ff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffde, 0xffdf, 0xf7df, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffbe, 0xffbe, 0xffbe, 0xffbe, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7fe, 0xf7fe, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffdf, 0xffdf, 0xffdf, 0xffdf,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffde, 0xffde, 0xffde, 0xffde,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffde, 0xffde, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffde, 0xffde, 0xffde, 0xffde,
+    0xffff, 0xffff, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+    0xefff, 0xefff, 0xf7ff, 0xf7ff, 0xf7fe, 0xf7fe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7fe, 0xf7fe, 0xfffe, 0xfffe,
+    0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+    0xefff, 0xefff, 0xf7ff, 0xf7ff, 0xf7fe, 0xf7fe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+    0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7fe, 0xf7fe, 0xf7fe, 0xf7fe,
+    0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xffdd, 0xffdd, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffd, 0xfffd,
+    0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xffdd, 0xffdd, 0xffde, 0xffde, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xfffe, 0xfffe,
+    0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7fe, 0xf7fe, 0xf7fd, 0xf7fd,
+    0xf7fe, 0xf7fe, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xefff, 0xefff,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xffdd, 0xffdd, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+    0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffd, 0xfffd,
+    0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+    0xfffe, 0xfffe, 0xffff, 0xffff, 0xffdd, 0xffdd, 0xfffe, 0xfffe, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xfffe, 0xfffe,
+    0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+    0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7fe, 0xf7fe, 0xf7fd, 0xf7fd,
+    0xf7fe, 0xf7fe, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+    0xffff, 0xffff, 0xfffe, 0xfffe, 0xf7fd, 0xf7fd, 0xfffe, 0xfffe, 0xffde, 0xffde,
+    0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xefff, 0xefff,
+};
+
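+/* The 16-bit table above appears to hold the logo in RGB565 packing
+ * (5-6-5 bits per pixel); the 32-bit table below holds the same logo
+ * expanded to 8 bits per component, with the top byte unused in the
+ * values shown. A minimal sketch of a 565 -> 888 expansion consistent
+ * with the data (the component order and the plain left-shift with no
+ * low-bit replication are assumptions inferred from the values, not
+ * taken from the decoder itself):
+ *
+ *     UWORD32 c0 = ((pix >> 11) & 0x1f) << 3;   // bits 15..11
+ *     UWORD32 c1 = ((pix >>  5) & 0x3f) << 2;   // bits 10..5
+ *     UWORD32 c2 = ( pix        & 0x1f) << 3;   // bits  4..0
+ *     UWORD32 rgb8888 = (c0 << 16) | (c1 << 8) | c2;
+ *
+ * For example, 0xffff above maps to 0xf8fcf8 below, and 0xf7ff maps
+ * to 0xf0fcf8, matching the entries in the two tables.
+ */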
+const UWORD32 gau4_ihevcd_logo_rgb8888[10240] = {
+
+    0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce8, 0xf8fce8,
+    0xf8f8f0, 0xf8f8f0, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+    0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce8, 0xf8fce8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf0,
+    0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xe8fcf8, 0xe8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+    0xf8f8f0, 0xf8f8f0, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+    0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce8, 0xf8fce8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf0,
+    0xf8f8e0, 0xf8f8e0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xe8fcf8, 0xe8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8f8f8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+    0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8f8f8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+    0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8,
+    0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8f4f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf0f8f8, 0xf0f8f8, 0xf0f8f8, 0xf0f8f8, 0xf0f8f8, 0xf0f8f8,
+    0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8,
+    0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8f8f0, 0xf8f8f0, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f4e8, 0xf8f4e8, 0xf8f4e8, 0xf8f4e8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+    0xf8ecc0, 0xf8e0b0, 0xf8cc98, 0xf0c090, 0xf0bc90, 0xf8cc98, 0xf8e0b8, 0xf8ecc8, 0xf8f4d8, 0xf8f4e0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+    0xf8dcb0, 0xe8c090, 0xc89868, 0xb07c50, 0xb07c50, 0xc89868, 0xe8c098, 0xf8dcb0, 0xf8f4d8, 0xf8f4d8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf0f4e8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fce0, 0xf8fce0, 0xf8f4c8, 0xf8f4c8, 0xf8f0c0, 0xf8f4c0, 0xf8f4c8, 0xf8f4c8, 0xf8f8d0, 0xf8f8d0,
+    0xf8f8d8, 0xf8f4d8, 0xf8fce8, 0xf8f4e0, 0xf8f8f0, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xe8f8f8, 0xe8f8f8, 0xe0fcf8, 0xe0fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f8f8, 0xf8fcf8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xe8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xe8f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8ecd8, 0xf8f8e0, 0xf8f0d0, 0xf8dcb8,
+    0xd87810, 0xe08018, 0xf08418, 0xf08c20, 0xf08c20, 0xe88818, 0xd08428, 0xc87c28, 0xf0c890, 0xf8e8b0, 0xf8fce0, 0xf8f8d8, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf0f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fce0, 0xf0e8d0, 0xf8e0b8, 0xf8e0b8, 0xf8e0b0, 0xf8e4b0, 0xf8e4b8, 0xf8e4b8, 0xf8e4c0, 0xf8e4c0,
+    0xf8e0c0, 0xf8ecd0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xe0fcf8, 0xe0fcf8,
+    0xe8fcf8, 0xe8fcf8, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8, 0xe8fcf8, 0xe8fcf8,
+    0xe8f8f8, 0xe8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf0f8f8, 0xf0f8f8, 0xf0fcf8, 0xf0fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8e0, 0xf8f8e8, 0xf8d4b0, 0xc8a080,
+    0xe08018, 0xe08420, 0xf08818, 0xf88c20, 0xf09020, 0xf08820, 0xd88830, 0xd08428, 0xc09460, 0xf8d098, 0xf8fce0, 0xf8fce0, 0xf0fcf0, 0xf0f8f0, 0xf0fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f0c8, 0xe0cca0, 0xe8a860, 0xe8a860, 0xf8a848, 0xf8ac50, 0xf8ac58, 0xf8ac50, 0xe8ac60, 0xe0ac60,
+    0xd8a470, 0xf8c490, 0xf8f8d8, 0xf8f8d8, 0xf8f8f0, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8fce8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xe8fcf8,
+    0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf8fcf0, 0xf8fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8f8f0, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f0, 0xf8fcf8, 0xf8f8d8, 0xf8f8d8, 0xf8bc88, 0xb07840,
+    0xf88400, 0xf88400, 0xf88800, 0xf88800, 0xf88800, 0xf88800, 0xf08810, 0xf08810, 0xa87830, 0xf0bc70, 0xf8fcd8, 0xf8fce0, 0xf0fcf0, 0xf0f8e8, 0xf0fcf0, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f0c8, 0xd0b890, 0xc08038, 0xc08438, 0xd88428, 0xd88828, 0xd08830, 0xd08830, 0xc08838, 0xc08438,
+    0xb88450, 0xe8b078, 0xf8f0d0, 0xf8f8d8, 0xf8f8f0, 0xf8fcf0, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8f8f0, 0xf8f8e8, 0xf8f4e0, 0xf0f8f0, 0xf8f8f0, 0xe8fcf8, 0xe8fcf8,
+    0xf8fcf8, 0xf0f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8fcf8, 0xf0f8f8, 0xf0f8f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8e8, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xf8f0d0, 0xf8bc88, 0xb88048,
+    0xf88800, 0xf88808, 0xf88800, 0xf88800, 0xf88800, 0xf88800, 0xf88c18, 0xf88c18, 0xb88840, 0xf0c078, 0xf8f0d0, 0xf8f8d8, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8f0c8, 0xd8b080, 0xe08028, 0xe08428, 0xf88408, 0xf88410, 0xf88810, 0xf88410, 0xe88418, 0xe88018,
+    0xc88c48, 0xf8b878, 0xf8f8d8, 0xf8f8d8, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xf0fcf8, 0xf8f4e8, 0xf8f8f0, 0xf8f4d8, 0xf8f4d8, 0xf8f4d0, 0xf8f4d0, 0xf8f8d8, 0xf8fcd8,
+    0xf8f4d0, 0xf8f4d0, 0xf8f8e0, 0xf8fce8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8f4e8, 0xf8f8f0, 0xf8fce8, 0xf8f8e0, 0xf8f4d8, 0xf8f4d8,
+    0xf8f8e0, 0xf8fce8, 0xf8f8d8, 0xf8f0d0, 0xf8f0d0, 0xf8f8d8, 0xf8fce8, 0xf8f4e0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8e0, 0xf8f0d0, 0xf8c088, 0xc08850,
+    0xf88810, 0xf88810, 0xf88800, 0xf88800, 0xf88800, 0xf88400, 0xf88c10, 0xf88c10, 0xc89048, 0xf8c078, 0xf8ecd0, 0xf8f4d8, 0xf8fcf0, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8f4c8, 0xd8b088, 0xe88830, 0xe88c30, 0xf88c10, 0xf89018, 0xf89018, 0xf88c18, 0xf08c28, 0xf08c20,
+    0xd09050, 0xf8bc78, 0xf8f8d8, 0xf8f8d8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f4d8, 0xf8f4d8, 0xf8f4d0, 0xf8f4d0, 0xf8f4d8, 0xf8f4d0,
+    0xf8f4d0, 0xf8f8d8, 0xf8f0d8, 0xf8f0d8, 0xf8fcf0, 0xf8fcf0, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8f8f0, 0xf8fcf0, 0xf8fce8, 0xf8f4e0, 0xf8f4d0, 0xf8f8d8,
+    0xf8f4e0, 0xf8fce8, 0xf8f8d8, 0xf8f8d8, 0xf8f4d0, 0xf8f4d0, 0xf8f8e8, 0xf8f8e0, 0xf0fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8e0, 0xf8f4d8, 0xf0c088, 0xb08048,
+    0xf88410, 0xf88810, 0xf88400, 0xf88400, 0xf88400, 0xf88400, 0xf88810, 0xf08810, 0xb88438, 0xf0bc78, 0xf8f0d0, 0xf8f8d8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8f8f8, 0xf0fcf8, 0xf0fcf8, 0xf8f4f8, 0xf8fcf8, 0xf8f0c8, 0xe0ac88, 0xf08020, 0xf08420, 0xf88000, 0xf88000, 0xf88400, 0xf88000, 0xf88410, 0xf88008,
+    0xc88c48, 0xf0b878, 0xf8f8d8, 0xf8fce0, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fcf0, 0xf8ece0, 0xf8d4a8, 0xf8c898, 0xf8c890, 0xf8cc90, 0xf8c890, 0xf8c488,
+    0xf8cc98, 0xf8d8a0, 0xe0c4a0, 0xe8c8a0, 0xf8f4e0, 0xf8fce8, 0xf0f0e8, 0xf8fcf8, 0xf8f4f0, 0xf8f8f0, 0xf8f8e8, 0xf8f4e0, 0xf8e0b8, 0xf0cca8, 0xf8cc98, 0xf8d4a0,
+    0xf0d0a8, 0xf0d0a0, 0xf8d098, 0xf8d4a0, 0xf8cc98, 0xf0c490, 0xe8dcc0, 0xf8fce0, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fce8, 0xf8fce8, 0xf0cc98, 0xb89060,
+    0xe08028, 0xe88428, 0xf88410, 0xf88810, 0xf88408, 0xf88400, 0xf08418, 0xe88010, 0xc08c48, 0xf8c488, 0xf8f4e0, 0xf8f8e0, 0xf8f8f0, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0f8f8, 0xf0fcf8, 0xf0fcf8, 0xf8f4f8, 0xf8fcf8, 0xf8f0c8, 0xe0ac80, 0xf08420, 0xf88428, 0xf88000, 0xf88000, 0xf88400, 0xf88400, 0xf88410, 0xf88410,
+    0xc89050, 0xf8c080, 0xf8fce0, 0xf8fce0, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fcf0, 0xe0d8d0, 0xd0a878, 0xb89060, 0xc89050, 0xd09858, 0xd09860, 0xc89458,
+    0xb89060, 0xc89c68, 0xa88860, 0xb89870, 0xf8e8d8, 0xf8fce8, 0xf0f0e8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8f8e8, 0xf8e8d8, 0xd8bc98, 0xb89870, 0xb88c58, 0xc89c68,
+    0xc0a478, 0xb09468, 0xc09460, 0xc8a068, 0xb89460, 0xb08850, 0xc0b898, 0xf8f8d8, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fce8, 0xf8fce8, 0xf8d8a8, 0xd0ac78,
+    0xe08020, 0xe08428, 0xf88410, 0xf88818, 0xf88808, 0xf88400, 0xe88018, 0xe87c10, 0xe0a868, 0xf8d498, 0xf8f8e0, 0xf8f8e0, 0xf8f8f8, 0xf8fcf8, 0xf8f4f0, 0xf8f4f0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xc89840, 0xe8c060, 0xf8fce8, 0xf8fce8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8f8c8, 0xf0e0b0, 0xf8a430, 0xd88410, 0xf87c00, 0xf88400, 0xf88030, 0xf88030,
+    0xf88408, 0xf88408, 0xd88420, 0xe09030, 0xf8e4a8, 0xf8f4b8, 0xf8fce0, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8f8c8, 0xf8ecb8, 0xf8b060, 0xd88c38, 0xf08010, 0xf88418,
+    0xf88410, 0xf88410, 0xf08808, 0xf08808, 0xd89420, 0xc07c08, 0xd8bc78, 0xf8f4a8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcd0, 0xf8f8d0,
+    0xd88450, 0xd88858, 0xe08c28, 0xe08c28, 0xe09028, 0xd88c28, 0xc88c68, 0xc88c60, 0xf8f0e8, 0xf8f0e8, 0xf0fce0, 0xf0fce8, 0xf8fce8, 0xf8fce8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf0fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+    0xf0fcf0, 0xf8fcf8, 0xf0fcf0, 0xf0f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0,
+    0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f4e8, 0xf8f4e8, 0xf8f0e8, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe0fcf8, 0xe8fcf8,
+    0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f4f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xc09838, 0xe8bc60, 0xf8fce8, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcd0, 0xf8e0b0, 0xf8a430, 0xe08810, 0xf87c00, 0xf88400, 0xf88030, 0xf88030,
+    0xf88408, 0xf88408, 0xd88420, 0xe08c30, 0xf8e0a8, 0xf8f0b8, 0xf8f8d8, 0xf8f8e0, 0xf8fce0, 0xf8f8e0, 0xf8f4c8, 0xf8e8b8, 0xf8b060, 0xd88838, 0xf07c10, 0xf88418,
+    0xf88810, 0xf88810, 0xf88c08, 0xf08808, 0xd89420, 0xc88008, 0xd8bc78, 0xf8f4a8, 0xf8fce8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf0fcf0, 0xf8f8d0, 0xf8f8d0,
+    0xf8c490, 0xf8b480, 0xf0a440, 0xe09430, 0xe09430, 0xf0a440, 0xf0b890, 0xf8c8a0, 0xf8f0e8, 0xf8f4e8, 0xf0fce8, 0xf0fce8, 0xf8fce8, 0xf8fce8, 0xf8f8f8, 0xf8f4f8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0f8f8, 0xf0f8f0, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+    0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f0, 0xf8fcf8, 0xf8fcf0, 0xf0f8f0, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0,
+    0xf8f4f0, 0xf8f8f0, 0xf8fcf8, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0,
+    0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f4e8, 0xf8f4e8, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xe8fcf8, 0xe8fcf8, 0xe0fcf8, 0xe0fcf8,
+    0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xd09830, 0xf8bc58, 0xf8fce8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xe8e8c8, 0xf0a848, 0xd08c28, 0xf88000, 0xf88800, 0xf88020, 0xf88020,
+    0xf88808, 0xf88808, 0xd88828, 0xe09038, 0xf8e8c0, 0xf8f4c8, 0xe0fcf8, 0xe0fcf8, 0xe0fcf8, 0xe0fcf8, 0xf8f8d8, 0xf8ecc8, 0xf8b060, 0xd88c40, 0xf88010, 0xf88818,
+    0xf87c10, 0xf87c10, 0xf88410, 0xf88008, 0xf08c28, 0xd87410, 0xf0b480, 0xf8e8b0, 0xf8f4f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f4e8,
+    0xf8f8f0, 0xf8ece8, 0xf0d8a8, 0xe8cc98, 0xe8cc90, 0xf8d898, 0xf8ecd8, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xf0fcf0, 0xf0fcf0, 0xf0fce8, 0xf0fce8, 0xf8f4f8, 0xf8f4f8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f0, 0xf0fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8,
+    0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xd89830, 0xf8bc58, 0xf8fce8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xe8e4c8, 0xf0ac48, 0xd88c28, 0xf88000, 0xf88c08, 0xf88420, 0xf88420,
+    0xf88c08, 0xf88c08, 0xd88c30, 0xe89438, 0xf8e8c0, 0xf8f8d0, 0xe0fcf8, 0xe8fcf8, 0xe8fcf8, 0xe0fcf8, 0xf8fcd8, 0xf8f0d0, 0xf8b460, 0xd89040, 0xf88410, 0xf88c18,
+    0xf88010, 0xf88010, 0xf88410, 0xf88010, 0xf08c28, 0xd87410, 0xf0b480, 0xf8e8b0, 0xf8f4f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0,
+    0xf8f4f0, 0xf8f4f0, 0xf8f4c0, 0xf8f4c0, 0xf8f4b8, 0xf8f4b8, 0xf8f4e0, 0xf8f4e0, 0xf8f8f8, 0xf8f8f8, 0xf0fcf0, 0xf0fcf0, 0xf0fce8, 0xf0fce8, 0xf8f4f8, 0xf8f4f8,
+    0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8,
+    0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f4f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f0, 0xf0f8f0, 0xf0fcf0, 0xf8fcf8, 0xf8fcf0, 0xf0f8f0, 0xf0f8f0, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8,
+    0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe09020, 0xf8b448, 0xf8fce8, 0xf8f8e0, 0xf0fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xe0e8d0, 0xe8ac58, 0xd09038, 0xf88400, 0xf88c08, 0xf88410, 0xf88010,
+    0xf88808, 0xf88400, 0xd88828, 0xe09038, 0xf8e8c8, 0xf8f8d0, 0xe0fcf8, 0xe0fcf8, 0xe0fcf8, 0xd8fcf8, 0xf8fce0, 0xf8ecd0, 0xf8b060, 0xd88c38, 0xf88008, 0xf88810,
+    0xf88810, 0xf88810, 0xf88c18, 0xf88c10, 0xe09430, 0xc87c18, 0xe0bc80, 0xf8f0b0, 0xf8fcf0, 0xf8fcf0, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf0fcf0, 0xf0fcf0,
+    0xe0fcf8, 0xe0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fcd8, 0xf8fcd8, 0xf8f8f0, 0xf8f8f0, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8f4f8, 0xf8f4f0,
+    0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f4e8, 0xf8f4e8, 0xf8f8e0, 0xf8f8e0, 0xf8f8d8, 0xf8f8d8, 0xf8f8e0, 0xf8f8d8, 0xf8f8e0, 0xf8fce8, 0xf8fce8, 0xf8fce8,
+    0xf8f8f8, 0xf8fcf8, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f0, 0xf8f4f0, 0xf8f8f8, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f8f0, 0xf8f4f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0,
+    0xf0fcf8, 0xe8fcf8, 0xe8f8f0, 0xf0fcf0, 0xf8fce8, 0xf8fce8, 0xf8f8d8, 0xf8fce0, 0xf8f8e0, 0xf8f8d8, 0xf8f8e0, 0xf8f8e0, 0xf8f8f0, 0xf8f8f0, 0xf0f8f8, 0xf0f8f8,
+    0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xf8fce8, 0xf8f8d0, 0xf8f4c8, 0xf8f0c0, 0xf8f8c0,
+    0xf0f8f0, 0xf0f8f0, 0xf0f8f0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe09020, 0xf8b448, 0xf8fce8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce8, 0xe8e8d0, 0xf0ac58, 0xd09038, 0xf88400, 0xf88808, 0xf88010, 0xf87c10,
+    0xf88808, 0xf88400, 0xd88830, 0xe09438, 0xf8e8c8, 0xf8f8d8, 0xe0fcf8, 0xe0fcf8, 0xe0fcf8, 0xe0fcf8, 0xf8fce0, 0xf8f0d0, 0xf8b460, 0xd88c38, 0xf88008, 0xf88810,
+    0xf88810, 0xf88810, 0xf88c18, 0xf88c10, 0xe09430, 0xc87c18, 0xe0b880, 0xf8ecb0, 0xf8fcf0, 0xf8fcf0, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xe0fcf8, 0xe0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fcd8, 0xf8fce0, 0xf8fcf8, 0xf8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8f4f8, 0xf8f4f8,
+    0xe8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f8e8, 0xf8fcf0, 0xf8fce0, 0xf8f4e0, 0xf8f8d8, 0xf8f8d8, 0xf8f8d8, 0xf8f4d8, 0xf8f8e0, 0xf8f8e0, 0xf8fce8, 0xf8fce8,
+    0xf8f8f8, 0xf8fcf8, 0xf8f8f8, 0xf8f4f0, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+    0xf0fcf8, 0xe8fcf8, 0xf0f8f0, 0xf0fcf8, 0xf8fce8, 0xf8f8e8, 0xf8f4d8, 0xf8f4d8, 0xf8f0d0, 0xf8f4d8, 0xf8f8e0, 0xf8f8e0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8,
+    0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f4e0, 0xf8f8e8, 0xf8f4c8, 0xf8f0c8, 0xf8ecb8, 0xf8f4c0,
+    0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe09028, 0xf8b450, 0xf8fce8, 0xf8fce0, 0xf8fcf8, 0xf8fcf8, 0xf8f4d8, 0xf8f4d8, 0xf8f4c0, 0xf8d8a0, 0xf8a448, 0xd88830, 0xf88408, 0xf88810, 0xf88410, 0xf88010,
+    0xf88808, 0xf88808, 0xe88420, 0xf09028, 0xf8e0a8, 0xf8f0b0, 0xf8f4d0, 0xf8f4d8, 0xf8f8d8, 0xf8f4d8, 0xf8f4b8, 0xf8e8b0, 0xf8b058, 0xe88c30, 0xf88408, 0xf88c18,
+    0xf88c10, 0xf89010, 0xf89010, 0xf08c10, 0xe09428, 0xc87c10, 0xe8b868, 0xf8ec98, 0xf8f8c8, 0xf8f8c8, 0xf8fcd8, 0xf8fcd8, 0xf8fcd8, 0xf8fcd8, 0xf8fcc8, 0xf8fcc8,
+    0xf8f4e8, 0xf8f8e8, 0xf8f8d0, 0xf8f8d0, 0xf8f4c8, 0xf8f4c8, 0xf8f4d0, 0xf8f4d0, 0xf8f8f0, 0xf8f8f0, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8f8, 0xf8f8f0,
+    0xe8f8f8, 0xf0fcf8, 0xf8fcf0, 0xf8f8f0, 0xf8f8d0, 0xf8f8d0, 0xf8e8b0, 0xf8d8a0, 0xf8c480, 0xf8c478, 0xf8c478, 0xf8c078, 0xf8c480, 0xf8c480, 0xf0c888, 0xf0cc90,
+    0xf8d8a8, 0xf8e8c0, 0xf8f4c8, 0xf8f4c8, 0xf8f0c8, 0xf8e4b8, 0xf8dcb0, 0xf8e4b8, 0xf8e4b0, 0xf8e4b8, 0xf8e4b8, 0xf8e4b8, 0xf8e4b8, 0xf8e4b8, 0xf8e8b8, 0xf8e8b8,
+    0xf8fcd8, 0xf8f8d8, 0xf8f8c8, 0xf8f8c8, 0xf8e4a8, 0xf8cc90, 0xf8bc78, 0xf8c078, 0xf8bc78, 0xf8c078, 0xf8c480, 0xf8c488, 0xf0cc98, 0xf8d8a0, 0xf8ecc0, 0xf8f8d0,
+    0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8f8e0, 0xf8f8d8, 0xf8f0d0, 0xf8f0c8, 0xf8e0b0, 0xf8d0a0, 0xf8c080, 0xf8c080, 0xf8c078, 0xf8c070, 0xf8bc68, 0xf8c068,
+    0xf0d4a0, 0xf8e4b0, 0xf8f4c8, 0xf8f8d0, 0xf8f8e0, 0xf8f4d8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe09028, 0xf8b450, 0xf8fce0, 0xf8f4e0, 0xf8f8f8, 0xf0f0f0, 0xf8e8c8, 0xf8e8c8, 0xf8e8b0, 0xf8d4a0, 0xf0a040, 0xd88828, 0xf88408, 0xf88c10, 0xf88818, 0xf88818,
+    0xf88808, 0xf88408, 0xe08020, 0xe88828, 0xf8d8a0, 0xf8e8a8, 0xf8ecc8, 0xf8ecd0, 0xf8f0d0, 0xf8f0d0, 0xf8ecb8, 0xf8e0a8, 0xf8ac50, 0xe88830, 0xf88008, 0xf88810,
+    0xf88c10, 0xf88c10, 0xf89010, 0xf08c10, 0xe09428, 0xc87c10, 0xe8b868, 0xf8ec98, 0xf8f0c0, 0xf8f0c0, 0xf0f0d0, 0xf0f0d0, 0xf0f0c8, 0xf8f0d0, 0xf8f0c0, 0xf8f0c0,
+    0xf0f0e8, 0xf0f0e0, 0xf8ecc8, 0xf8e8c8, 0xf8e8b8, 0xf8e8b8, 0xf8ecc8, 0xf8ecd0, 0xf8f8e8, 0xf8f8f0, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8f8, 0xf8f8f8,
+    0xf0fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8f4e8, 0xf8f0c8, 0xf8ecc8, 0xf8cc90, 0xe8b078, 0xc88440, 0xc08440, 0xc08438, 0xc08038, 0xb88440, 0xb88440, 0xb08850, 0xb08c50,
+    0xd0ac80, 0xf0cca0, 0xf8e8c0, 0xf8f0c8, 0xf8e8b8, 0xf8d0a8, 0xe8c090, 0xf0c498, 0xe8c090, 0xe8c090, 0xf0c090, 0xf0c498, 0xf0c490, 0xf0c498, 0xf0c498, 0xf0c498,
+    0xf8f4d8, 0xf8f8d8, 0xf8f4c8, 0xf8e8c0, 0xf0c488, 0xc09c60, 0xc08440, 0xc08440, 0xc88840, 0xc08440, 0xb88440, 0xb88040, 0xb88c58, 0xd0a878, 0xf0d0a8, 0xf8f0c8,
+    0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8f8e0, 0xf8f8d8, 0xf8f0c8, 0xf8f0c8, 0xe0b888, 0xc09868, 0xb07c40, 0xb88040, 0xc88438, 0xc88438, 0xc88030, 0xc88030,
+    0xb09060, 0xd0b480, 0xf0e0b8, 0xf8f4d0, 0xf8f8e0, 0xf8f4d8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+    0xd09038, 0xf8b860, 0xf0f0d8, 0xf8f8e0, 0xf8f0d0, 0xb8a080, 0xd07018, 0xe88830, 0xf08018, 0xf08420, 0xf08820, 0xf88c20, 0xf88810, 0xf88810, 0xf88820, 0xf88818,
+    0xf88818, 0xf88c18, 0xf88c20, 0xf88818, 0xf08820, 0xf08420, 0xe08828, 0xe88828, 0xe08828, 0xe08828, 0xe88828, 0xe88828, 0xf88c20, 0xf88c20, 0xf88c18, 0xf88c18,
+    0xf88410, 0xf88008, 0xf88010, 0xf88410, 0xf88418, 0xf88018, 0xf88028, 0xf87c20, 0xe88430, 0xe88430, 0xe08438, 0xe08438, 0xe88030, 0xe88030, 0xf07c28, 0xf07c28,
+    0xe08430, 0xe08830, 0xe88428, 0xe88428, 0xe88420, 0xe88420, 0xd08830, 0xd08430, 0xa08458, 0xf0d4a0, 0xf8fcf0, 0xf8fcf0, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0,
+    0xf8f8e8, 0xf8f0e0, 0xf8ecc8, 0xf8e8c0, 0xf8b878, 0xc88440, 0xd87410, 0xf08828, 0xf88818, 0xf88410, 0xf88408, 0xf88810, 0xf88810, 0xf08410, 0xf08418, 0xf08c18,
+    0xe89030, 0xd88020, 0xd88428, 0xf8c060, 0xf8e080, 0xf8a848, 0xd07818, 0xe89030, 0xe88c28, 0xe08420, 0xe08420, 0xe08828, 0xe08420, 0xe08420, 0xe89028, 0xf8a038,
+    0xf8f0a0, 0xf8dc90, 0xf8b058, 0xd08830, 0xd87c18, 0xe88c28, 0xf88c18, 0xf88818, 0xf88410, 0xf88410, 0xf88818, 0xf88818, 0xe88820, 0xe88420, 0xd88020, 0xd88020,
+    0xf0e0b8, 0xf8fcd8, 0xf8f4c0, 0xf8f0b8, 0xf8f4a8, 0xf8d488, 0xf09838, 0xe08c28, 0xe88418, 0xf08818, 0xf08418, 0xf08010, 0xe88418, 0xf08820, 0xf08c28, 0xe88820,
+    0xf09830, 0xd87c10, 0xc88028, 0xf8a858, 0xf8dca0, 0xf8f0b8, 0xf8f8d8, 0xf8fcd8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+    0xd08c38, 0xf8b460, 0xe8e8d0, 0xf8f4d8, 0xf8e8c8, 0xb89c78, 0xd07018, 0xe88830, 0xf08420, 0xf08820, 0xf88820, 0xf88c20, 0xf88810, 0xf88810, 0xf88818, 0xf88818,
+    0xf88410, 0xf88410, 0xf88818, 0xf88418, 0xe88420, 0xe88020, 0xe08428, 0xe08428, 0xe08428, 0xe08428, 0xe88020, 0xe88420, 0xf08418, 0xf08818, 0xf88410, 0xf88410,
+    0xf88410, 0xf88410, 0xf88010, 0xf88010, 0xf88418, 0xf88418, 0xf88428, 0xf88028, 0xe88030, 0xe88030, 0xe08038, 0xe08038, 0xe87c30, 0xe87c30, 0xf07828, 0xf07828,
+    0xd88028, 0xd88428, 0xe88028, 0xe88028, 0xe88020, 0xe08020, 0xd08430, 0xc88030, 0xa88858, 0xf0d0a0, 0xf8f8f0, 0xf8fcf0, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf0f8f0,
+    0xf8ecd8, 0xf8f8e8, 0xf8e4c0, 0xd0b088, 0xc88440, 0xd08c48, 0xf08c28, 0xe88828, 0xf88410, 0xf88410, 0xf88408, 0xf88810, 0xf88c18, 0xf88810, 0xf08c18, 0xf89020,
+    0xd88428, 0xd88020, 0xd08020, 0xf09c40, 0xf8b858, 0xf09838, 0xd07c18, 0xe08c28, 0xe88c28, 0xe08828, 0xe08828, 0xe88c28, 0xe88c28, 0xe08820, 0xf09430, 0xf8a440,
+    0xf8dc90, 0xf0b468, 0xd88c38, 0xd08830, 0xe88c28, 0xe88c28, 0xf88818, 0xf88818, 0xf88410, 0xf88410, 0xf88818, 0xf88818, 0xe88828, 0xe88820, 0xe08828, 0xe08428,
+    0xb8a480, 0xd8cca0, 0xf8ecb8, 0xf8f4c0, 0xf8d488, 0xe8ac60, 0xe89030, 0xe88c30, 0xf08418, 0xf08818, 0xf88818, 0xf88818, 0xf08820, 0xf08c20, 0xf08820, 0xe88420,
+    0xe88c20, 0xe08018, 0xd08430, 0xe09440, 0xe0b878, 0xf8dca0, 0xf8f8d8, 0xf8f8d8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+    0xd89448, 0xf8bc68, 0xf0f4d8, 0xf8fce0, 0xf8f0b8, 0xd0a470, 0xf87400, 0xf88c10, 0xf88000, 0xf88000, 0xf88818, 0xf88818, 0xf88818, 0xf88818, 0xf88c20, 0xf88c20,
+    0xf88818, 0xf88c18, 0xf88c18, 0xf88c18, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88c18, 0xf88c18, 0xf89018, 0xf88c18,
+    0xf88810, 0xf88810, 0xf88410, 0xf88410, 0xf88418, 0xf88418, 0xf88418, 0xf88410, 0xf88c18, 0xf88c18, 0xf88c10, 0xf88c10, 0xf88810, 0xf88810, 0xf88408, 0xf88408,
+    0xf88800, 0xf88800, 0xf88400, 0xf88408, 0xf88808, 0xf88408, 0xf88c10, 0xf88c10, 0xc88c40, 0xf8d080, 0xf8f0e0, 0xf8f8e8, 0xf8f8f8, 0xf0f4f8, 0xf8fce8, 0xf0f8e8,
+    0xf8f0b8, 0xf8d098, 0xe8a058, 0xd08438, 0xe87c20, 0xf08c28, 0xf88810, 0xf88410, 0xf88400, 0xf88000, 0xf88000, 0xf88000, 0xf88400, 0xf88000, 0xf88400, 0xf88800,
+    0xf88808, 0xf89010, 0xf88808, 0xf88808, 0xf89818, 0xf89418, 0xf88808, 0xf89010, 0xf88808, 0xf88400, 0xf88408, 0xf88c08, 0xf88808, 0xf88800, 0xf88c08, 0xf89818,
+    0xf8a840, 0xe88c20, 0xe07c08, 0xf08818, 0xf89018, 0xf88810, 0xf88000, 0xf88808, 0xf88400, 0xf88400, 0xf88400, 0xf88400, 0xf88400, 0xf88400, 0xf88408, 0xf88408,
+    0xb87c30, 0xd09448, 0xf8cc78, 0xf8d880, 0xf8a038, 0xe08018, 0xf88408, 0xf88808, 0xf88400, 0xf88400, 0xf88400, 0xf88808, 0xf88c10, 0xf88810, 0xf88818, 0xf88818,
+    0xf88000, 0xf88000, 0xf88810, 0xf08008, 0xd08c40, 0xf8c070, 0xf8f8d0, 0xf8f4d0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+    0xd09040, 0xf8b468, 0xe8ecd0, 0xf8f8e0, 0xf8ecb8, 0xd0a070, 0xf87000, 0xf88808, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88818, 0xf88818, 0xf88820, 0xf88820,
+    0xf88410, 0xf88418, 0xf88810, 0xf88810, 0xf88408, 0xf88408, 0xf88400, 0xf88400, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88c10, 0xf88c10, 0xf88c18, 0xf88818,
+    0xf88c10, 0xf88810, 0xf88410, 0xf88410, 0xf88418, 0xf88418, 0xf88818, 0xf88418, 0xf88c18, 0xf88c18, 0xf88c10, 0xf88c10, 0xf88810, 0xf88810, 0xf88408, 0xf88408,
+    0xf88800, 0xf88800, 0xf88400, 0xf88400, 0xf88408, 0xf88400, 0xf88c10, 0xf88c10, 0xc89440, 0xf8d080, 0xf8f4e0, 0xf8f8e8, 0xf8f8f8, 0xf0f0f8, 0xf8fce8, 0xf8f8e8,
+    0xf8e4a8, 0xe0a468, 0xc07830, 0xd08840, 0xf89430, 0xf08c28, 0xf88008, 0xf88c10, 0xf88400, 0xf88000, 0xf88400, 0xf88400, 0xf88800, 0xf88800, 0xf88800, 0xf88800,
+    0xf88808, 0xf89418, 0xf88c10, 0xf88000, 0xf88808, 0xf88c08, 0xf88808, 0xf88c10, 0xf88c10, 0xf88c08, 0xf88c08, 0xf89010, 0xf88c08, 0xf88808, 0xf88808, 0xf89010,
+    0xe08820, 0xe88c20, 0xf09020, 0xf09020, 0xf88810, 0xf88810, 0xf88808, 0xf88808, 0xf88400, 0xf88400, 0xf88400, 0xf88400, 0xf88408, 0xf88408, 0xf88808, 0xf88808,
+    0xc89048, 0xc88c40, 0xf0a050, 0xf8ac58, 0xf09028, 0xe08018, 0xf88810, 0xf88810, 0xf88c08, 0xf88808, 0xf88408, 0xf88808, 0xf88c10, 0xf88810, 0xf88818, 0xf88c20,
+    0xf88800, 0xf88800, 0xf89420, 0xf08008, 0xc87c30, 0xf8ac60, 0xf8f0c8, 0xf8f8d0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+    0xe08c38, 0xf8b058, 0xe8ecd0, 0xf8f8e0, 0xf8ecb8, 0xc8a470, 0xf87000, 0xf88c08, 0xf88408, 0xf88408, 0xf88828, 0xf88828, 0xf88818, 0xf88818, 0xf88810, 0xf88810,
+    0xf88410, 0xf88410, 0xf88410, 0xf88410, 0xf88408, 0xf88408, 0xf88408, 0xf88808, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810,
+    0xf89410, 0xf09010, 0xf88c10, 0xf88c10, 0xf88c18, 0xf88c18, 0xf88c18, 0xf88c18, 0xf09010, 0xf09010, 0xf09408, 0xf09408, 0xf89010, 0xf89010, 0xf88c10, 0xf88c10,
+    0xf88800, 0xf88800, 0xf88010, 0xf88010, 0xf88008, 0xf87c08, 0xf08808, 0xf08808, 0xc89038, 0xf8d078, 0xf8f4e0, 0xf8f8e8, 0xf8f4f8, 0xf8f0f0, 0xf8fce0, 0xf8f8d8,
+    0xf89430, 0xf89028, 0xf88c20, 0xf08818, 0xf88410, 0xf88810, 0xf88810, 0xf88808, 0xf88400, 0xf88408, 0xf88808, 0xf88810, 0xf88c18, 0xf89018, 0xf89020, 0xf88c18,
+    0xf88410, 0xf88c10, 0xf88c10, 0xf88810, 0xf88810, 0xf88810, 0xf88410, 0xf88810, 0xf88c10, 0xf88810, 0xf88810, 0xf88c10, 0xf88810, 0xf88408, 0xf88408, 0xf88808,
+    0xe08c28, 0xe89028, 0xe89028, 0xe88820, 0xe88018, 0xe88420, 0xf88820, 0xf88c20, 0xf88c18, 0xf88c18, 0xf88810, 0xf88810, 0xf88408, 0xf88408, 0xf88808, 0xf88808,
+    0xf89420, 0xf88818, 0xf08418, 0xf88c20, 0xf89830, 0xf09028, 0xe88c20, 0xe88c20, 0xf09028, 0xf08c28, 0xf88c20, 0xf89020, 0xf88c18, 0xf88818, 0xf88410, 0xf88410,
+    0xf88800, 0xf88400, 0xf88c10, 0xf88410, 0xc88030, 0xe09848, 0xf0d8b0, 0xf8f4d0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+    0xe09038, 0xf8b860, 0xf0f4d8, 0xf8fce0, 0xf8f4c0, 0xc8a870, 0xf87400, 0xf88c10, 0xf88000, 0xf88000, 0xf88828, 0xf88828, 0xf88818, 0xf88818, 0xf88810, 0xf88810,
+    0xf88818, 0xf88818, 0xf88c18, 0xf88818, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88c10, 0xf88c10, 0xf88c18, 0xf88c10,
+    0xf09410, 0xf09010, 0xf88c10, 0xf88c10, 0xf88c18, 0xf88c18, 0xf88c18, 0xf88810, 0xf09010, 0xf09010, 0xf09408, 0xf09408, 0xf89010, 0xf89010, 0xf88c10, 0xf88c10,
+    0xf88c00, 0xf88c00, 0xf88410, 0xf88810, 0xf88410, 0xf88410, 0xf88c10, 0xf88c08, 0xc88830, 0xf8d078, 0xf8f0e0, 0xf8f4e0, 0xf8f8f8, 0xf8f8f8, 0xf8f8d8, 0xe8e4c0,
+    0xe07c18, 0xe88820, 0xf88c20, 0xf08818, 0xf88410, 0xf88c18, 0xf88810, 0xf88408, 0xf88808, 0xf88808, 0xf88408, 0xf88408, 0xf88410, 0xf88810, 0xf88418, 0xf08010,
+    0xf88810, 0xf88408, 0xf88410, 0xf89018, 0xf89018, 0xf88810, 0xf88810, 0xf88c18, 0xf88408, 0xf88408, 0xf88808, 0xf88810, 0xf88810, 0xf88808, 0xf88808, 0xf88408,
+    0xf09830, 0xe08820, 0xd88018, 0xe08420, 0xe88820, 0xe88420, 0xf08018, 0xf08418, 0xf88818, 0xf88818, 0xf88810, 0xf88810, 0xf88808, 0xf88408, 0xf88408, 0xf88408,
+    0xf89020, 0xf89020, 0xf08818, 0xf08818, 0xf09028, 0xe88c20, 0xd88018, 0xd87c18, 0xe07c18, 0xe08418, 0xf08418, 0xf08818, 0xf88410, 0xf88818, 0xf88410, 0xf88410,
+    0xf88800, 0xf88000, 0xf88810, 0xf88c10, 0xd88c40, 0xd88c38, 0xd8bc90, 0xf8f4c8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+    0xe88420, 0xf8ac48, 0xe8ecd0, 0xf0f4d8, 0xf8ecc0, 0xb0a070, 0xe06c00, 0xf88410, 0xf88018, 0xf88018, 0xf08838, 0xf08c38, 0xf88818, 0xf88820, 0xf88800, 0xf88800,
+    0xf88008, 0xf88008, 0xf88410, 0xf88010, 0xf88018, 0xf08018, 0xe88020, 0xe88420, 0xe88020, 0xe88018, 0xf08010, 0xf08018, 0xf88008, 0xf88410, 0xf88408, 0xf88008,
+    0xf88810, 0xf88810, 0xf88018, 0xf88418, 0xf88020, 0xf88020, 0xf88018, 0xf87c18, 0xf07c08, 0xf87c08, 0xf08008, 0xf08008, 0xf07c18, 0xf07c18, 0xf87c20, 0xf87c20,
+    0xe89020, 0xe89020, 0xf08838, 0xf08838, 0xf88428, 0xf88428, 0xe88c18, 0xe08c10, 0xc09038, 0xf8d480, 0xf8f0e0, 0xf8f0e0, 0xf8f8f0, 0xf8f8f0, 0xf8e4b8, 0xc8b888,
+    0xf88408, 0xf87800, 0xf87800, 0xf88808, 0xf89010, 0xf88808, 0xf88410, 0xf88810, 0xf89020, 0xf88c20, 0xe88828, 0xe88420, 0xe08c30, 0xe89440, 0xe89848, 0xe89448,
+    0xf88c28, 0xf07818, 0xf07810, 0xf88420, 0xf88420, 0xf88018, 0xf88018, 0xf88418, 0xf88418, 0xf88420, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88418,
+    0xd89840, 0xe09840, 0xe8a050, 0xf8b060, 0xf8b868, 0xf8b468, 0xf0a050, 0xe89448, 0xe88c38, 0xe88c38, 0xf08c30, 0xf08c30, 0xf88c20, 0xf88c20, 0xf88818, 0xf88810,
+    0xf88400, 0xf88000, 0xf89018, 0xf88810, 0xd08428, 0xe09438, 0xe0ac68, 0xd8a860, 0xe0a868, 0xe0ac68, 0xe89c48, 0xd88c38, 0xf07c18, 0xf88820, 0xf88810, 0xf88408,
+    0xf88800, 0xf88800, 0xf88818, 0xf89020, 0xd89850, 0xc08440, 0xc0ac88, 0xf8f8d8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+    0xe88828, 0xf8b050, 0xf0f0d8, 0xf8fce0, 0xf8f4c8, 0xb8a478, 0xe07000, 0xf88410, 0xf87c10, 0xf88010, 0xe88830, 0xf08c38, 0xf88820, 0xf88820, 0xf88800, 0xf88800,
+    0xf88410, 0xf88810, 0xf88818, 0xf88818, 0xf88420, 0xf88418, 0xf08420, 0xf08420, 0xf08420, 0xe88420, 0xf08418, 0xf88818, 0xf88810, 0xf88c18, 0xf88c10, 0xf88810,
+    0xf88810, 0xf88808, 0xf88418, 0xf88418, 0xf88020, 0xf87c20, 0xf87c18, 0xf87818, 0xf88410, 0xf88410, 0xf88810, 0xf88810, 0xf88418, 0xf88418, 0xf88020, 0xf88020,
+    0xe89428, 0xe89428, 0xf88c38, 0xf88c38, 0xf88830, 0xf88828, 0xe89018, 0xe89018, 0xc89440, 0xf8d880, 0xf8f0e0, 0xf8f0e0, 0xf8f8f0, 0xf8f4f0, 0xe0d0a0, 0xa09060,
+    0xf88000, 0xf88400, 0xf88808, 0xf88c10, 0xf88808, 0xf88008, 0xf88410, 0xf88c18, 0xf88818, 0xf08418, 0xe88420, 0xe88828, 0xf09840, 0xf8b058, 0xf8c070, 0xf8c070,
+    0xf8a440, 0xf88c28, 0xf88420, 0xf88820, 0xf88820, 0xf88820, 0xf88c28, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88418, 0xf88018,
+    0xd08830, 0xf8b860, 0xf8e090, 0xf8ec98, 0xf8eca0, 0xf8eca0, 0xf8d080, 0xf8ac60, 0xe08830, 0xe08c38, 0xf08c30, 0xf08c30, 0xf89020, 0xf88c20, 0xf88c18, 0xf88c18,
+    0xf88800, 0xf87400, 0xf89018, 0xf88c10, 0xc87820, 0xf8ac50, 0xf8f4b0, 0xf8eca8, 0xf8f4b0, 0xf8eca8, 0xf8c470, 0xe09440, 0xf07810, 0xf88418, 0xf88808, 0xf88008,
+    0xf88000, 0xf88c00, 0xf88818, 0xf88818, 0xd09450, 0xb87c38, 0xb8a080, 0xf8f8d8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xd88830, 0xf8ac50, 0xf0fcf0, 0xe8f4f0, 0xf8f8f0, 0xf0f0e8, 0xf0f4e8, 0xf0f0e8, 0xf8f0c8, 0xe8d8b8, 0xf89c28, 0xe88410, 0xf88410, 0xf88c18, 0xf88418, 0xf88410,
+    0xf88410, 0xf88408, 0xe08420, 0xe88c28, 0xe0e8a0, 0xf0f4a8, 0xf8e8d8, 0xf8ecd8, 0xf8f0e8, 0xf8ece8, 0xf0fcc8, 0xe8f0c0, 0xf8ac58, 0xe08838, 0xf88000, 0xf88808,
+    0xf88808, 0xf88c10, 0xf88c10, 0xf88810, 0xf88828, 0xe07010, 0xd0bc78, 0xf8f0a8, 0xf8f4e0, 0xf0f0d8, 0xf8f0e8, 0xf8f0e8, 0xe0fcc0, 0xe8fcc8, 0xf8c480, 0xd08848,
+    0xf88408, 0xf88408, 0xf88810, 0xf88810, 0xf88808, 0xf88408, 0xf88810, 0xf88410, 0xc08848, 0xf8d088, 0xf8fce8, 0xf8f8e0, 0xf8f8d0, 0xf8ecc8, 0xf8b460, 0xd88c38,
+    0xf88010, 0xf88010, 0xf87c08, 0xf88010, 0xf88010, 0xf88010, 0xf88820, 0xf88420, 0xd89c50, 0xc08438, 0xc0b078, 0xf8ecb8, 0xf8fce0, 0xf8f8d8, 0xf8f8e0, 0xf8fce8,
+    0xf8f0c8, 0xf8ecc8, 0xf8d8a0, 0xd0a470, 0xd08430, 0xe09440, 0xf88c20, 0xf89028, 0xf88410, 0xf88810, 0xf88010, 0xf88008, 0xf88818, 0xf88418, 0xf87c18, 0xf87c18,
+    0xd0d4a8, 0xf0f0c8, 0xf8f4e8, 0xf8f8f0, 0xf8f4f8, 0xf8f0f0, 0xf8fce0, 0xf8f4d0, 0xd8a450, 0xc89440, 0xf88010, 0xf88c18, 0xf88408, 0xf88008, 0xf88c10, 0xf88c10,
+    0xf88808, 0xf88400, 0xd08838, 0xd08838, 0xe8dcb8, 0xf8f4d8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf0, 0xf8f4e0, 0xf8fcd8, 0xf0e4c8, 0xe09c50, 0xc88438, 0xf88400, 0xf88800,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe09038, 0xf8b458, 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8d0, 0xe8dcb8, 0xf8a030, 0xe88410, 0xf88410, 0xf88c18, 0xf88418, 0xf88410,
+    0xf88810, 0xf88410, 0xe08828, 0xe89030, 0xe8f0a8, 0xf8fcb0, 0xf8f0e0, 0xf8f8e8, 0xf8f4f0, 0xf8f0f0, 0xf8fcd0, 0xf0f4c8, 0xf8b060, 0xe08c38, 0xf88408, 0xf88c10,
+    0xf88808, 0xf88c10, 0xf88c10, 0xf88810, 0xf88828, 0xe07010, 0xd0c078, 0xf8f0a8, 0xf8fce8, 0xf8f8e0, 0xf8f8f0, 0xf8f8f0, 0xe8fcc8, 0xf0fcd0, 0xf8cc88, 0xd89050,
+    0xf88408, 0xf88408, 0xf88810, 0xf88810, 0xf88808, 0xf88408, 0xf88810, 0xf88410, 0xc08848, 0xf8cc88, 0xf8fce0, 0xf8f8e0, 0xf8f4d0, 0xf8e0b8, 0xf8a850, 0xd88c38,
+    0xf88010, 0xf88010, 0xf87c08, 0xf88010, 0xf88010, 0xf88010, 0xf88420, 0xf88420, 0xc88840, 0xe8ac60, 0xf0e4b0, 0xf8f4c0, 0xf8f8d8, 0xf8fcd8, 0xf8fce8, 0xf0f4e0,
+    0xf8fcd8, 0xf8f4d0, 0xf8e8b0, 0xf0c490, 0xe8a050, 0xd89040, 0xe87c10, 0xf88818, 0xf88410, 0xf88810, 0xf88410, 0xf88008, 0xf88820, 0xf88418, 0xf87c18, 0xf88018,
+    0xe0e0b8, 0xf8f8d0, 0xf8f4e8, 0xf8f8f0, 0xf8f4f8, 0xf8f0f0, 0xf8fce0, 0xf8f4d8, 0xe8b460, 0xd09c48, 0xf88010, 0xf88818, 0xf88408, 0xf88008, 0xf88c10, 0xf88c10,
+    0xf88800, 0xf88000, 0xd08c40, 0xd89848, 0xf0e0c0, 0xf8f4d8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8f4e0, 0xf8fce0, 0xf8e8c8, 0xe8a050, 0xc88438, 0xf88400, 0xf88800,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe09030, 0xf8b458, 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e4c0, 0xf8a430, 0xf08418, 0xf88410, 0xf88c10, 0xf88418, 0xf88418,
+    0xf88410, 0xf88010, 0xe08428, 0xe89030, 0xe8f0a8, 0xf8fcb8, 0xf8f4e8, 0xf8f8e8, 0xf8f4f8, 0xf8f0f0, 0xf8fcd8, 0xf0f4c8, 0xf8b060, 0xe08c38, 0xf88008, 0xf88810,
+    0xf88810, 0xf88c10, 0xf88810, 0xf88810, 0xf88828, 0xe07010, 0xd0c078, 0xf8f0b0, 0xf8fce8, 0xf8f4e8, 0xf8f8f8, 0xf8f8f0, 0xe8fcd0, 0xf0fcd0, 0xf8c888, 0xd88c50,
+    0xf88408, 0xf88408, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88818, 0xf08418, 0xc08c48, 0xf8cc88, 0xf8f8e0, 0xf8f8e0, 0xf8f0c0, 0xf0d0a0, 0xf09838, 0xe88c28,
+    0xf88810, 0xf88810, 0xf88408, 0xf88810, 0xf88410, 0xf88410, 0xf08828, 0xf08420, 0xd09858, 0xf8d490, 0xf8fcd8, 0xf8f8d0, 0xf0f4e8, 0xf8fce8, 0xf8fcf8, 0xf0f4f0,
+    0xf8fce8, 0xf0f4e0, 0xf8f8d0, 0xf8ecc0, 0xf8c880, 0xe09c50, 0xe07810, 0xf88c28, 0xf88410, 0xf88810, 0xf88410, 0xf88408, 0xf88c20, 0xf88418, 0xf08018, 0xf88020,
+    0xe8f0c8, 0xf8fcd8, 0xf8f4f0, 0xf8f4f0, 0xf8f4f8, 0xf8f0f8, 0xf8fce0, 0xf8f8d8, 0xf8c878, 0xd8a858, 0xf88010, 0xf88418, 0xf88008, 0xf88008, 0xf88c10, 0xf89018,
+    0xf88808, 0xf88000, 0xd89448, 0xf0ac60, 0xf8ecd0, 0xf8f4d8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0f4e8, 0xf8fce0, 0xf8f0d0, 0xf0ac60, 0xd08c40, 0xf88400, 0xf88400,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c30, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e4c0, 0xf8a430, 0xe88410, 0xf88408, 0xf88c10, 0xf88418, 0xf88418,
+    0xf88010, 0xf88008, 0xe08420, 0xe88c30, 0xe8f0a8, 0xf8fcb8, 0xf8f4e8, 0xf8f8e8, 0xf8f4f8, 0xf8f0f0, 0xf8fcd8, 0xf0f4c8, 0xf8b060, 0xe08838, 0xf87c08, 0xf88410,
+    0xf88810, 0xf88c10, 0xf88810, 0xf88810, 0xf88828, 0xe07010, 0xd0c078, 0xf8f4b0, 0xf8f8e8, 0xf8f4e0, 0xf8f8f0, 0xf8f4f0, 0xe8fcc8, 0xe8fcd0, 0xf8c888, 0xd88848,
+    0xf88408, 0xf88408, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88818, 0xf08418, 0xc89050, 0xf8d090, 0xf8f8e0, 0xf8fce0, 0xf8f0c0, 0xe0c498, 0xe88c28, 0xe89030,
+    0xf88c10, 0xf88c10, 0xf88810, 0xf88810, 0xf88410, 0xf88410, 0xf08428, 0xe88020, 0xf8c888, 0xf8e8a8, 0xf8f8d0, 0xf8f4d0, 0xf8fcf0, 0xf8fcf0, 0xf0f4f0, 0xf8fcf8,
+    0xf8f8e8, 0xf0f4e0, 0xf8f8d0, 0xf8f8d0, 0xf8e098, 0xf0ac60, 0xe87c10, 0xf89028, 0xf88410, 0xf88810, 0xf88410, 0xf88410, 0xf88c20, 0xf88418, 0xf88020, 0xf88420,
+    0xf0f4d0, 0xf8fcd8, 0xf8f0e8, 0xf8f4f0, 0xf8f4f8, 0xf8f4f8, 0xf8fce0, 0xf8f8d8, 0xf8d888, 0xe0ac58, 0xf88010, 0xf88010, 0xf88008, 0xf88008, 0xf88c10, 0xf88c10,
+    0xf88808, 0xf87c00, 0xd89850, 0xf8c478, 0xf8f4d8, 0xf8f8d8, 0xf8f8e8, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8, 0xf8fce0, 0xf8f4d8, 0xf8b868, 0xd09048, 0xf88000, 0xf88400,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c38, 0xf8b458, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e4c0, 0xf8a030, 0xe88010, 0xf88010, 0xf88c18, 0xf88418, 0xf88418,
+    0xf88818, 0xf88410, 0xe08428, 0xe89030, 0xe8f4b0, 0xf8fcc0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f4f8, 0xf8fcd8, 0xf0f4d0, 0xf8b060, 0xe08c40, 0xf88010, 0xf88810,
+    0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88828, 0xe07010, 0xd0c080, 0xf8f4b8, 0xf8fcf0, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xf0fcd8, 0xf8c890, 0xd88c50,
+    0xf88008, 0xf88408, 0xf88810, 0xf88c10, 0xf88c10, 0xf88810, 0xf08820, 0xf08420, 0xc89450, 0xf8d090, 0xf8f8d8, 0xf8fce0, 0xf8f0b8, 0xe0b880, 0xe88010, 0xf89028,
+    0xf89010, 0xf89010, 0xf88c10, 0xf88c10, 0xf88818, 0xf88810, 0xe88830, 0xe08428, 0xf8e8b0, 0xf8ecb8, 0xf8f0d8, 0xf8f4e0, 0xf8fcf8, 0xf8fcf8, 0xe8f4f8, 0xf8fcf8,
+    0xe8fcf8, 0xe8fcf8, 0xf8fce8, 0xf8f8e0, 0xf8e8a8, 0xf0b478, 0xe07810, 0xf08820, 0xf88810, 0xf88810, 0xf88410, 0xf88810, 0xf88c20, 0xf88818, 0xf08420, 0xf88c28,
+    0xf0f4d0, 0xf8fcd8, 0xf8f0f0, 0xf8f8f8, 0xf8f8f8, 0xf8f4f8, 0xf8fce8, 0xf0f8d8, 0xf8e090, 0xd8ac60, 0xf08010, 0xf88418, 0xf88410, 0xf88410, 0xf88c18, 0xf88c18,
+    0xf88808, 0xf87c00, 0xd89c58, 0xf8d088, 0xf8f8e0, 0xf8fce0, 0xf0f8f0, 0xf8fcf8, 0xf0fcf0, 0xf0f8f0, 0xf8fce8, 0xf8f4e0, 0xf8c078, 0xd09450, 0xf88000, 0xf88408,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c38, 0xf8b458, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a030, 0xe88010, 0xf88010, 0xf88c18, 0xf88418, 0xf88418,
+    0xf88418, 0xf88010, 0xe08428, 0xe88c30, 0xe8f0b0, 0xf8fcc0, 0xf8f4e8, 0xf8f4e8, 0xf8f4f8, 0xf8f0f8, 0xf8fcd8, 0xf0f4c8, 0xf8b060, 0xe08840, 0xf88008, 0xf88810,
+    0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88828, 0xe07010, 0xd0c080, 0xf8f4b8, 0xf8fcf0, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xf0fcd8, 0xf0fcd8, 0xf8c890, 0xd88c50,
+    0xf88008, 0xf88408, 0xf88810, 0xf88c10, 0xf88c10, 0xf88810, 0xf08820, 0xf08420, 0xc08c50, 0xf8d090, 0xf8f4d8, 0xf8f8d8, 0xf8f0b8, 0xe0b480, 0xe07808, 0xf89020,
+    0xf89010, 0xf89010, 0xf88c10, 0xf88c10, 0xf88818, 0xf88810, 0xe88830, 0xe08428, 0xf8e0a8, 0xf8f0b8, 0xf8fce8, 0xf8f4e0, 0xf0fcf8, 0xf8fcf8, 0xf0f8f8, 0xf0f4f8,
+    0xe8fcf8, 0xe8fcf8, 0xf8f8e0, 0xf8f0d8, 0xf8e8a8, 0xf0b878, 0xe07810, 0xf08820, 0xf88810, 0xf88810, 0xf88408, 0xf88410, 0xf88c20, 0xf88418, 0xf08820, 0xf89430,
+    0xf0f0d0, 0xf8fcd8, 0xf8f0f0, 0xf8f8f8, 0xf8f8f8, 0xf8f4f8, 0xf8fce8, 0xf8f8e0, 0xf8e090, 0xd8ac60, 0xf07c10, 0xf88418, 0xf88410, 0xf88410, 0xf88c18, 0xf88c18,
+    0xf88808, 0xf87c00, 0xd89c58, 0xf8d890, 0xf8f4d8, 0xf8fce0, 0xf0f8f0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xf8fce0, 0xf8f4d8, 0xf8c480, 0xd89850, 0xf87c00, 0xf88808,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c30, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a030, 0xe88418, 0xf88410, 0xf88c18, 0xf88418, 0xf88418,
+    0xf88018, 0xf87c10, 0xd88028, 0xe08830, 0xe8f0b0, 0xf8fcc0, 0xf8f0f0, 0xf8f4f0, 0xf8f4f8, 0xf8f0f8, 0xf8fcd8, 0xe8f0d0, 0xf8ac60, 0xe08840, 0xf87c10, 0xf88418,
+    0xf88810, 0xf88c10, 0xf88818, 0xf88410, 0xf88830, 0xe07018, 0xd0c080, 0xf8f4b8, 0xf8f8f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xe8fcd8, 0xf8c490, 0xd08858,
+    0xf88008, 0xf88408, 0xf88810, 0xf88818, 0xf88c18, 0xf88818, 0xf08820, 0xe88820, 0xb88c50, 0xf8d498, 0xf8f4d8, 0xf8f8d8, 0xf8f0b8, 0xe0b880, 0xe07408, 0xf89020,
+    0xf88c10, 0xf88c10, 0xf88810, 0xf88c10, 0xf88818, 0xf88418, 0xe88830, 0xe08430, 0xf8e4b0, 0xf8f0b8, 0xf8fce8, 0xf8fce8, 0xe0f8f8, 0xf0fcf8, 0xe8fcf8, 0xe0f8f8,
+    0xe0fcf8, 0xe0fcf8, 0xf8f4e8, 0xf8f4e8, 0xf8ecb0, 0xf0b880, 0xe07810, 0xf89028, 0xf88810, 0xf88810, 0xf88408, 0xf88810, 0xf89020, 0xf88818, 0xf08c28, 0xf89c38,
+    0xf0f4d8, 0xf8fce0, 0xf8f4f0, 0xf8fcf8, 0xf8fcf8, 0xf0f4f8, 0xf8fce8, 0xf8fce8, 0xf8e898, 0xd8b068, 0xf07c18, 0xf88420, 0xf88418, 0xf88410, 0xf88c18, 0xf88c18,
+    0xf88810, 0xf87c00, 0xd89c58, 0xf8d898, 0xf0f0d8, 0xf8fce8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8f8e8, 0xf8f0e0, 0xf8c880, 0xd89c58, 0xf87c00, 0xf88c10,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c38, 0xf8b458, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e4c0, 0xf8a030, 0xf08418, 0xf88410, 0xf88c18, 0xf88418, 0xf88418,
+    0xf88418, 0xf88010, 0xd88028, 0xe88c38, 0xf0f4b8, 0xf8fcc8, 0xf8f8f0, 0xf8f8f0, 0xf8f4f8, 0xf8f0f8, 0xf8fce0, 0xf0f4d0, 0xf8b068, 0xe08c40, 0xf88010, 0xf88818,
+    0xf88810, 0xf88c10, 0xf88818, 0xf88410, 0xf88830, 0xe07018, 0xd0c080, 0xf8f4b8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xf0fcd8, 0xf8c890, 0xd08858,
+    0xf88008, 0xf88408, 0xf88810, 0xf88818, 0xf88c18, 0xf88818, 0xf08820, 0xe88820, 0xb88c50, 0xf8d898, 0xf8f8d8, 0xf8f8d8, 0xf8f4b8, 0xe8bc80, 0xe87808, 0xf89020,
+    0xf88c10, 0xf88c10, 0xf88810, 0xf88c10, 0xf88818, 0xf88818, 0xe88830, 0xe08830, 0xf8f4c0, 0xf8e8b0, 0xf8fce8, 0xf8fce8, 0xe0f8f8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8,
+    0xe0fcf8, 0xe0fcf8, 0xf8f8e8, 0xf8fcf0, 0xf8f0b8, 0xe8b078, 0xd06c08, 0xf89030, 0xf88810, 0xf88810, 0xf88408, 0xf88408, 0xf89020, 0xf88818, 0xf09028, 0xf8a040,
+    0xf0f8d8, 0xf8fce0, 0xf8f4f0, 0xf8fcf8, 0xf8f8f8, 0xf0f0f8, 0xf8fce8, 0xf8fce8, 0xf8eca0, 0xd8b468, 0xf08018, 0xf88420, 0xf88410, 0xf88410, 0xf88c18, 0xf89020,
+    0xf88810, 0xf87c00, 0xd89c58, 0xf8d890, 0xf0e8d0, 0xf8fce8, 0xf0fcf8, 0xf0f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8f8e0, 0xf0f0d8, 0xf8c888, 0xd89c58, 0xf87c00, 0xf88c10,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c38, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a038, 0xe88418, 0xf88410, 0xf88818, 0xf88418, 0xf88820,
+    0xf88418, 0xf88018, 0xd88430, 0xe89038, 0xe8f0b8, 0xf8fcc0, 0xf8f4f0, 0xf8f8f0, 0xf8f4f8, 0xf8f4f8, 0xf8fce0, 0xe8f4d0, 0xf8b068, 0xe08c48, 0xf88010, 0xf88818,
+    0xf88810, 0xf88c10, 0xf88818, 0xf88410, 0xf88830, 0xe07018, 0xd0c080, 0xf8f4b8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fce0, 0xf0fce0, 0xf8c898, 0xd08858,
+    0xf88408, 0xf88408, 0xf88818, 0xf88c18, 0xf88c18, 0xf88818, 0xf08820, 0xf08820, 0xb88848, 0xf8dca0, 0xf8f4d8, 0xf8f8e0, 0xf8f8c8, 0xe8c890, 0xe88820, 0xf09028,
+    0xf88818, 0xf88410, 0xf88010, 0xf88810, 0xf88818, 0xf88018, 0xe88428, 0xe88028, 0xf8cc88, 0xf8e4a0, 0xf8f8d0, 0xf8fcd8, 0xf0fcf8, 0xe8fcf0, 0xe0f8f8, 0xf0fcf8,
+    0xf0fcf8, 0xe8f4f0, 0xf8f4d8, 0xf8fce0, 0xf8e0a0, 0xe8a868, 0xe88018, 0xf08420, 0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88c20, 0xf88818, 0xf09028, 0xf89c38,
+    0xf0f4d8, 0xf8fce0, 0xf8f8f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf8fce8, 0xf0fce8, 0xf8eca0, 0xd0b068, 0xe87c18, 0xf08820, 0xf88418, 0xf88410, 0xf88c18, 0xf88c18,
+    0xf88810, 0xf07800, 0xd89c58, 0xf8d090, 0xf8f4e8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd09858, 0xf88008, 0xf88810,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c38, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a038, 0xe88418, 0xf88410, 0xf88818, 0xf88418, 0xf88820,
+    0xf88418, 0xf88018, 0xd88430, 0xe89038, 0xe8f0b8, 0xf8fcc0, 0xf8f4f0, 0xf8f8f0, 0xf8f4f8, 0xf8f4f8, 0xf8fce0, 0xe8f4d0, 0xf8b068, 0xe08c48, 0xf88010, 0xf88818,
+    0xf88810, 0xf88c10, 0xf88818, 0xf88410, 0xf88830, 0xe07018, 0xd0c080, 0xf8f4b8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fce0, 0xf0fce0, 0xf8c898, 0xd08858,
+    0xf88408, 0xf88408, 0xf88818, 0xf88c18, 0xf88c18, 0xf88818, 0xf08820, 0xf08820, 0xc08c50, 0xf8dca0, 0xf8f4d8, 0xf8f4d8, 0xf8f8c0, 0xf0cc98, 0xf89430, 0xf08c28,
+    0xf88818, 0xf88810, 0xf88410, 0xf88410, 0xf88418, 0xf88010, 0xf08428, 0xf08428, 0xc09450, 0xf8d898, 0xf8fcd8, 0xf8f8d0, 0xe0f4e8, 0xf0fcf8, 0xf0fcf8, 0xe0f0f0,
+    0xe8f4f0, 0xf0fcf8, 0xf8fce0, 0xf8f0d0, 0xf8c078, 0xd89850, 0xf08420, 0xf89028, 0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88c20, 0xf88818, 0xf09028, 0xf89c38,
+    0xf0f4d8, 0xf8fce0, 0xf8f8f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf8fce8, 0xf0fce8, 0xf8eca0, 0xd0b068, 0xe87c18, 0xf08820, 0xf88418, 0xf88410, 0xf88c18, 0xf88c18,
+    0xf88810, 0xf07800, 0xd89c58, 0xf8d090, 0xf8f4e8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd09858, 0xf88008, 0xf88810,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c38, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88418, 0xf88820,
+    0xf88418, 0xf88010, 0xd88430, 0xe89038, 0xe8f4b0, 0xf8fcc0, 0xf8f4f0, 0xf8f8f0, 0xf8f4f8, 0xf8f4f8, 0xf8fce0, 0xe8f4d0, 0xf8b068, 0xe08c40, 0xf88010, 0xf88818,
+    0xf88c10, 0xf88c10, 0xf88810, 0xf88410, 0xf88830, 0xe07418, 0xd0c080, 0xf8f4b8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fce0, 0xe8fce0, 0xf8c898, 0xd08858,
+    0xf88410, 0xf88810, 0xf88818, 0xf88c18, 0xf88810, 0xf88810, 0xf88818, 0xf88410, 0xc08848, 0xf8d490, 0xf8f8e0, 0xf8f8e0, 0xf8f8d8, 0xf8e8c0, 0xf8b868, 0xd89040,
+    0xf88418, 0xf88418, 0xf88010, 0xf88418, 0xf88010, 0xf87c10, 0xf88420, 0xf88420, 0xc88030, 0xf0ac58, 0xf8e4a0, 0xf8f8b8, 0xf8f8c8, 0xf8fcc8, 0xf8fcd0, 0xf8f4c8,
+    0xf8f4c8, 0xf8f8c8, 0xf8f0b0, 0xf8c888, 0xe89840, 0xd88430, 0xf08418, 0xf89020, 0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88c20, 0xf88818, 0xf08c28, 0xf89c38,
+    0xf0f4d8, 0xf8fce0, 0xf8f8f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf0fce8, 0xf0fce8, 0xf8eca0, 0xd0b468, 0xe87c10, 0xf08820, 0xf88418, 0xf88010, 0xf88818, 0xf88c18,
+    0xf88810, 0xf07800, 0xd89c58, 0xf8d090, 0xf8f4e8, 0xf8f8e8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd09858, 0xf88008, 0xf88810,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c38, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88418, 0xf88820,
+    0xf88418, 0xf88010, 0xd88430, 0xe89038, 0xe8f4b0, 0xf8fcc0, 0xf8f4f0, 0xf8f8f0, 0xf8f4f8, 0xf8f4f8, 0xf8fce0, 0xe8f4d0, 0xf8b068, 0xe08c40, 0xf88010, 0xf88818,
+    0xf88c10, 0xf88c10, 0xf88810, 0xf88410, 0xf88830, 0xe07418, 0xd0c080, 0xf8f4b8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fce0, 0xe8fce0, 0xf8c898, 0xd08858,
+    0xf88410, 0xf88810, 0xf88818, 0xf88c18, 0xf88810, 0xf88810, 0xf88818, 0xf88410, 0xb88440, 0xf8d088, 0xf8fce8, 0xf8fce8, 0xf8fcd8, 0xf8f4d0, 0xf8c470, 0xc88430,
+    0xf88018, 0xf88018, 0xf88010, 0xf88418, 0xf88410, 0xf87c10, 0xf88420, 0xf88820, 0xe09c48, 0xc88430, 0xc8ac68, 0xf8e0a0, 0xf8f0c0, 0xf8e8b8, 0xf8f0c8, 0xf8f8d0,
+    0xf8f0c0, 0xf8e8b8, 0xf8c488, 0xd0a060, 0xd88830, 0xe08c38, 0xf88c20, 0xf88c20, 0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88c20, 0xf88818, 0xf08c28, 0xf89c38,
+    0xf0f4d8, 0xf8fce0, 0xf8f8f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf0fce8, 0xf0fce8, 0xf8eca0, 0xd0b468, 0xe87c10, 0xf08820, 0xf88418, 0xf88010, 0xf88818, 0xf88c18,
+    0xf88810, 0xf07800, 0xd89c58, 0xf8d090, 0xf8f4e8, 0xf8f8e8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd09858, 0xf88008, 0xf88810,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c38, 0xf8b060, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xf0e0c8, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88420, 0xf88820,
+    0xf88410, 0xf88410, 0xd88428, 0xe89038, 0xe8f4b0, 0xf8fcc0, 0xf8f4e8, 0xf8f8f0, 0xf8f8f8, 0xf8f4f8, 0xf8fcd8, 0xe8f8d0, 0xf8b068, 0xe08c40, 0xf88008, 0xf88810,
+    0xf88c08, 0xf88c08, 0xf88810, 0xf88808, 0xf88828, 0xe07410, 0xd0c080, 0xf8f8b0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xe8fce0, 0xf8c898, 0xd08858,
+    0xf88410, 0xf88818, 0xf88818, 0xf88c18, 0xf88808, 0xf88808, 0xf88408, 0xf88408, 0xc88840, 0xf8d088, 0xf8fcf0, 0xf8fce8, 0xf0f8e8, 0xf0f8e0, 0xf8d098, 0xb08c50,
+    0xe88018, 0xf08418, 0xf88410, 0xf88c18, 0xf88810, 0xf88408, 0xf88410, 0xf88818, 0xf89428, 0xe88018, 0xd88828, 0xe89c40, 0xf0a858, 0xf8c068, 0xf8c878, 0xf8b868,
+    0xf8b858, 0xf8a448, 0xe88828, 0xe08420, 0xf08818, 0xf89020, 0xf89018, 0xf88c18, 0xf88810, 0xf88c18, 0xf88818, 0xf88410, 0xf88c20, 0xf88818, 0xf88c20, 0xf89830,
+    0xf8f4d0, 0xf8fcd8, 0xf8f8f8, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fce8, 0xf0fce0, 0xf8eca0, 0xd0b460, 0xe87c10, 0xf88818, 0xf88410, 0xf88010, 0xf88818, 0xf88c18,
+    0xf88810, 0xf87800, 0xd89c58, 0xf8cc90, 0xf8f4e0, 0xf8f8e8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd89858, 0xf87c08, 0xf88810,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c38, 0xf8b060, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xf0e0c8, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88420, 0xf88820,
+    0xf88410, 0xf88410, 0xd88428, 0xe89038, 0xe8f4b0, 0xf8fcc0, 0xf8f4e8, 0xf8f8f0, 0xf8f8f8, 0xf8f4f8, 0xf8fcd8, 0xe8f8d0, 0xf8b068, 0xe08c40, 0xf88008, 0xf88810,
+    0xf88c08, 0xf88c08, 0xf88810, 0xf88808, 0xf88828, 0xe07410, 0xd0c080, 0xf8f8b0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xe8fce0, 0xf8c898, 0xd08858,
+    0xf88410, 0xf88818, 0xf88818, 0xf88c18, 0xf88808, 0xf88808, 0xf88408, 0xf88408, 0xc89048, 0xf8d088, 0xf0fce8, 0xf0fce8, 0xf0f8e0, 0xf8fce8, 0xf8e8b0, 0xe0bc80,
+    0xe88418, 0xf08418, 0xf88410, 0xf88c18, 0xf88810, 0xf88410, 0xf88810, 0xf88818, 0xf08c20, 0xf08c20, 0xd89030, 0xd08828, 0xd08430, 0xe09440, 0xf09848, 0xe09040,
+    0xe09030, 0xd88828, 0xe08420, 0xe88c28, 0xf89028, 0xf89020, 0xf88810, 0xf88c18, 0xf88810, 0xf88c18, 0xf88818, 0xf88410, 0xf88c20, 0xf88818, 0xf88c20, 0xf89830,
+    0xf8f4d0, 0xf8fcd8, 0xf8f8f8, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fce8, 0xf0fce0, 0xf8eca0, 0xd0b460, 0xe87c10, 0xf88818, 0xf88410, 0xf88010, 0xf88818, 0xf88c18,
+    0xf88810, 0xf87800, 0xd89c58, 0xf8cc90, 0xf8f4e0, 0xf8f8e8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd89858, 0xf87c08, 0xf88810,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c38, 0xf8b060, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xf0e0c8, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88420, 0xf88820,
+    0xf88410, 0xf88410, 0xd88428, 0xe89030, 0xe8f4b0, 0xf8fcc0, 0xf8f4e8, 0xf8f8f0, 0xf8f8f8, 0xf8f4f8, 0xf8fcd8, 0xe8f8d0, 0xf8b060, 0xe08c40, 0xf88008, 0xf88810,
+    0xf88c08, 0xf88c08, 0xf88c08, 0xf88808, 0xf88c28, 0xe07410, 0xd0c480, 0xf8f8b0, 0xf8fcf0, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xe8fce0, 0xf8c898, 0xd08858,
+    0xf08818, 0xf88818, 0xf88818, 0xf88c18, 0xf88800, 0xf88400, 0xf88400, 0xf88000, 0xc88c40, 0xf8cc88, 0xf0fce8, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8fcd0, 0xf8f0c8,
+    0xe89428, 0xe09020, 0xf88810, 0xf88c18, 0xf88808, 0xf88808, 0xf88408, 0xf88808, 0xf88810, 0xf88c10, 0xf88818, 0xf88418, 0xf88018, 0xf07810, 0xf87410, 0xf88020,
+    0xf88808, 0xf88808, 0xf88c10, 0xf89010, 0xf89010, 0xf88808, 0xf88408, 0xf88808, 0xf88810, 0xf88c18, 0xf88818, 0xf88418, 0xf88820, 0xf88818, 0xf88820, 0xf89830,
+    0xf8f4d0, 0xf8f8d8, 0xf8f8f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fce0, 0xf0fce0, 0xf8ec98, 0xd0b460, 0xf07c10, 0xf88418, 0xf88410, 0xf88010, 0xf88818, 0xf88c18,
+    0xf88810, 0xf87800, 0xd89c58, 0xf8cc88, 0xf8f4e0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd89858, 0xf87c08, 0xf88810,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+    0xe08c38, 0xf8b060, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xf0e0c8, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88420, 0xf88820,
+    0xf88410, 0xf88410, 0xd88428, 0xe89030, 0xe8f4b0, 0xf8fcc0, 0xf8f4e8, 0xf8f8f0, 0xf8f8f8, 0xf8f4f8, 0xf8fcd8, 0xe8f8d0, 0xf8b060, 0xe08c40, 0xf88008, 0xf88810,
+    0xf88c08, 0xf88c08, 0xf88c08, 0xf88808, 0xf88c28, 0xe07410, 0xd0c480, 0xf8f8b0, 0xf8fcf0, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xe8fce0, 0xf8c898, 0xd08858,
+    0xf08818, 0xf88818, 0xf88818, 0xf88c18, 0xf88800, 0xf88400, 0xf88400, 0xf88000, 0xc88840, 0xf8d088, 0xf0fcf0, 0xf8fcf0, 0xf0fcf8, 0xe8f8f0, 0xf8f4c8, 0xf8f4d0,
+    0xf09c30, 0xe89428, 0xf08810, 0xf88810, 0xf88408, 0xf88408, 0xf88408, 0xf88408, 0xf88008, 0xf89018, 0xf88818, 0xf08010, 0xf88820, 0xf88828, 0xf87c18, 0xf88420,
+    0xf88808, 0xf88c08, 0xf88808, 0xf88408, 0xf88408, 0xf88810, 0xf88c10, 0xf88810, 0xf88810, 0xf88c18, 0xf88818, 0xf88418, 0xf88820, 0xf88818, 0xf88820, 0xf89830,
+    0xf8f4d0, 0xf8f8d8, 0xf8f8f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fce0, 0xf0fce0, 0xf8ec98, 0xd0b460, 0xf07c10, 0xf88418, 0xf88410, 0xf88010, 0xf88818, 0xf88c18,
+    0xf88810, 0xf87800, 0xd89c58, 0xf8cc88, 0xf8f4e0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd89858, 0xf87c08, 0xf88810,
+    0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf0f8f0, 0xf0fcf8, 0xf0fcf8, 0xf8f4e8, 0xf8fcf0, 0xf8f4c0, 0xe0b078, 0xf08420, 0xf08420, 0xf88000, 0xf88000, 0xf88400, 0xf88400, 0xf88410, 0xf88410,
+    0xe08c48, 0xf8b878, 0xf8f4d8, 0xf8f4d8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8d0, 0xf8dcb0, 0xf8a040, 0xe08020, 0xf88000, 0xf88808, 0xf88400, 0xf88400,
+    0xf88400, 0xf88c00, 0xd88418, 0xe89428, 0xf8e4b0, 0xf8f8c0, 0xe8f4e0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf8f8d0, 0xf8e8c0, 0xf8b058, 0xe08830, 0xf87c08, 0xf88810,
+    0xf88400, 0xf88000, 0xf88808, 0xf87c00, 0xe88820, 0xe07c18, 0xe0b880, 0xf8f0b8, 0xf0fcf0, 0xf0f8f0, 0xe8fcf8, 0xe8fcf8, 0xf8fce0, 0xf8f8d8, 0xf8c888, 0xd09458,
+    0xf88008, 0xf88408, 0xf88400, 0xf88400, 0xf88400, 0xf88000, 0xf08418, 0xf08418, 0xc89458, 0xf8cc90, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+    0xf8e4b8, 0xe0a070, 0xc87430, 0xe08448, 0xf89030, 0xf88020, 0xf87800, 0xf88c10, 0xf88000, 0xf88000, 0xf88400, 0xf88800, 0xf88c10, 0xf88808, 0xf09018, 0xf89018,
+    0xf88808, 0xf88000, 0xf88810, 0xf89018, 0xf88810, 0xf07c08, 0xf88410, 0xf88c18, 0xf88808, 0xf88808, 0xf88400, 0xf88400, 0xf88808, 0xf88000, 0xf88810, 0xf8a028,
+    0xf8f4d8, 0xf8f8d8, 0xf8f4f0, 0xf8f8f0, 0xf8f4f8, 0xf8f0f8, 0xf8f4d8, 0xf8f4d8, 0xf8e890, 0xe8b460, 0xe08c10, 0xe08c10, 0xf88408, 0xf88808, 0xf88418, 0xf88018,
+    0xf08c10, 0xe07c00, 0xe09c40, 0xf8d078, 0xf8f0c8, 0xf8f0c8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0f4f8, 0xf8f0c8, 0xf8ecc8, 0xf8c870, 0xe09c40, 0xe88408, 0xf08c10,
+    0xf88400, 0xf89800, 0xf88c08, 0xf88400, 0xe09038, 0xd88430, 0xd8a478, 0xf8ecc0, 0xf8f4e8, 0xf8f4e8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+    0xf8fcf8, 0xf8f8f0, 0xf0fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8fcf0, 0xf8f4c0, 0xe0b080, 0xf08420, 0xf08420, 0xf88000, 0xf88400, 0xf88408, 0xf88400, 0xf88410, 0xf88410,
+    0xd88848, 0xf8b070, 0xf8f0d0, 0xf8f4d8, 0xe8f8f8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8d0, 0xf8e0b8, 0xf8a448, 0xe88828, 0xf88408, 0xf88c08, 0xf88400, 0xf88000,
+    0xf88400, 0xf88800, 0xd88018, 0xe89028, 0xf8e0a8, 0xf8f8c0, 0xe8f4e0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf8f4c8, 0xf8e8c0, 0xf8ac58, 0xe08830, 0xf87c00, 0xf88410,
+    0xf88800, 0xf88400, 0xf89010, 0xf88808, 0xf08c28, 0xe07c18, 0xd8b480, 0xf8ecb8, 0xf8fcf8, 0xf0fcf0, 0xe8fcf8, 0xe8f8f8, 0xf8f4d8, 0xf8f4d8, 0xf8c888, 0xc89050,
+    0xf88408, 0xf88808, 0xf88400, 0xf88800, 0xf88400, 0xf88400, 0xf88818, 0xf08418, 0xc09050, 0xf8c888, 0xf8f8e0, 0xf8f4e0, 0xf0f4e8, 0xf8fcf0, 0xf8fce8, 0xf8fcf0,
+    0xf8ecc0, 0xf8d4a8, 0xf8a060, 0xd07838, 0xf07410, 0xf88c28, 0xf88808, 0xf87400, 0xf88400, 0xf88000, 0xf88800, 0xf88c00, 0xf89010, 0xf88c08, 0xf09018, 0xf89018,
+    0xf88408, 0xf88808, 0xf88408, 0xf87c00, 0xf89018, 0xf8a028, 0xf89420, 0xf07800, 0xf88408, 0xf88808, 0xf88000, 0xf88400, 0xf88808, 0xf88000, 0xf88810, 0xf89c28,
+    0xf8f4d8, 0xf8f8d8, 0xf8f8f0, 0xf8f8f0, 0xf8f4f8, 0xf8f4f8, 0xf8f4d8, 0xf8f4d8, 0xf8e490, 0xe0ac58, 0xd88808, 0xe08c10, 0xf88808, 0xf88810, 0xf88010, 0xf87c10,
+    0xf89418, 0xe88000, 0xe09c48, 0xf8d078, 0xf8f4d0, 0xf8f4d0, 0xf0f8f8, 0xf0f4f8, 0xf8fcf8, 0xf0f8f8, 0xf8f8d0, 0xf8f0d0, 0xf8c470, 0xe09840, 0xe88408, 0xf89010,
+    0xf88800, 0xf89000, 0xf88808, 0xf88808, 0xe09038, 0xc87820, 0xd09c70, 0xf8f4c8, 0xf8f8e8, 0xf8f8e8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8f4c8, 0xd0b088, 0xe08430, 0xe08830, 0xf88418, 0xf88818, 0xf88820, 0xf88818, 0xe88828, 0xe88428,
+    0xc88c50, 0xf8b880, 0xf8f4e0, 0xf8f8e0, 0xe8f8f8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f4d0, 0xf8dcb8, 0xf0a050, 0xd08438, 0xf08420, 0xf89028, 0xf88c18, 0xf88818,
+    0xf89020, 0xf89020, 0xc88430, 0xe09840, 0xf8e4b8, 0xf8fcd0, 0xe8f8e8, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf8f8d8, 0xf8ecc8, 0xf8b470, 0xd89048, 0xf08428, 0xf89030,
+    0xf88c20, 0xf08818, 0xf89028, 0xf08820, 0xe08c38, 0xc87420, 0xd0ac80, 0xf8ecc0, 0xf8fcf8, 0xf0fcf0, 0xe8fcf8, 0xe8fcf8, 0xf8f8e0, 0xf8fce8, 0xf8d0a0, 0xc09460,
+    0xe08828, 0xe88828, 0xf88818, 0xf88818, 0xf88818, 0xf88410, 0xe08830, 0xe08828, 0xb89060, 0xf8d0a0, 0xf8fcf0, 0xf8f8e8, 0xf0f4f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8,
+    0xf8fce0, 0xf0fce0, 0xf0e4b0, 0xc8bc88, 0xc89048, 0xb88440, 0xe08428, 0xe88c30, 0xf88c20, 0xf88820, 0xf88820, 0xf88820, 0xf88828, 0xf88420, 0xf88028, 0xf88428,
+    0xe89038, 0xd88028, 0xc87c28, 0xf0a050, 0xf8c070, 0xf0a858, 0xc88030, 0xd08430, 0xe08c30, 0xe09038, 0xe08830, 0xe08830, 0xe09038, 0xd88c38, 0xd89040, 0xe8a050,
+    0xf0f8e0, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f4f8, 0xf8f8e8, 0xf8f8e8, 0xf8eca8, 0xd0b070, 0xc08828, 0xc89030, 0xe89038, 0xe89030, 0xf08438, 0xf08438,
+    0xd88c30, 0xc87c20, 0xc89858, 0xf8cc90, 0xf8f4e0, 0xf8fce8, 0xf0fcf8, 0xe8f8f8, 0xe8f8f8, 0xf0fcf8, 0xf8fce8, 0xf8f0d8, 0xf0c080, 0xc89458, 0xc88028, 0xd89038,
+    0xf09020, 0xf08c20, 0xd88830, 0xe09438, 0xc89458, 0xb07838, 0xb89c78, 0xf8f8d8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8ecc8, 0xd0ac80, 0xe08838, 0xe88c38, 0xf88820, 0xf88c20, 0xf89028, 0xf88c20, 0xf08c30, 0xe88c30,
+    0xc88850, 0xf0b480, 0xf8f8e0, 0xf8f8e0, 0xf0fcf8, 0xf0fcf8, 0xe8f8f8, 0xf0fcf8, 0xf8f8d8, 0xf8e4c0, 0xf0a858, 0xd08838, 0xf08420, 0xf89028, 0xf89020, 0xf89420,
+    0xf08c18, 0xf08818, 0xc88028, 0xe09840, 0xf0e0b0, 0xf8fcd0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf8f8d8, 0xf8e8c8, 0xf8b068, 0xd88c48, 0xe88020, 0xf08428,
+    0xf89428, 0xf88c20, 0xf89028, 0xf08820, 0xe08c38, 0xd07c28, 0xe0b888, 0xf8f8c8, 0xf0f8f0, 0xf0fcf0, 0xf0fcf8, 0xe8fcf8, 0xf8fce8, 0xf8fce8, 0xf8cc98, 0xb88c58,
+    0xe88c28, 0xe88c30, 0xf88c20, 0xf88c20, 0xf88c18, 0xf88818, 0xe88c30, 0xe88c30, 0xb08858, 0xf8cca0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8f8e8, 0xf8f4e8,
+    0xf0fcd8, 0xf0f8d8, 0xf8fcc8, 0xf8f8c8, 0xf8c880, 0xc89450, 0xe08428, 0xf89840, 0xf88c28, 0xf88820, 0xf88418, 0xf88820, 0xf88420, 0xf88020, 0xf88020, 0xf88428,
+    0xe89038, 0xd88428, 0xe89848, 0xf8d888, 0xf8f0a0, 0xe8a050, 0xc07420, 0xe89c48, 0xe08830, 0xe08c38, 0xe08830, 0xe08830, 0xe09038, 0xd88c38, 0xd89040, 0xe89c50,
+    0xf0f8e8, 0xf8fce8, 0xf8f8f8, 0xf8fcf8, 0xf8f8f8, 0xf8f4f8, 0xf8f8e8, 0xf8f8e8, 0xf8eca8, 0xd0b070, 0xc08c28, 0xc89030, 0xe08c30, 0xe08c30, 0xf08438, 0xf08840,
+    0xd89038, 0xd08428, 0xd09c60, 0xf8cc90, 0xf8f0d8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xe8f8f8, 0xf0fcf8, 0xf8f8e0, 0xf8ecd8, 0xf0c080, 0xd09c60, 0xd88c30, 0xe09438,
+    0xf08c20, 0xe88818, 0xd88828, 0xe09038, 0xc89458, 0xb07c40, 0xb8a080, 0xf8ecd0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8f4d8, 0xd0bca0, 0xc08850, 0xc08850, 0xd88840, 0xd88c40, 0xd08c48, 0xd08c48, 0xc08c50, 0xc08c50,
+    0xc09470, 0xe0b898, 0xf8f4e8, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f0d8, 0xe8d8c0, 0xd8a870, 0xb88850, 0xc08848, 0xc88c48, 0xc88c48, 0xc88c48,
+    0xd09048, 0xd09040, 0xb88c50, 0xd0a468, 0xe8dcc0, 0xf8f8e0, 0xf0fcf0, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8f8e0, 0xf8e8d0, 0xe8b888, 0xc89868, 0xd08c50, 0xd08c50,
+    0xd09450, 0xc88c48, 0xd08c48, 0xc88440, 0xc88c50, 0xb88048, 0xc8b090, 0xf8e4c8, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8f8e8, 0xf8f8e8, 0xe8ccb0, 0xb09878,
+    0xb88850, 0xb88c50, 0xc88848, 0xd08848, 0xd08840, 0xd08840, 0xc08850, 0xc08850, 0xb09478, 0xe8d0b0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf0, 0xf8f8f0,
+    0xd8fcf8, 0xe0fcf8, 0xf0fcf8, 0xe8fcf0, 0xf8f8d8, 0xf0ecd0, 0xe8c898, 0xd0ac80, 0xc88850, 0xc08048, 0xc87838, 0xc87838, 0xd07430, 0xd07430, 0xd87430, 0xd87838,
+    0xc8a880, 0xe8c8a0, 0xf8e0c0, 0xf8f4d0, 0xf8fce0, 0xe8d4b8, 0xc8b498, 0xe0c8a8, 0xe0c098, 0xe0c4a0, 0xe0c098, 0xe0c098, 0xe0c4a0, 0xd8c0a0, 0xd8c4a8, 0xe0d0b0,
+    0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xd8d8b0, 0xd0c488, 0xd8c890, 0xe8c088, 0xe0bc88, 0xf0bc98, 0xf8c098,
+    0xe8c4a0, 0xe0bc98, 0xe0ccb0, 0xf8e8d0, 0xf8f8f0, 0xf8fcf0, 0xf0fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8e0c8, 0xe0c8b0, 0xe0bc98, 0xe8c098,
+    0xf0bc90, 0xf0c090, 0xe0bc90, 0xe8c098, 0xe0c8a8, 0xd0bca0, 0xd0d4c0, 0xf8fce8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf0, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8e0, 0xf0dcc0, 0xf8c488, 0xf8c890, 0xf8c478, 0xf8c880, 0xf8c888, 0xf8c888, 0xf8cc90, 0xf8c890,
+    0xf0c4a0, 0xf8d4b0, 0xf8f8e8, 0xf8f8e8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8e0, 0xf8f4e0, 0xf8d8a0, 0xf8c890, 0xf8c888, 0xf8c888, 0xf8c480, 0xf8c480,
+    0xf8c478, 0xf8c070, 0xe8c088, 0xf8cc90, 0xf8ecd0, 0xf8fce0, 0xf8fcf8, 0xf0fcf0, 0xf0fcf8, 0xf0f8f8, 0xf8fce8, 0xf8f4e0, 0xf8d4a8, 0xf0c498, 0xf8bc80, 0xf8c080,
+    0xf8c888, 0xf8c480, 0xf8cc88, 0xf8c480, 0xf8d098, 0xf8c088, 0xf8dcb8, 0xf8f8d8, 0xf8fcf8, 0xf0f8f0, 0xe8fcf8, 0xe8f8f8, 0xf8f4e8, 0xf8f8f0, 0xf8e8c8, 0xe8c8a8,
+    0xf8c890, 0xf8c890, 0xf8c480, 0xf8c880, 0xf8c478, 0xf8c478, 0xf8c888, 0xf8c488, 0xe0c8a8, 0xf8e8c8, 0xf8fcf8, 0xf8f8f0, 0xf0f8f0, 0xf8fcf8, 0xf8f8f0, 0xf8fcf0,
+    0xe0fcf8, 0xe0fcf8, 0xe8fcf0, 0xe0f8e8, 0xf8f8d8, 0xf8fce0, 0xf8f0c0, 0xf8d8a8, 0xf8c890, 0xf8c088, 0xf8b878, 0xf8b878, 0xf8b470, 0xf8b070, 0xf8b470, 0xf8b878,
+    0xf8d8b0, 0xf8f4c8, 0xf8f8d8, 0xf8f0d0, 0xf8f8e0, 0xf8f8e0, 0xf8f0d0, 0xf8f4d0, 0xf8f4d0, 0xf8f8d0, 0xf8f8c8, 0xf8f4c8, 0xf8f8d8, 0xf8f8d0, 0xf8f4d8, 0xf8f8e0,
+    0xf0fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xf8f8d8, 0xf8fcc0, 0xf8fcc8, 0xf8f4c0, 0xf8f0c0, 0xf8f0c8, 0xf8f0c8,
+    0xf8f0c8, 0xf8e8c0, 0xf8f4d8, 0xf8f8e0, 0xf8fcf0, 0xf8f8f0, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f4d8, 0xf8f4d8, 0xf8ecc8, 0xf8ecc8,
+    0xf8f0c0, 0xf8f4c8, 0xf8f4c8, 0xf8f0c8, 0xf8f8d8, 0xf8f8d8, 0xf8fce8, 0xf8fce8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8f4e8, 0xf8f0d8, 0xf8f0d8, 0xf8f0c8, 0xf8f0c8, 0xf8f4d8, 0xf8f4d8, 0xf8f8e0, 0xf8f8e0,
+    0xf8f8e8, 0xf8f4e0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8e8, 0xf8fcf0, 0xf8f4d8, 0xf8f0d0, 0xf8f8d8, 0xf8f8d8, 0xf8f4d8, 0xf8f4e0,
+    0xf8f4d0, 0xf8f4d0, 0xf8f4d8, 0xf8f4d8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf0f8f8, 0xf0fcf8, 0xf0f8f8, 0xf8fcf0, 0xf8fcf0, 0xf8f4e0, 0xf8f0e0, 0xf8f4d8, 0xf8f4d8,
+    0xf8f0d8, 0xf8f4d8, 0xf8f4d8, 0xf8ecd0, 0xf8f8d8, 0xf8ecd0, 0xf8f4e0, 0xf8f8e0, 0xf8fcf8, 0xf0f4f0, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+    0xf8f4e0, 0xf8f8e8, 0xf8f4d8, 0xf8f4d8, 0xf8f4d0, 0xf8f4d0, 0xf8f4d8, 0xf8f4d8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8,
+    0xf8fcf8, 0xf8f4f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf0f0f0, 0xf8f4e8, 0xf8fcf0, 0xf8fce8, 0xf8f8e0, 0xf8f4d8, 0xf8f8d8, 0xf8f8d0, 0xf8f4c8, 0xf8f8c8, 0xf8fcc8,
+    0xf8fcf0, 0xf8f8f0, 0xf0f4f0, 0xf0f8f8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8f8f0,
+    0xf8f8f8, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0f8f0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+    0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8fcf0, 0xf8fcf0, 0xf8f4d8, 0xf8f4d8, 0xf8f4d0, 0xf8f4d0, 0xf8f8d8, 0xf8f8d8, 0xf8f8e0, 0xf8fce0,
+    0xf8f8e8, 0xf8f0e0, 0xf8fcf8, 0xf8f8f0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8e8, 0xf8fcf0, 0xf8fce0, 0xf8f4d8, 0xf8f8d8, 0xf8f8d8, 0xf8f4e0, 0xf8fce0,
+    0xf8f0d0, 0xf8f4d0, 0xf8f8e0, 0xf8f8e0, 0xf8fcf0, 0xf8f8e8, 0xf0f8f8, 0xf0f8f8, 0xf8fcf8, 0xf0f8f8, 0xf8f8f0, 0xf8fcf0, 0xf8f4e0, 0xf8f4e0, 0xf8f4d8, 0xf8f0d8,
+    0xf8fce0, 0xf8fce0, 0xf8f8d8, 0xf8ecd0, 0xf8f8d8, 0xf8f4d8, 0xf8f8e0, 0xf8f0e0, 0xf8fcf8, 0xf0f8f0, 0xf0fcf8, 0xf8fcf8, 0xf8f8f8, 0xf0f4f0, 0xf8f8f0, 0xf8fcf0,
+    0xf8fce8, 0xf8fce8, 0xf8f8e0, 0xf8f8e0, 0xf8f4d8, 0xf8f4d8, 0xf8f8d8, 0xf8f8d8, 0xf8fcf0, 0xf8f8e8, 0xf0f4f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8,
+    0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f4e8, 0xf8f8f0, 0xf8f8e0, 0xf8f8e0, 0xf8f8d8, 0xf8fce0, 0xf8fcd0, 0xf8fcd0, 0xf8f8c8, 0xf8fcd0,
+    0xf8f4e8, 0xf8f4e8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf8, 0xf8fcf0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf0fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fce8, 0xf8fce0, 0xf8f4e0, 0xf8fce8, 0xf8fcf0, 0xf8f8e8,
+    0xf8f8f8, 0xf8f4f0, 0xf8fcf8, 0xf8fcf8, 0xf0f8f0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf0, 0xf8f8f0, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+    0xf8f8f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf0f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0,
+    0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0,
+    0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+    0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0,
+    0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8,
+    0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8,
+    0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+    0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0,
+    0xf0fcf0, 0xf0fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+    0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8,
+    0xf8f8e8, 0xf8f8e8, 0xf8f8f0, 0xf8f8f0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8,
+    0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0,
+    0xf0fce8, 0xf0fce8, 0xf0fcf0, 0xf0fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+    0xf0fcf0, 0xf0fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+    0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+    0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8,
+    0xf8f8e8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8,
+    0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8,
+    0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+    0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0,
+    0xf0fce8, 0xf0fce8, 0xf0fcf0, 0xf0fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+    0xf0fce8, 0xf0fce8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8,
+};
+
+
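+/* Note on the blend performed by memcpy_2d() below (derived from its inner
+ * loop): with a non-zero shift each destination pixel becomes
+ *     dst = (logo * CLIP_U8(dst + shift)) >> 8
+ * so shift = 256 saturates CLIP_U8() at 255 and effectively stamps the logo
+ * over the picture (dst ~= logo), while shift = 0 takes the else branch and
+ * multiplicatively blends the two: dst = (logo * dst) >> 8.
+ */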
+static __inline void  memcpy_2d(UWORD8 *pu1_dest,
+                                UWORD32 dest_stride,
+                                const UWORD8  *pu1_src,
+                                UWORD32 u4_x_pos,
+                                UWORD32 u4_y_pos,
+                                UWORD32 u4_logo_wd,
+                                UWORD32 u4_logo_ht,
+                                UWORD32 u4_logo_strd,
+                                WORD32 shift)
+{
+    UWORD32 i;
+    UWORD32 j;
+
+    pu1_dest = pu1_dest + u4_x_pos + (u4_y_pos * dest_stride);
+
+    for(i = 0; i < u4_logo_ht; i++)
+    {
+#if 1 /* !OLD_LOGO */
+        if(shift)
+        {
+            WORD32 val;
+            for(j = 0; j < u4_logo_wd; j++)
+            {
+                val = CLIP_U8(pu1_dest[j] + shift);
+                pu1_dest[j] = (pu1_src[j] * val) >> 8;
+            }
+        }
+        else
+        {
+            for(j = 0; j < u4_logo_wd; j++)
+            {
+                pu1_dest[j] = (pu1_src[j] * pu1_dest[j]) >> 8;
+            }
+        }
+
+#else
+        memcpy(pu1_dest, pu1_src, u4_logo_wd);
+#endif
+        pu1_src += u4_logo_strd;
+        pu1_dest += dest_stride;
+    }
+}
+
+void ihevcd_insert_logo(UWORD8 *pu1_buf_y,
+                        UWORD8 *pu1_buf_u,
+                        UWORD8 *pu1_buf_v,
+                        UWORD32 u4_stride,
+                        UWORD32 u4_x_pos,
+                        UWORD32 u4_y_pos,
+                        UWORD32 u4_yuv_fmt,
+                        UWORD32 u4_disp_wd,
+                        UWORD32 u4_disp_ht)
+{
+
+    UWORD32 u4_logo_wd_y, u4_logo_wd_uv, u4_logo_ht_y, u4_logo_ht_uv;
+    UWORD32 u4_logo_strd_y, u4_logo_strd_uv;
+    UWORD32 u4_stride_y, u4_stride_uv;
+    const UWORD8 *pu1_buf_logo_y, *pu1_buf_logo_u, *pu1_buf_logo_v;
+    UWORD32 u4_x_pos_y, u4_x_pos_uv, u4_y_pos_y, u4_y_pos_uv;
+    WORD32 num_comp = 0;
+    WORD32 shift_y, shift_uv;
+    if((WORD32)u4_x_pos < 0)
+        u4_x_pos = 0;
+
+    if((WORD32)u4_y_pos < 0)
+        u4_y_pos = 0;
+    /*Use the following to blend the logo*/
+    //shift_y = 0;
+    //shift_uv = 128;
+
+    /* These values will do complete fill */
+    shift_y = 256;
+    shift_uv = 256;
+    switch(u4_yuv_fmt)
+    {
+        case IV_YUV_444P:
+            num_comp = 3;
+            u4_logo_wd_y  = LOGO_WD_Y;
+            u4_logo_wd_uv = LOGO_WD_444_UV;
+            u4_logo_ht_y  = LOGO_HT_Y;
+            u4_logo_ht_uv = LOGO_HT_444_UV;
+            u4_logo_strd_y = LOGO_WD_Y;
+            u4_logo_strd_uv = LOGO_WD_444_UV;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos;
+
+
+            pu1_buf_logo_y = gau1_ihevcd_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+            pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride;
+
+            break;
+        case IV_YUV_420P:
+            num_comp = 3;
+            u4_logo_wd_y  = LOGO_WD_Y;
+            u4_logo_wd_uv = LOGO_WD_420_UV;
+            u4_logo_ht_y  = LOGO_HT_Y;
+            u4_logo_ht_uv = LOGO_HT_420_UV;
+            u4_logo_strd_y = LOGO_WD_Y;
+            u4_logo_strd_uv = LOGO_WD_420_UV;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos >> 1;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos >> 1;
+
+            pu1_buf_logo_y = gau1_ihevcd_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_logo_420p_u;
+            pu1_buf_logo_v = gau1_ihevcd_logo_420p_v;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride >> 1;
+            break;
+
+        case IV_YUV_422P:
+            num_comp = 3;
+            u4_logo_wd_y  = LOGO_WD_Y;
+            u4_logo_wd_uv = LOGO_WD_422_UV;
+            u4_logo_ht_y  = LOGO_HT_Y;
+            u4_logo_ht_uv = LOGO_HT_422_UV;
+            u4_logo_strd_y = LOGO_WD_Y;
+            u4_logo_strd_uv = LOGO_WD_422_UV;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos >> 1;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos;
+
+            pu1_buf_logo_y = gau1_ihevcd_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+            pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride >> 1;
+
+
+            break;
+
+        case IV_YUV_411P:
+            num_comp = 3;
+            u4_logo_wd_y  = LOGO_WD_Y;
+            u4_logo_wd_uv = LOGO_WD_411_UV;
+            u4_logo_ht_y  = LOGO_HT_Y;
+            u4_logo_ht_uv = LOGO_HT_411_UV;
+            u4_logo_strd_y = LOGO_WD_Y;
+            u4_logo_strd_uv = LOGO_WD_411_UV;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos >> 2;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos;
+
+            pu1_buf_logo_y = gau1_ihevcd_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+            pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride >> 2;
+
+            break;
+        case IV_RGB_565:
+            num_comp = 1;
+            u4_logo_wd_y  = LOGO_WD_RGB565 * 2;
+            u4_logo_wd_uv = 0;
+            u4_logo_ht_y  = LOGO_HT_RGB565;
+            u4_logo_ht_uv = 0;
+            u4_logo_strd_y = LOGO_WD_RGB565 * 2;
+            u4_logo_strd_uv = 0;
+
+            u4_x_pos_y   = u4_x_pos * 2;
+            u4_x_pos_uv  = 0;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = 0;
+
+            pu1_buf_logo_y = (UWORD8 *)gau2_ihevcd_logo_rgb565;
+            pu1_buf_logo_u = NULL;
+            pu1_buf_logo_v = NULL;
+
+            u4_stride_y    = u4_stride * 2;
+            u4_stride_uv  = 0;
+            shift_y = 256;
+            shift_uv = 256;
+
+            break;
+        case IV_RGBA_8888:
+            num_comp = 1;
+            u4_logo_wd_y  = LOGO_WD_RGBA8888 * 4;
+            u4_logo_wd_uv = 0;
+            u4_logo_ht_y  = LOGO_HT_RGBA8888;
+            u4_logo_ht_uv = 0;
+            u4_logo_strd_y = LOGO_WD_RGBA8888 * 4;
+            u4_logo_strd_uv = 0;
+
+            u4_x_pos_y   = u4_x_pos * 4;
+            u4_x_pos_uv  = 0;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = 0;
+
+            pu1_buf_logo_y = (UWORD8 *)gau4_ihevcd_logo_rgb8888;
+            pu1_buf_logo_u = NULL;
+            pu1_buf_logo_v = NULL;
+
+            u4_stride_y    = u4_stride * 4;
+            u4_stride_uv  = 0;
+            shift_y = 256;
+            shift_uv = 256;
+
+            break;
+        case IV_YUV_420SP_UV:
+
+            num_comp = 2;
+            u4_logo_wd_y  = LOGO_WD_Y;
+            u4_logo_wd_uv = LOGO_WD_420SP_UV;
+            u4_logo_ht_y  = LOGO_HT_Y;
+            u4_logo_ht_uv = LOGO_HT_420SP_UV;
+            u4_logo_strd_y = LOGO_WD_Y;
+            u4_logo_strd_uv = LOGO_WD_420SP_UV;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos >> 1;
+
+            pu1_buf_logo_y = gau1_ihevcd_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_logo_420sp_uv;
+            pu1_buf_logo_v = gau1_ihevcd_logo_420sp_uv;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride;
+            break;
+
+        case IV_YUV_420SP_VU:
+        default:
+            num_comp = 2;
+            u4_logo_wd_y  = LOGO_WD_Y;
+            u4_logo_wd_uv = LOGO_WD_420SP_VU;
+            u4_logo_ht_y  = LOGO_HT_Y;
+            u4_logo_ht_uv = LOGO_HT_420SP_VU;
+            u4_logo_strd_y = LOGO_WD_Y;
+            u4_logo_strd_uv = LOGO_WD_420SP_VU;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos >> 1;
+
+            pu1_buf_logo_y = gau1_ihevcd_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_logo_420sp_vu;
+            pu1_buf_logo_v = gau1_ihevcd_logo_420sp_vu;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride;
+            break;
+
+
+    }
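+    /* num_comp selects how many planes are blitted: 1 for packed RGB
+     * formats, 2 for semi-planar YUV (interleaved UV plane) and 3 for
+     * fully planar YUV, as set up in the switch above. */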
+    u4_logo_wd_y = MIN(u4_logo_wd_y, u4_disp_wd - u4_x_pos_y);
+    u4_logo_ht_y = MIN(u4_logo_ht_y, u4_disp_ht - u4_y_pos_y);
+    u4_logo_wd_uv = MIN(u4_logo_wd_uv, (u4_disp_wd >> 1) - u4_x_pos_uv);
+    u4_logo_ht_uv = MIN(u4_logo_ht_uv, (u4_disp_ht >> 1) - u4_y_pos_uv);
+    memcpy_2d(pu1_buf_y, u4_stride_y, pu1_buf_logo_y, u4_x_pos_y, u4_y_pos_y, u4_logo_wd_y, u4_logo_ht_y, u4_logo_strd_y, shift_y);
+    if(num_comp > 1)
+        memcpy_2d(pu1_buf_u, u4_stride_uv, pu1_buf_logo_u, u4_x_pos_uv, u4_y_pos_uv, u4_logo_wd_uv, u4_logo_ht_uv, u4_logo_strd_uv, shift_uv);
+    if(num_comp > 2)
+        memcpy_2d(pu1_buf_v, u4_stride_uv, pu1_buf_logo_v, u4_x_pos_uv, u4_y_pos_uv, u4_logo_wd_uv, u4_logo_ht_uv, u4_logo_strd_uv, shift_uv);
+
+#if CODEC_LOGO
+    u4_y_pos = u4_y_pos +  u4_logo_ht_y;
+    /*Use the following to blend the logo*/
+    //shift_y = 0;
+    //shift_uv = 128;
+    shift_y = 256;
+    shift_uv = 256;
+
+
+    switch(u4_yuv_fmt)
+    {
+        case IV_YUV_444P:
+            num_comp = 3;
+            u4_logo_wd_y  = LOGO_CODEC_WD_Y;
+            u4_logo_wd_uv = LOGO_CODEC_WD_444_UV;
+            u4_logo_ht_y  = LOGO_CODEC_HT_Y;
+            u4_logo_ht_uv = LOGO_CODEC_HT_444_UV;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos;
+
+
+            pu1_buf_logo_y = gau1_ihevcd_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+            pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride;
+
+            break;
+        case IV_YUV_420P:
+            num_comp = 3;
+            u4_logo_wd_y  = LOGO_CODEC_WD_Y;
+            u4_logo_wd_uv = LOGO_CODEC_WD_420_UV;
+            u4_logo_ht_y  = LOGO_CODEC_HT_Y;
+            u4_logo_ht_uv = LOGO_CODEC_HT_420_UV;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos >> 1;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos >> 1;
+
+            pu1_buf_logo_y = gau1_ihevcd_codec_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_codec_logo_420p_u;
+            pu1_buf_logo_v = gau1_ihevcd_codec_logo_420p_v;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride >> 1;
+            break;
+
+        case IV_YUV_422P:
+            num_comp = 3;
+            u4_logo_wd_y  = LOGO_CODEC_WD_Y;
+            u4_logo_wd_uv = LOGO_CODEC_WD_422_UV;
+            u4_logo_ht_y  = LOGO_CODEC_HT_Y;
+            u4_logo_ht_uv = LOGO_CODEC_HT_422_UV;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos >> 1;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos;
+
+            pu1_buf_logo_y = gau1_ihevcd_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+            pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride >> 1;
+
+
+            break;
+
+        case IV_YUV_411P:
+            num_comp = 3;
+            u4_logo_wd_y  = LOGO_CODEC_WD_Y;
+            u4_logo_wd_uv = LOGO_CODEC_WD_411_UV;
+            u4_logo_ht_y  = LOGO_CODEC_HT_Y;
+            u4_logo_ht_uv = LOGO_CODEC_HT_411_UV;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos >> 2;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos;
+
+            pu1_buf_logo_y = gau1_ihevcd_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+            pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride >> 2;
+
+            break;
+        case IV_RGB_565:
+            num_comp = 1;
+            u4_logo_wd_y  = LOGO_CODEC_WD_Y * 2;
+            u4_logo_wd_uv = 0;
+            u4_logo_ht_y  = LOGO_CODEC_HT_Y;
+            u4_logo_ht_uv = 0;
+
+            u4_x_pos_y   = u4_x_pos * 2;
+            u4_x_pos_uv  = 0;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = 0;
+
+            pu1_buf_logo_y = (UWORD8 *)gau2_ihevcd_logo_rgb565;
+            pu1_buf_logo_u = NULL;
+            pu1_buf_logo_v = NULL;
+
+            u4_stride_y    = u4_stride * 2;
+            u4_stride_uv  = 0;
+            shift_y = 256;
+            shift_uv = 256;
+
+            break;
+        case IV_RGBA_8888:
+            num_comp = 1;
+            u4_logo_wd_y  = LOGO_CODEC_WD_Y * 4;
+            u4_logo_wd_uv = 0;
+            u4_logo_ht_y  = LOGO_CODEC_HT_Y;
+            u4_logo_ht_uv = 0;
+
+            u4_x_pos_y   = (u4_x_pos + LOGO_CODEC_WD_Y) * 4;
+            u4_x_pos_uv  = 0;
+            u4_y_pos_y   = (u4_y_pos - LOGO_CODEC_HT_Y);
+            u4_y_pos_uv  = 0;
+
+            pu1_buf_logo_y = (UWORD8 *)gau4_ihevcd_logo_rgb8888;
+            pu1_buf_logo_u = NULL;
+            pu1_buf_logo_v = NULL;
+
+            u4_stride_y    = u4_stride * 4;
+            u4_stride_uv  = 0;
+            shift_y = 256;
+            shift_uv = 256;
+
+            break;
+        case IV_YUV_420SP_UV:
+
+            num_comp = 2;
+            u4_logo_wd_y  = LOGO_CODEC_WD_Y;
+            u4_logo_wd_uv = LOGO_CODEC_WD_420SP_UV;
+            u4_logo_ht_y  = LOGO_CODEC_HT_Y;
+            u4_logo_ht_uv = LOGO_CODEC_HT_420SP_UV;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos >> 1;
+
+            pu1_buf_logo_y = gau1_ihevcd_codec_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_codec_logo_420sp_uv;
+            pu1_buf_logo_v = gau1_ihevcd_codec_logo_420sp_uv;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride;
+            break;
+
+        case IV_YUV_420SP_VU:
+
+            num_comp = 2;
+            u4_logo_wd_y  = LOGO_CODEC_WD_Y;
+            u4_logo_wd_uv = LOGO_CODEC_WD_420SP_VU;
+            u4_logo_ht_y  = LOGO_CODEC_HT_Y;
+            u4_logo_ht_uv = LOGO_CODEC_HT_420SP_VU;
+
+            u4_x_pos_y   = u4_x_pos;
+            u4_x_pos_uv  = u4_x_pos;
+            u4_y_pos_y   = u4_y_pos;
+            u4_y_pos_uv  = u4_y_pos >> 1;
+
+            pu1_buf_logo_y = gau1_ihevcd_codec_logo_y;
+            pu1_buf_logo_u = gau1_ihevcd_codec_logo_420sp_vu;
+            pu1_buf_logo_v = gau1_ihevcd_codec_logo_420sp_vu;
+
+            u4_stride_y    = u4_stride;
+            u4_stride_uv  = u4_stride;
+            break;
+        default:
+            break;
+    }
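+    /* NOTE: unlike the switch for the main logo above, this switch does not
+     * set u4_logo_strd_y/u4_logo_strd_uv, so the calls below reuse the
+     * stride values computed earlier; formats whose codec-logo buffers have
+     * a different row stride would need per-case assignments here. */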
+
+    memcpy_2d(pu1_buf_y, u4_stride_y, pu1_buf_logo_y, u4_x_pos_y, u4_y_pos_y, u4_logo_wd_y, u4_logo_ht_y, u4_logo_strd_y, shift_y);
+    if(num_comp > 1)
+        memcpy_2d(pu1_buf_u, u4_stride_uv, pu1_buf_logo_u, u4_x_pos_uv, u4_y_pos_uv, u4_logo_wd_uv, u4_logo_ht_uv, u4_logo_strd_uv, shift_uv);
+    if(num_comp > 2)
+        memcpy_2d(pu1_buf_v, u4_stride_uv, pu1_buf_logo_v, u4_x_pos_uv, u4_y_pos_uv, u4_logo_wd_uv, u4_logo_ht_uv, u4_logo_strd_uv, shift_uv);
+#endif
+
+}
+#endif
diff --git a/decoder/ihevcd_ittiam_logo.h b/decoder/ihevcd_ittiam_logo.h
new file mode 100644
index 0000000..71540e3
--- /dev/null
+++ b/decoder/ihevcd_ittiam_logo.h
@@ -0,0 +1,128 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/*****************************************************************************/
+/*                                                                           */
+/*  File Name         : ihevcd_ittiam_logo.h                                 */
+/*                                                                           */
+/*  Description       : This file contains the function headers needed to    */
+/*                      insert the Ittiam logo into a YUV buffer.            */
+/*                                                                           */
+/*  List of Functions : None                                                 */
+/*                                                                           */
+/*  Issues / Problems : None                                                 */
+/*                                                                           */
+/*  Revision History  :                                                      */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
+/*         10 10 2005   Ittiam          Draft                                */
+/*                                                                           */
+/*****************************************************************************/
+
+#ifndef LOGO_INSERT_H
+#define LOGO_INSERT_H
+
+//#define LOGO_EN
+
+#define LOGO_WD 90
+#define LOGO_HT 36
+
+#define LOGO_WD_Y       LOGO_WD
+#define LOGO_HT_Y       LOGO_HT
+
+#define LOGO_WD_RGBA8888       160
+#define LOGO_HT_RGBA8888       64
+
+#define LOGO_WD_RGB565       160
+#define LOGO_HT_RGB565       64
+
+#define LOGO_WD_444_UV  LOGO_WD
+#define LOGO_HT_444_UV  LOGO_HT
+
+
+#define LOGO_WD_420_UV  (LOGO_WD >> 1)
+#define LOGO_HT_420_UV  (LOGO_HT >> 1)
+
+#define LOGO_WD_420SP_UV  (LOGO_WD)
+#define LOGO_HT_420SP_UV  (LOGO_HT >> 1)
+
+#define LOGO_WD_420SP_VU  (LOGO_WD)
+#define LOGO_HT_420SP_VU  (LOGO_HT >> 1)
+
+#define LOGO_WD_422_UV  (LOGO_WD >> 1)
+#define LOGO_HT_422_UV  (LOGO_HT)
+
+#define LOGO_WD_422V_UV  (LOGO_WD)
+#define LOGO_HT_422V_UV  (LOGO_HT >> 1)
+
+#define LOGO_WD_411_UV  (LOGO_WD >> 2)
+#define LOGO_HT_411_UV  (LOGO_HT)
+
+#define LOGO_CODEC_WD 80
+#define LOGO_CODEC_HT  24
+
+#define LOGO_CODEC_WD_Y       LOGO_CODEC_WD
+#define LOGO_CODEC_HT_Y       LOGO_CODEC_HT
+
+
+#define LOGO_CODEC_WD_444_UV  LOGO_CODEC_WD
+#define LOGO_CODEC_HT_444_UV  LOGO_CODEC_HT
+
+
+#define LOGO_CODEC_WD_420_UV  (LOGO_CODEC_WD >> 1)
+#define LOGO_CODEC_HT_420_UV  (LOGO_CODEC_HT >> 1)
+
+#define LOGO_CODEC_WD_420SP_UV  (LOGO_CODEC_WD)
+#define LOGO_CODEC_HT_420SP_UV  (LOGO_CODEC_HT >> 1)
+
+#define LOGO_CODEC_WD_420SP_VU  (LOGO_CODEC_WD)
+#define LOGO_CODEC_HT_420SP_VU  (LOGO_CODEC_HT >> 1)
+
+#define LOGO_CODEC_WD_422_UV  (LOGO_CODEC_WD >> 1)
+#define LOGO_CODEC_HT_422_UV  (LOGO_CODEC_HT)
+
+#define LOGO_CODEC_WD_422V_UV  (LOGO_CODEC_WD)
+#define LOGO_CODEC_HT_422V_UV  (LOGO_CODEC_HT >> 1)
+
+#define LOGO_CODEC_WD_411_UV  (LOGO_CODEC_WD >> 2)
+#define LOGO_CODEC_HT_411_UV  (LOGO_CODEC_HT)
+
+
+
+
+#define START_X_ITT_LOGO        0
+#define START_Y_ITT_LOGO        0
+
+#define WD_ITT_LOGO             128
+#define HT_ITT_LOGO             60
+
+void ihevcd_insert_logo(UWORD8 *buf_y, UWORD8 *buf_u, UWORD8 *buf_v,
+                        UWORD32 stride,
+                        UWORD32 x_pos,
+                        UWORD32 y_pos,
+                        UWORD32 yuv_fmt,
+                        UWORD32 u4_disp_wd,
+                        UWORD32 u4_disp_ht);
+
+#ifdef LOGO_EN
+#define INSERT_LOGO(buf_y, buf_u, buf_v, stride, x_pos, y_pos, yuv_fmt,disp_wd,disp_ht) ihevcd_insert_logo(buf_y, buf_u, buf_v, stride, x_pos, y_pos, yuv_fmt,disp_wd,disp_ht);
+#else
+#define INSERT_LOGO(buf_y, buf_u, buf_v, stride, x_pos, y_pos, yuv_fmt,disp_wd,disp_ht)
+#endif
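+/* Usage sketch (illustrative, hypothetical names): to stamp the logo at the
+ * bottom-right of a 420SP frame with luma plane pu1_y, chroma plane pu1_uv,
+ * stride u4_wd and dimensions u4_wd x u4_ht:
+ *
+ *     INSERT_LOGO(pu1_y, pu1_uv, NULL, u4_wd, u4_wd - LOGO_WD,
+ *                 u4_ht - LOGO_HT, IV_YUV_420SP_UV, u4_wd, u4_ht);
+ *
+ * This expands to an ihevcd_insert_logo() call only when LOGO_EN is defined
+ * and to nothing otherwise. */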
+
+#endif /* LOGO_INSERT_H */
+
diff --git a/decoder/ihevcd_job_queue.c b/decoder/ihevcd_job_queue.c
new file mode 100644
index 0000000..e926f94
--- /dev/null
+++ b/decoder/ihevcd_job_queue.c
@@ -0,0 +1,593 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_job_queue.c
+*
+* @brief
+*  Contains functions for job queue
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#ifdef GPU_CIRCULAR_QUEUE
+/* usleep() is used in ihevcd_jobq_yield() when GPU_CIRCULAR_QUEUE is set */
+#include <unistd.h>
+#endif
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_macros.h"
+#include "ihevcd_error.h"
+#include "ihevcd_job_queue.h"
+
+/**
+*******************************************************************************
+*
+* @brief Returns size for job queue context. Does not include job queue buffer
+* requirements
+*
+* @par   Description
+* Returns size for job queue context. Does not include job queue buffer
+* requirements. Buffer size required to store the jobs should be allocated in
+* addition to the value returned here.
+*
+* @returns Size of the job queue context
+*
+* @remarks
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_jobq_ctxt_size()
+{
+    WORD32 size;
+    size = sizeof(jobq_t);
+    size += ithread_get_mutex_lock_size();
+    return size;
+}
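+
+/* Usage sketch (illustrative only, compiled out): the caller allocates one
+ * contiguous block sized ihevcd_jobq_ctxt_size() plus room for the jobs and
+ * hands it to ihevcd_jobq_init(). num_jobs and job_size are hypothetical
+ * values chosen by the caller for its worst case. */
+#if 0
+static jobq_t* example_jobq_create(WORD32 num_jobs, WORD32 job_size)
+{
+    WORD32 total_size = ihevcd_jobq_ctxt_size() + (num_jobs * job_size);
+    void *pv_buf = malloc(total_size);
+    if(NULL == pv_buf)
+        return NULL;
+    /* ihevcd_jobq_init() carves the context and mutex out of pv_buf and
+     * uses the remainder as the job buffer */
+    return (jobq_t *)ihevcd_jobq_init(pv_buf, total_size);
+}
+#endif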
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Locks the jobq context
+*
+* @par   Description
+*   Locks the jobq context by calling ithread_mutex_lock()
+*
+* @param[in] ps_jobq
+*   Job Queue context
+*
+* @returns IHEVCD_FAIL if mutex lock fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_lock(jobq_t *ps_jobq)
+{
+    WORD32 retval;
+    retval = ithread_mutex_lock(ps_jobq->pv_mutex);
+    if(retval)
+    {
+        return (IHEVCD_ERROR_T)IHEVCD_FAIL;
+    }
+    return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Unlocks the jobq context
+*
+* @par   Description
+*   Unlocks the jobq context by calling ithread_mutex_unlock()
+*
+* @param[in] ps_jobq
+*   Job Queue context
+*
+* @returns IHEVCD_FAIL if mutex unlock fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+IHEVCD_ERROR_T ihevcd_jobq_unlock(jobq_t *ps_jobq)
+{
+    WORD32 retval;
+    retval = ithread_mutex_unlock(ps_jobq->pv_mutex);
+    if(retval)
+    {
+        return (IHEVCD_ERROR_T)IHEVCD_FAIL;
+    }
+    return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+}
+/**
+*******************************************************************************
+*
+* @brief
+*   Yields the thread
+*
+* @par   Description
+*   Unlocks the jobq context by calling ihevcd_jobq_unlock(), calls
+* ithread_yield() and then locks it again via ihevcd_jobq_lock(). The jobq is
+* unlocked first to ensure that it can be accessed by other threads; if the
+* unlock were skipped before yielding, no other thread could access the jobq
+* functions and update the jobq.
+*
+* @param[in] ps_jobq
+*   Job Queue context
+*
+* @returns IHEVCD_FAIL if mutex lock unlock or yield fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_yield(jobq_t *ps_jobq)
+{
+
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+    IHEVCD_ERROR_T rettmp;
+    rettmp = ihevcd_jobq_unlock(ps_jobq);
+    RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+
+#ifdef GPU_CIRCULAR_QUEUE
+    usleep(1000);
+#else
+    //NOP(1024 * 8);
+    ithread_yield();
+#endif
+
+    rettmp = ihevcd_jobq_lock(ps_jobq);
+    RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+    return ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief Frees the job queue context
+*
+* @par   Description
+* Destroys the mutex associated with the jobq context
+*
+* @param[in] ps_jobq
+* Job queue context
+*
+* @returns IHEVCD_FAIL if mutex destroy fails else IHEVCD_SUCCESS
+*
+* @remarks
+* Since it will be called only once by master thread this is not thread safe.
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_free(jobq_t *ps_jobq)
+{
+    WORD32 ret;
+    ret = ithread_mutex_destroy(ps_jobq->pv_mutex);
+
+    if(0 == ret)
+        return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    else
+        return (IHEVCD_ERROR_T)IHEVCD_FAIL;
+}
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the job queue
+*
+* @par   Description
+* Initializes the jobq context and sets write and read pointers to start of
+* job queue buffer
+*
+* @param[in] pv_buf
+* Memory for job queue buffer and job queue context
+*
+* @param[in] buf_size
+* Size of the total memory allocated
+*
+* @returns Pointer to job queue context
+*
+* @remarks
+* Since it will be called only once by master thread this is not thread safe.
+*
+*******************************************************************************
+*/
+void* ihevcd_jobq_init(void *pv_buf, WORD32 buf_size)
+{
+    jobq_t *ps_jobq;
+    UWORD8 *pu1_buf;
+    pu1_buf = (UWORD8 *)pv_buf;
+
+    ps_jobq = (jobq_t *)pu1_buf;
+    pu1_buf += sizeof(jobq_t);
+    buf_size -= sizeof(jobq_t);
+
+    ps_jobq->pv_mutex = pu1_buf;
+    pu1_buf += ithread_get_mutex_lock_size();
+    buf_size -= ithread_get_mutex_lock_size();
+
+    if(buf_size <= 0)
+        return NULL;
+
+    ithread_mutex_init(ps_jobq->pv_mutex);
+
+    ps_jobq->pv_buf_base = pu1_buf;
+    ps_jobq->pv_buf_wr = pu1_buf;
+    ps_jobq->pv_buf_rd = pu1_buf;
+    ps_jobq->pv_buf_end = pu1_buf + buf_size;
+    ps_jobq->i4_terminate = 0;
+#ifdef GPU_CIRCULAR_QUEUE
+    ps_jobq->i4_wrapped_around = 0;
+#endif
+
+
+    return ps_jobq;
+}
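+
+/* Resulting layout of the single allocation passed in (illustrative):
+ *
+ *   pv_buf:  [ jobq_t | mutex (ithread_get_mutex_lock_size()) | job buffer ]
+ *                                                 ^pv_buf_base   pv_buf_end^
+ *
+ * pv_buf_wr and pv_buf_rd both start at pv_buf_base and advance by job_size
+ * on each queue/dequeue respectively. */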
+/**
+*******************************************************************************
+*
+* @brief
+*   Resets the jobq context
+*
+* @par   Description
+*   Resets the jobq context by initializing the job queue context elements
+*
+* @param[in] ps_jobq
+*   Job Queue context
+*
+* @returns IHEVCD_FAIL if lock unlock fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_reset(jobq_t *ps_jobq)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    ret = ihevcd_jobq_lock(ps_jobq);
+    RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+    ps_jobq->pv_buf_wr      = ps_jobq->pv_buf_base;
+    ps_jobq->pv_buf_rd      = ps_jobq->pv_buf_base;
+    ps_jobq->i4_terminate   = 0;
+#ifdef GPU_CIRCULAR_QUEUE
+    ps_jobq->i4_wrapped_around = 0;
+#endif
+    ret = ihevcd_jobq_unlock(ps_jobq);
+    RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+    return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Deinitializes the jobq context
+*
+* @par   Description
+*   Deinitializes the jobq context by calling ihevcd_jobq_reset()
+* and then destroying the mutex created
+*
+* @param[in] ps_jobq
+*   Job Queue context
+*
+* @returns IHEVCD_FAIL if lock unlock fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_deinit(jobq_t *ps_jobq)
+{
+    WORD32 retval;
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+    ret = ihevcd_jobq_reset(ps_jobq);
+    RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+    retval = ithread_mutex_destroy(ps_jobq->pv_mutex);
+    if(retval)
+    {
+        return (IHEVCD_ERROR_T)IHEVCD_FAIL;
+    }
+
+    return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*   Terminates the jobq
+*
+* @par   Description
+*   Terminates the jobq by setting a flag in context.
+*
+* @param[in] ps_jobq
+*   Job Queue context
+*
+* @returns IHEVCD_FAIL if lock unlock fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+IHEVCD_ERROR_T ihevcd_jobq_terminate(jobq_t *ps_jobq)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    ret = ihevcd_jobq_lock(ps_jobq);
+    RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+    ps_jobq->i4_terminate = 1;
+
+    ret = ihevcd_jobq_unlock(ps_jobq);
+    RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+    return ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief Adds a job to the queue
+*
+* @par   Description
+* Adds a job to the queue and updates the wr address to the next location.
+* Format/content of the job structure is abstracted and hence the size of the
+* job buffer is passed.
+*
+* @param[in] ps_jobq
+*   Job Queue context
+*
+* @param[in] pv_job
+*   Pointer to the location that contains details of the job to be added
+*
+* @param[in] job_size
+*   Size of the job buffer
+*
+* @param[in] blocking
+*   To signal if the write is blocking or non-blocking.
+*
+* @returns IHEVCD_FAIL if the job buffer is full or lock/unlock fails, else IHEVCD_SUCCESS
+*
+* @remarks
+* The job queue buffer is assumed to be allocated to handle the worst case
+* number of jobs; wrap around is not supported.
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_queue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    IHEVCD_ERROR_T rettmp;
+    UWORD8 *pu1_buf;
+    UNUSED(blocking);
+
+    rettmp = ihevcd_jobq_lock(ps_jobq);
+    RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+
+    pu1_buf = (UWORD8 *)ps_jobq->pv_buf_wr;
+#ifdef GPU_CIRCULAR_QUEUE
+    if((UWORD8 *)ps_jobq->pv_buf_end > (pu1_buf + job_size))
+    {
+        memcpy(ps_jobq->pv_buf_wr, pv_job, job_size);
+        ps_jobq->pv_buf_wr = (UWORD8 *)ps_jobq->pv_buf_wr + job_size;
+
+    }
+    else
+    {
+        /* Handle wrap around case */
+        /* Wait for pv_buf_rd to consume first job_size number of bytes
+         * from the beginning of job queue
+         */
+        //ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+        ps_jobq->pv_buf_wr = ps_jobq->pv_buf_base;
+        memcpy(ps_jobq->pv_buf_wr, pv_job, job_size);
+        ps_jobq->pv_buf_wr = (UWORD8 *)ps_jobq->pv_buf_wr + job_size;
+        //printf("Queue wrapped around\n");
+        ps_jobq->i4_wrapped_around = 1;
+    }
+#else
+    if((UWORD8 *)ps_jobq->pv_buf_end >= (pu1_buf + job_size))
+    {
+        memcpy(ps_jobq->pv_buf_wr, pv_job, job_size);
+        ps_jobq->pv_buf_wr = (UWORD8 *)ps_jobq->pv_buf_wr + job_size;
+        ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    }
+    else
+    {
+        /* Handle wrap around case */
+        /* Wait for pv_buf_rd to consume first job_size number of bytes
+         * from the beginning of job queue
+         */
+        ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+    }
+#endif
+
+    ps_jobq->i4_terminate = 0;
+
+    rettmp = ihevcd_jobq_unlock(ps_jobq);
+    RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+
+    return ret;
+}
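+
+/* Producer-side sketch (illustrative only, compiled out): example_job_t is a
+ * hypothetical payload, not the decoder's actual job structure. The queue
+ * copies the job by value, so the local can be reused after the call. */
+#if 0
+typedef struct
+{
+    WORD32 i4_ctb_x;
+    WORD32 i4_ctb_y;
+}example_job_t;
+
+static IHEVCD_ERROR_T example_enqueue(jobq_t *ps_jobq, WORD32 ctb_x, WORD32 ctb_y)
+{
+    example_job_t s_job;
+    s_job.i4_ctb_x = ctb_x;
+    s_job.i4_ctb_y = ctb_y;
+    /* the blocking argument is currently unused on the queue side */
+    return ihevcd_jobq_queue(ps_jobq, &s_job, sizeof(s_job), 1);
+}
+#endif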
+/**
+*******************************************************************************
+*
+* @brief Gets the next job from the job queue
+*
+* @par   Description
+* Gets the next job from the job queue and updates the rd address to the next
+* location. Format/content of the job structure is abstracted and hence the
+* size of the job buffer is passed. If it is a blocking call and there is no
+* new job, this function unlocks the mutex, yields, locks it back and
+* continues till a job is available or terminate is set.
+*
+* @param[in] ps_jobq
+*   Job Queue context
+*
+* @param[out] pv_job
+*   Pointer to the location where the details of the dequeued job are written
+*
+* @param[in] job_size
+*   Size of the job buffer
+*
+* @param[in] blocking
+*   To signal if the read is blocking or non-blocking.
+*
+* @returns IHEVCD_FAIL if no job is available (non-blocking) or terminate is set, else IHEVCD_SUCCESS
+*
+* @remarks
+* The job queue buffer is assumed to be allocated to handle the worst case
+* number of jobs; wrap around is not supported.
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_dequeue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking)
+{
+    IHEVCD_ERROR_T ret;
+    IHEVCD_ERROR_T rettmp;
+    volatile UWORD8 *pu1_buf;
+
+    rettmp = ihevcd_jobq_lock(ps_jobq);
+    RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+#ifdef GPU_CIRCULAR_QUEUE
+    if(((UWORD8 *)ps_jobq->pv_buf_end <= (ps_jobq->pv_buf_rd + job_size)) &&
+                    (ps_jobq->i4_wrapped_around == 1))
+    {
+        ps_jobq->pv_buf_rd = ps_jobq->pv_buf_base;
+        ps_jobq->i4_wrapped_around = 0;
+        //printf("DeQueue wrapped around\n");
+    }
+
+    pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd;
+
+    while(1)
+    {
+        pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd;
+        if(((UWORD8 *)ps_jobq->pv_buf_wr >= (pu1_buf + job_size)) ||
+                        (ps_jobq->i4_wrapped_around == 1))
+        {
+            memcpy(pv_job, ps_jobq->pv_buf_rd, job_size);
+            ps_jobq->pv_buf_rd = (UWORD8 *)ps_jobq->pv_buf_rd + job_size;
+            ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+            break;
+        }
+        else
+        {
+            /* If all the entries have been dequeued, then break and return */
+            if(1 == ps_jobq->i4_terminate)
+            {
+                ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+                break;
+            }
+
+            if(1 == blocking)
+            {
+                ihevcd_jobq_yield(ps_jobq);
+            }
+            else
+            {
+                /* If there is no job available,
+                 * and this is non blocking call then return fail */
+                ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+                break;
+            }
+        }
+    }
+#else
+    pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd;
+
+
+    if((UWORD8 *)ps_jobq->pv_buf_end >= (pu1_buf + job_size))
+    {
+        while(1)
+        {
+            pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd;
+            if((UWORD8 *)ps_jobq->pv_buf_wr >= (pu1_buf + job_size))
+            {
+                memcpy(pv_job, ps_jobq->pv_buf_rd, job_size);
+                ps_jobq->pv_buf_rd = (UWORD8 *)ps_jobq->pv_buf_rd + job_size;
+                ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+                break;
+            }
+            else
+            {
+                /* If all the entries have been dequeued, then break and return */
+                if(1 == ps_jobq->i4_terminate)
+                {
+                    ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+                    break;
+                }
+
+                if(1 == blocking)
+                {
+                    ihevcd_jobq_yield(ps_jobq);
+
+                }
+                else
+                {
+                    /* If there is no job available,
+                     * and this is a non blocking call then return fail */
+                    ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+                    /* break out of the polling loop; without this break the
+                     * non blocking call would spin forever while holding the
+                     * jobq lock (the GPU path above breaks here as well) */
+                    break;
+                }
+            }
+        }
+    }
+    else
+    {
+        /* Wrap around is not supported: if reading job_size bytes would
+         * cross the end of the job queue buffer, return fail
+         */
+        ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+    }
+#endif
+    rettmp = ihevcd_jobq_unlock(ps_jobq);
+    RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+
+    return ret;
+}
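+
+/* Consumer-side sketch (illustrative only, compiled out), reusing the
+ * hypothetical example_job_t from the enqueue sketch: a blocking dequeue
+ * yields until a job arrives and returns IHEVCD_FAIL once all jobs are
+ * consumed and terminate is set, which ends the worker loop. */
+#if 0
+static void example_worker(jobq_t *ps_jobq)
+{
+    example_job_t s_job;
+    while((IHEVCD_ERROR_T)IHEVCD_SUCCESS ==
+          ihevcd_jobq_dequeue(ps_jobq, &s_job, sizeof(s_job), 1))
+    {
+        /* process s_job.i4_ctb_x / s_job.i4_ctb_y here */
+    }
+}
+#endif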
diff --git a/decoder/ihevcd_job_queue.h b/decoder/ihevcd_job_queue.h
new file mode 100644
index 0000000..190ca83
--- /dev/null
+++ b/decoder/ihevcd_job_queue.h
@@ -0,0 +1,74 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_job_queue.h
+*
+* @brief
+*  Contains functions for job queue
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_JOB_QUEUE_H_
+#define _IHEVCD_JOB_QUEUE_H_
+
+typedef struct
+{
+    /** Pointer to buffer base which contains the jobs */
+    void *pv_buf_base;
+
+    /** Pointer to current address where new job can be added */
+    void *pv_buf_wr;
+
+    /** Pointer to current address from where next job can be obtained */
+    void *pv_buf_rd;
+
+    /** Pointer to end of job buffer */
+    void *pv_buf_end;
+
+    /** Mutex used to keep the functions thread-safe */
+    void *pv_mutex;
+
+    /** Flag to indicate jobq has to be terminated */
+    WORD32 i4_terminate;
+#ifdef GPU_CIRCULAR_QUEUE
+    /** Flag to indicate jobq wrap around */
+    WORD32 i4_wrapped_around;
+#endif
+}jobq_t;
+
+WORD32 ihevcd_jobq_ctxt_size(void);
+void* ihevcd_jobq_init(void *pv_buf, WORD32 buf_size);
+IHEVCD_ERROR_T ihevcd_jobq_free(jobq_t *ps_jobq);
+IHEVCD_ERROR_T ihevcd_jobq_reset(jobq_t *ps_jobq);
+IHEVCD_ERROR_T ihevcd_jobq_deinit(jobq_t *ps_jobq);
+IHEVCD_ERROR_T ihevcd_jobq_terminate(jobq_t *ps_jobq);
+IHEVCD_ERROR_T ihevcd_jobq_queue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking);
+IHEVCD_ERROR_T ihevcd_jobq_dequeue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking);
+
+#endif /* _IHEVCD_JOB_QUEUE_H_ */
diff --git a/decoder/ihevcd_mv_merge.c b/decoder/ihevcd_mv_merge.c
new file mode 100644
index 0000000..4d5dfbd
--- /dev/null
+++ b/decoder/ihevcd_mv_merge.c
@@ -0,0 +1,938 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_mv_merge.c
+ *
+ * @brief
+ *  Contains functions for motion vector merge candidates derivation
+ *
+ * @author
+ *  Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_compare_pu_t()
+ * - ihevcd_collocated_mvp()
+ * - ihevcd_mv_merge()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_mv_merge.h"
+/**
+ *******************************************************************************
+ *
+ * @brief Compares the motion information of two PUs
+ *
+ *
+ * @par Description:
+ *   Checks if the MVs and reference indices of the two PUs match exactly
+ *   for the active prediction direction(s)
+ *
+ * @param[in] ps_pu_1
+ *   PU 1 whose motion information is to be compared
+ *
+ * @param[in] ps_pu_2
+ *   PU 2 whose motion information is to be compared
+ *
+ * @returns
+ *  1 if matching, 0 otherwise
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+WORD32 ihevcd_compare_pu_t(pu_t *ps_pu_1, pu_t *ps_pu_2)
+{
+    WORD32 l0_match = 0, l1_match = 0;
+    pu_mv_t *ps_mv_1, *ps_mv_2;
+    WORD32 pred_mode_1, pred_mode_2;
+
+    ps_mv_1 = &ps_pu_1->mv;
+    ps_mv_2 = &ps_pu_2->mv;
+
+    pred_mode_1 = ps_pu_1->b2_pred_mode;
+    pred_mode_2 = ps_pu_2->b2_pred_mode;
+
+    if(pred_mode_1 == pred_mode_2)
+    {
+        if(pred_mode_1 != PRED_L1)
+        {
+            if(ps_mv_1->i1_l0_ref_idx == ps_mv_2->i1_l0_ref_idx)
+            {
+                if(0 == memcmp(&ps_mv_1->s_l0_mv, &ps_mv_2->s_l0_mv, sizeof(mv_t)))
+                {
+                    l0_match = 1;
+                }
+            }
+        }
+        if(pred_mode_1 != PRED_L0)
+        {
+            if(ps_mv_1->i1_l1_ref_idx == ps_mv_2->i1_l1_ref_idx)
+            {
+                if(0 == memcmp(&ps_mv_1->s_l1_mv, &ps_mv_2->s_l1_mv, sizeof(mv_t)))
+                {
+                    l1_match = 1;
+                }
+            }
+        }
+        if(pred_mode_1 == PRED_BI)
+            return (l1_match && l0_match);
+        else if(pred_mode_1 == PRED_L0)
+            return l0_match;
+        else
+            return l1_match;
+    }
+
+    return 0;
+}
+
+void ihevcd_collocated_mvp(mv_ctxt_t *ps_mv_ctxt,
+                           pu_t *ps_pu,
+                           mv_t *ps_mv_col,
+                           WORD32 *pu4_avail_col_flag,
+                           WORD32 use_pu_ref_idx,
+                           WORD32 x_col,
+                           WORD32 y_col)
+{
+    sps_t *ps_sps = ps_mv_ctxt->ps_sps;
+    slice_header_t *ps_slice_hdr = ps_mv_ctxt->ps_slice_hdr;
+    ref_list_t *ps_ref_list[2];
+    mv_buf_t *ps_mv_buf_col;
+    WORD32 xp_col, yp_col;
+    WORD32 col_ctb_x, col_ctb_y;
+    mv_t as_mv_col[2];
+    WORD32 log2_ctb_size;
+    WORD32 ctb_size;
+    WORD32 avail_col;
+    WORD32 col_ctb_idx, pu_cnt;
+    WORD32 au4_list_col[2];
+    WORD32 num_minpu_in_ctb;
+    UWORD8 *pu1_pic_pu_map_ctb;
+    pu_t *ps_col_pu;
+    WORD32 part_pos_y;
+
+
+    part_pos_y = ps_pu->b4_pos_y << 2;
+
+    log2_ctb_size = ps_sps->i1_log2_ctb_size;
+    ctb_size = (1 << log2_ctb_size);
+
+    avail_col = 1;
+
+    /* Initializing reference list */
+    ps_ref_list[0] = ps_slice_hdr->as_ref_pic_list0;
+    ps_ref_list[1] = ps_slice_hdr->as_ref_pic_list1;
+    if(PSLICE == ps_slice_hdr->i1_slice_type)
+        ps_ref_list[1] = ps_slice_hdr->as_ref_pic_list0;
+
+    if((ps_slice_hdr->i1_slice_type == BSLICE) && (ps_slice_hdr->i1_collocated_from_l0_flag == 0))
+    {
+        /* L1 */
+        ps_mv_buf_col = (mv_buf_t *)ps_ref_list[1][ps_slice_hdr->i1_collocated_ref_idx].pv_mv_buf;
+
+    }
+    else
+    {
+        /* L0 */
+        ps_mv_buf_col = (mv_buf_t *)ps_ref_list[0][ps_slice_hdr->i1_collocated_ref_idx].pv_mv_buf;
+
+    }
+    num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+
+    if(((part_pos_y >> log2_ctb_size) == (y_col >> log2_ctb_size))
+                    && ((x_col + (ps_mv_ctxt->i4_ctb_x << log2_ctb_size)) < ps_sps->i2_pic_width_in_luma_samples)
+                    && (((y_col + (ps_mv_ctxt->i4_ctb_y << log2_ctb_size))
+                                    < ps_sps->i2_pic_height_in_luma_samples)))
+    {
+        xp_col = ((x_col >> 4) << 4);
+        yp_col = ((y_col >> 4) << 4);
+        col_ctb_x = ps_mv_ctxt->i4_ctb_x + (xp_col >> log2_ctb_size);
+        col_ctb_y = ps_mv_ctxt->i4_ctb_y + (yp_col >> log2_ctb_size);
+        col_ctb_idx = col_ctb_x + (col_ctb_y)*(ps_sps->i2_pic_wd_in_ctb);
+        pu_cnt = ps_mv_buf_col->pu4_pic_pu_idx[col_ctb_idx];
+        pu1_pic_pu_map_ctb = ps_mv_buf_col->pu1_pic_pu_map
+                        + col_ctb_idx * num_minpu_in_ctb;
+        if(xp_col == ctb_size)
+            xp_col = 0;
+        pu_cnt += pu1_pic_pu_map_ctb[(yp_col >> 2)
+                        * (ctb_size / MIN_PU_SIZE) + (xp_col >> 2)];
+        ps_col_pu = &ps_mv_buf_col->ps_pic_pu[pu_cnt];
+    }
+    else
+        avail_col = 0;
+
+    if((avail_col == 0) || (ps_col_pu->b1_intra_flag == 1)
+                    || (ps_slice_hdr->i1_slice_temporal_mvp_enable_flag == 0))
+    {
+        pu4_avail_col_flag[0] = 0;
+        pu4_avail_col_flag[1] = 0;
+        ps_mv_col[0].i2_mvx = 0;
+        ps_mv_col[0].i2_mvy = 0;
+        ps_mv_col[1].i2_mvx = 0;
+        ps_mv_col[1].i2_mvy = 0;
+    }
+    else
+    {
+        WORD32 au4_ref_idx_col[2];
+        WORD32 pred_flag_l0, pred_flag_l1;
+        pred_flag_l0 = (ps_col_pu->b2_pred_mode != PRED_L1);
+        pred_flag_l1 = (ps_col_pu->b2_pred_mode != PRED_L0);
+
+        if(pred_flag_l0 == 0)
+        {
+            as_mv_col[0] = ps_col_pu->mv.s_l1_mv;
+            au4_ref_idx_col[0] = ps_col_pu->mv.i1_l1_ref_idx;
+            au4_list_col[0] = 1; /* L1 */
+
+            as_mv_col[1] = ps_col_pu->mv.s_l1_mv;
+            au4_ref_idx_col[1] = ps_col_pu->mv.i1_l1_ref_idx;
+            au4_list_col[1] = 1; /* L1 */
+        }
+        else
+        {
+            if(pred_flag_l1 == 0)
+            {
+                as_mv_col[0] = ps_col_pu->mv.s_l0_mv;
+                au4_ref_idx_col[0] = ps_col_pu->mv.i1_l0_ref_idx;
+                au4_list_col[0] = 0; /* L0 */
+
+                as_mv_col[1] = ps_col_pu->mv.s_l0_mv;
+                au4_ref_idx_col[1] = ps_col_pu->mv.i1_l0_ref_idx;
+                au4_list_col[1] = 0; /* L0 */
+            }
+            else
+            {
+                if(1 == ps_slice_hdr->i1_low_delay_flag)
+                {
+                    as_mv_col[0] = ps_col_pu->mv.s_l0_mv;
+                    au4_ref_idx_col[0] = ps_col_pu->mv.i1_l0_ref_idx;
+                    au4_list_col[0] = 0; /* L0 */
+
+                    as_mv_col[1] = ps_col_pu->mv.s_l1_mv;
+                    au4_ref_idx_col[1] = ps_col_pu->mv.i1_l1_ref_idx;
+                    au4_list_col[1] = 1; /* L1 */
+                }
+                else
+                {
+                    if(0 == ps_slice_hdr->i1_collocated_from_l0_flag)
+                    {
+                        as_mv_col[0] = ps_col_pu->mv.s_l0_mv;
+                        au4_ref_idx_col[0] = ps_col_pu->mv.i1_l0_ref_idx;
+
+                        as_mv_col[1] = ps_col_pu->mv.s_l0_mv;
+                        au4_ref_idx_col[1] = ps_col_pu->mv.i1_l0_ref_idx;
+                    }
+                    else
+                    {
+                        as_mv_col[0] = ps_col_pu->mv.s_l1_mv;
+                        au4_ref_idx_col[0] = ps_col_pu->mv.i1_l1_ref_idx;
+
+                        as_mv_col[1] = ps_col_pu->mv.s_l1_mv;
+                        au4_ref_idx_col[1] = ps_col_pu->mv.i1_l1_ref_idx;
+                    }
+
+                    au4_list_col[0] = ps_slice_hdr->i1_collocated_from_l0_flag; /* L"collocated_from_l0_flag" */
+                    au4_list_col[1] = ps_slice_hdr->i1_collocated_from_l0_flag; /* L"collocated_from_l0_flag" */
+                }
+            }
+        }
+        avail_col = 1;
+        {
+            WORD32 cur_poc, col_poc, col_ref_poc_l0, cur_ref_poc;
+            WORD32 col_ref_poc_l0_lt, cur_ref_poc_lt;
+            WORD32 ref_idx_l0, ref_idx_l1;
+            WORD32 slice_idx;
+            pic_buf_t *ps_pic_buf;
+
+            if(use_pu_ref_idx)
+            {
+                ref_idx_l0 = ps_pu->mv.i1_l0_ref_idx;
+                ref_idx_l1 = ps_pu->mv.i1_l1_ref_idx;
+            }
+            else
+            {
+                ref_idx_l0 = 0;
+                ref_idx_l1 = 0;
+            }
+
+            col_poc = ps_mv_buf_col->i4_abs_poc;
+            cur_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+
+            slice_idx = *(ps_mv_buf_col->pu1_pic_slice_map + col_ctb_x + col_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+
+            if(au4_list_col[0] == 0)
+            {
+                col_ref_poc_l0 =
+                                ps_mv_buf_col->l0_collocated_poc[slice_idx][au4_ref_idx_col[0]];
+                col_ref_poc_l0_lt =
+                                (ps_mv_buf_col->u1_l0_collocated_poc_lt[slice_idx][au4_ref_idx_col[0]] == LONG_TERM_REF);
+            }
+            else
+            {
+                col_ref_poc_l0 =
+                                ps_mv_buf_col->l1_collocated_poc[slice_idx][au4_ref_idx_col[0]];
+                col_ref_poc_l0_lt =
+                                (ps_mv_buf_col->u1_l1_collocated_poc_lt[slice_idx][au4_ref_idx_col[0]] == LONG_TERM_REF);
+            }
+            /* L0 collocated mv */
+            ps_pic_buf = (pic_buf_t *)((ps_ref_list[0][ref_idx_l0].pv_pic_buf));
+            cur_ref_poc = ps_pic_buf->i4_abs_poc;
+            cur_ref_poc_lt = (ps_pic_buf->u1_used_as_ref == LONG_TERM_REF);
+
+            if(cur_ref_poc_lt == col_ref_poc_l0_lt)
+            {
+                pu4_avail_col_flag[0] = 1;
+
+                if(cur_ref_poc_lt || ((col_poc - col_ref_poc_l0) == (cur_poc - cur_ref_poc)))
+                {
+                    ps_mv_col[0] = as_mv_col[0];
+                }
+                else
+                {
+                    ps_mv_col[0] = as_mv_col[0];
+                    if(col_ref_poc_l0 != col_poc)
+                        ihevcd_scale_collocated_mv((mv_t *)(&ps_mv_col[0]), cur_ref_poc,
+                                                   col_ref_poc_l0, col_poc, cur_poc);
+                }
+            }
+            else
+            {
+                pu4_avail_col_flag[0] = 0;
+                ps_mv_col[0].i2_mvx = 0;
+                ps_mv_col[0].i2_mvy = 0;
+            }
+            if((BSLICE == ps_slice_hdr->i1_slice_type))
+            {
+                WORD32 col_ref_poc_l1_lt, col_ref_poc_l1;
+
+                if(au4_list_col[1] == 0)
+                {
+                    col_ref_poc_l1 =
+                                    ps_mv_buf_col->l0_collocated_poc[slice_idx][au4_ref_idx_col[1]];
+                    col_ref_poc_l1_lt =
+                                    (ps_mv_buf_col->u1_l0_collocated_poc_lt[slice_idx][au4_ref_idx_col[1]] == LONG_TERM_REF);
+                }
+                else
+                {
+                    col_ref_poc_l1 =
+                                    ps_mv_buf_col->l1_collocated_poc[slice_idx][au4_ref_idx_col[1]];
+                    col_ref_poc_l1_lt =
+                                    (ps_mv_buf_col->u1_l1_collocated_poc_lt[slice_idx][au4_ref_idx_col[1]] == LONG_TERM_REF);
+                }
+
+                /* L1 collocated mv */
+                ps_pic_buf = (pic_buf_t *)((ps_ref_list[1][ref_idx_l1].pv_pic_buf));
+                cur_ref_poc = ps_pic_buf->i4_abs_poc;
+                cur_ref_poc_lt = (ps_pic_buf->u1_used_as_ref == LONG_TERM_REF);
+
+                if(cur_ref_poc_lt == col_ref_poc_l1_lt)
+                {
+                    pu4_avail_col_flag[1] = 1;
+
+                    if(cur_ref_poc_lt || ((col_poc - col_ref_poc_l1) == (cur_poc - cur_ref_poc)))
+                    {
+                        ps_mv_col[1] = as_mv_col[1];
+                    }
+                    else
+                    {
+                        ps_mv_col[1] = as_mv_col[1];
+                        if(col_ref_poc_l1 != col_poc)
+                            ihevcd_scale_collocated_mv((mv_t *)&ps_mv_col[1], cur_ref_poc,
+                                                       col_ref_poc_l1, col_poc, cur_poc);
+                    }
+                }
+                else
+                {
+                    pu4_avail_col_flag[1] = 0;
+                    ps_mv_col[1].i2_mvx = 0;
+                    ps_mv_col[1].i2_mvy = 0;
+                }
+            }
+            else
+            {
+                pu4_avail_col_flag[1] = 0;
+            }
+        }
+    }
+}
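+
+/* Note: the collocated MV is fetched at 16x16 granularity (x_col/y_col are
+ * rounded down to multiples of 16 above) from the MV buffer of the picture
+ * selected by collocated_from_l0_flag/collocated_ref_idx, and is later
+ * scaled by POC distance unless the reference is long term. */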
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs motion vector merge candidate list derivation
+ *
+ * @par Description:
+ *  The MV merge candidate list is computed using neighbour MVs and the
+ *  collocated MV
+ *
+ * @param[in] ps_mv_ctxt
+ * pointer to mv context
+ *
+ * @param[in] pu4_top_pu_idx
+ * pointer to top neighbour PU indices (at 4x4 granularity)
+ *
+ * @param[in] pu4_left_pu_idx
+ * pointer to left neighbour PU indices (at 4x4 granularity)
+ *
+ * @param[in] left_nbr_4x4_strd
+ * left nbr buffer stride in terms of 4x4 units
+ *
+ * @param[inout] ps_pu
+ * Current partition PU structure pointer; the selected merge candidate's
+ * mv and prediction mode are written to it
+ *
+ * @param[in] part_mode
+ * Partition mode @sa PART_SIZE_E
+ *
+ * @param[in] part_idx
+ * Partition idx of current partition inside CU
+ *
+ * @param[in] part_wd, part_ht
+ * Partition width and height
+ *
+ * @param[in] part_pos_x, part_pos_y
+ * Partition position w.r.t. the CTB, in luma samples
+ *
+ * @param[in] single_mcl_flag
+ * Single MCL flag based on 8x8 CU and parallel merge value
+ *
+ * @param[in] lb_avail, l_avail, tr_avail, t_avail, tl_avail
+ * Neighbour availability flags (left-bottom, left, top-right, top, top-left)
+ *
+ * @returns
+ * None
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+void ihevcd_mv_merge(mv_ctxt_t *ps_mv_ctxt,
+                     UWORD32 *pu4_top_pu_idx,
+                     UWORD32 *pu4_left_pu_idx,
+                     WORD32 left_nbr_4x4_strd,
+                     pu_t *ps_pu,
+                     WORD32 part_mode,
+                     WORD32 part_idx,
+                     WORD32 part_wd,
+                     WORD32 part_ht,
+                     WORD32 part_pos_x,
+                     WORD32 part_pos_y,
+                     WORD32 single_mcl_flag,
+                     WORD32 lb_avail,
+                     WORD32 l_avail,
+                     WORD32 tr_avail,
+                     WORD32 t_avail,
+                     WORD32 tl_avail)
+{
+    /******************************************************/
+    /*      Spatial Merge Candidates                      */
+    /******************************************************/
+    slice_header_t *ps_slice_hdr;
+    pu_t as_pu_merge_list[MAX_NUM_MERGE_CAND];
+    pps_t *ps_pps;
+    ref_list_t *ps_ref_list[2];
+    WORD32 sum_avail_a0_a1_b0_b1 = 0; /*Sum of availability of A0, A1, B0, B1*/
+    WORD32 nbr_x, nbr_y;
+    WORD32 nbr_avail[MAX_NUM_MV_NBR];
+    WORD32 merge_shift;
+    WORD32 nbr_pu_idx;
+    pu_t *ps_nbr_pu[MAX_NUM_MV_NBR];
+    WORD32 max_num_merge_cand;
+    WORD32 candidate_cnt;
+    WORD32 pos_x_merge_shift, pos_y_merge_shift;
+
+    ps_slice_hdr = ps_mv_ctxt->ps_slice_hdr;
+    ps_pps = ps_mv_ctxt->ps_pps;
+    /* Initializing reference list */
+    ps_ref_list[0] = ps_slice_hdr->as_ref_pic_list0;
+    ps_ref_list[1] = ps_slice_hdr->as_ref_pic_list1;
+    if(PSLICE == ps_slice_hdr->i1_slice_type)
+        ps_ref_list[1] = ps_slice_hdr->as_ref_pic_list0;
+
+    candidate_cnt = 0;
+    /*******************************************/
+    /* Neighbor location: Graphical indication */
+    /*                                         */
+    /*          B2 _____________B1 B0          */
+    /*            |               |            */
+    /*            |               |            */
+    /*            |               |            */
+    /*            |      PU     ht|            */
+    /*            |               |            */
+    /*            |               |            */
+    /*          A1|______wd_______|            */
+    /*          A0                             */
+    /*                                         */
+    /*******************************************/
+
+    merge_shift = ps_pps->i1_log2_parallel_merge_level;
+
+    /* Availability check */
+    /* A1 */
+    nbr_x = part_pos_x - 1;
+    nbr_y = part_pos_y + part_ht - 1; /* A1 */
+
+    nbr_pu_idx = *(pu4_left_pu_idx + ((nbr_y - part_pos_y) >> 2) * left_nbr_4x4_strd);
+    ps_nbr_pu[NBR_A1] = ps_mv_ctxt->ps_pic_pu + nbr_pu_idx;
+
+    nbr_avail[NBR_A1] = l_avail
+                    && (!ps_nbr_pu[NBR_A1]->b1_intra_flag); /* A1 */
+
+    pos_x_merge_shift = (part_pos_x >> merge_shift);
+    pos_y_merge_shift = (part_pos_y >> merge_shift);
+    max_num_merge_cand = ps_pu->b3_merge_idx + 1;
+
+    {
+        if(nbr_avail[NBR_A1])
+        {
+            /* if at same merge level */
+            if(pos_x_merge_shift == (nbr_x >> merge_shift) &&
+               (pos_y_merge_shift == (nbr_y >> merge_shift)))
+            {
+                nbr_avail[NBR_A1] = 0;
+            }
+
+            /* The spec (JCTVC-K1003_v9) handles unavailable candidates      */
+            /* differently from this software. For the non-square, second    */
+            /* partition case, ideally nothing from the 1st partition should */
+            /* be used as per the spec, but the HM 8.2 dev version does not  */
+            /* adhere to this; currently the code follows HM                 */
+
+            /* if single MCL is 0 , second part of 2 part in CU */
+            else if((single_mcl_flag == 0) && (part_idx == 1) &&
+                            ((part_mode == PART_Nx2N) || (part_mode == PART_nLx2N) ||
+                                            (part_mode == PART_nRx2N)))
+            {
+                nbr_avail[NBR_A1] = 0;
+            }
+            sum_avail_a0_a1_b0_b1 += nbr_avail[NBR_A1];
+            if(nbr_avail[NBR_A1])
+            {
+                as_pu_merge_list[candidate_cnt] = *ps_nbr_pu[NBR_A1];
+                candidate_cnt++;
+                if(candidate_cnt == max_num_merge_cand)
+                {
+                    ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+                    ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+                    return;
+                }
+            }
+        }
+    }
+
+    /* B1 */
+    nbr_x = part_pos_x + part_wd - 1;
+    nbr_y = part_pos_y - 1;
+
+    nbr_pu_idx = *(pu4_top_pu_idx + ((nbr_x - part_pos_x) >> 2));
+    ps_nbr_pu[NBR_B1] = ps_mv_ctxt->ps_pic_pu + nbr_pu_idx;
+
+    nbr_avail[NBR_B1] = t_avail
+                    && (!ps_nbr_pu[NBR_B1]->b1_intra_flag); /* B1 */
+
+    {
+        WORD32 avail_flag;
+        avail_flag = nbr_avail[NBR_B1];
+
+        if(nbr_avail[NBR_B1])
+        {
+            /* if at same merge level */
+            if(pos_x_merge_shift == (nbr_x >> merge_shift) &&
+               (pos_y_merge_shift == (nbr_y >> merge_shift)))
+            {
+                nbr_avail[NBR_B1] = 0;
+                avail_flag = 0;
+            }
+
+            /* if single MCL is 0 , second part of 2 part in CU */
+            else if((single_mcl_flag == 0) && (part_idx == 1) &&
+                            ((part_mode == PART_2NxN) || (part_mode == PART_2NxnU) ||
+                                            (part_mode == PART_2NxnD)))
+            {
+                nbr_avail[NBR_B1] = 0;
+                avail_flag = 0;
+            }
+
+            else if(nbr_avail[NBR_A1])
+            {
+                avail_flag = !ihevcd_compare_pu_t(ps_nbr_pu[NBR_A1], ps_nbr_pu[NBR_B1]);
+            }
+
+            sum_avail_a0_a1_b0_b1 += avail_flag;
+            if(avail_flag)
+            {
+                as_pu_merge_list[candidate_cnt] = *ps_nbr_pu[NBR_B1];
+                candidate_cnt++;
+                if(candidate_cnt == max_num_merge_cand)
+                {
+                    ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+                    ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+                    return;
+                }
+            }
+        }
+    }
+    /* B0 */
+    nbr_x = part_pos_x + part_wd;
+    nbr_y = part_pos_y - 1;
+
+    nbr_pu_idx = *(pu4_top_pu_idx + ((nbr_x - part_pos_x) >> 2));
+    ps_nbr_pu[NBR_B0] = ps_mv_ctxt->ps_pic_pu + nbr_pu_idx;
+
+    nbr_avail[NBR_B0] = tr_avail
+                    && (!ps_nbr_pu[NBR_B0]->b1_intra_flag); /* B0 */
+
+    {
+        WORD32 avail_flag;
+        avail_flag = nbr_avail[NBR_B0];
+
+        /* if at same merge level */
+        if(nbr_avail[NBR_B0])
+        {
+            if(pos_x_merge_shift == (nbr_x >> merge_shift) &&
+               (pos_y_merge_shift == (nbr_y >> merge_shift)))
+            {
+                nbr_avail[NBR_B0] = 0;
+                avail_flag = 0;
+            }
+            else if(nbr_avail[NBR_B1])
+            {
+                avail_flag = !ihevcd_compare_pu_t(ps_nbr_pu[NBR_B1], ps_nbr_pu[NBR_B0]);
+            }
+
+            sum_avail_a0_a1_b0_b1 += avail_flag;
+            if(avail_flag)
+            {
+                as_pu_merge_list[candidate_cnt] = *ps_nbr_pu[NBR_B0];
+                candidate_cnt++;
+                if(candidate_cnt == max_num_merge_cand)
+                {
+                    ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+                    ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+                    return;
+                }
+            }
+        }
+    }
+    /* A0 */
+    nbr_x = part_pos_x - 1;
+    nbr_y = part_pos_y + part_ht; /* A0 */
+
+    nbr_pu_idx = *(pu4_left_pu_idx + ((nbr_y - part_pos_y) >> 2) * left_nbr_4x4_strd);
+    ps_nbr_pu[NBR_A0] = ps_mv_ctxt->ps_pic_pu + nbr_pu_idx;
+
+    nbr_avail[NBR_A0] = lb_avail
+                    && (!ps_nbr_pu[NBR_A0]->b1_intra_flag); /* A0 */
+    {
+        WORD32 avail_flag;
+        avail_flag = nbr_avail[NBR_A0];
+
+        if(nbr_avail[NBR_A0])
+        {
+            /* if at same merge level */
+            if(pos_x_merge_shift == (nbr_x >> merge_shift) &&
+                            (pos_y_merge_shift == (nbr_y >> merge_shift)))
+            {
+                nbr_avail[NBR_A0] = 0;
+                avail_flag = 0;
+            }
+            else if(nbr_avail[NBR_A1])
+            {
+                avail_flag = !ihevcd_compare_pu_t(ps_nbr_pu[NBR_A1], ps_nbr_pu[NBR_A0]);
+            }
+
+            sum_avail_a0_a1_b0_b1 += avail_flag;
+            if(avail_flag)
+            {
+                as_pu_merge_list[candidate_cnt] = *ps_nbr_pu[NBR_A0];
+                candidate_cnt++;
+                if(candidate_cnt == max_num_merge_cand)
+                {
+                    ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+                    ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+                    return;
+                }
+            }
+        }
+    }
+    /* B2 */
+
+    nbr_x = part_pos_x - 1;
+    nbr_y = part_pos_y - 1; /* B2 */
+
+    nbr_pu_idx = *(pu4_top_pu_idx + ((nbr_x - part_pos_x) >> 2));
+    ps_nbr_pu[NBR_B2] = ps_mv_ctxt->ps_pic_pu + nbr_pu_idx;
+
+    nbr_avail[NBR_B2] = tl_avail
+                    && (!ps_nbr_pu[NBR_B2]->b1_intra_flag); /* B2 */
+
+    {
+        WORD32 avail_flag;
+        avail_flag = nbr_avail[NBR_B2];
+
+        if(nbr_avail[NBR_B2])
+        {
+            /* if at same merge level */
+            if(pos_x_merge_shift == (nbr_x >> merge_shift) &&
+                            (pos_y_merge_shift == (nbr_y >> merge_shift)))
+            {
+                nbr_avail[NBR_B2] = 0;
+                avail_flag = 0;
+            }
+            else if(4 == sum_avail_a0_a1_b0_b1)
+            {
+                avail_flag = 0;
+            }
+
+            else
+            {
+                if(nbr_avail[NBR_A1])
+                {
+                    avail_flag = !ihevcd_compare_pu_t(ps_nbr_pu[NBR_A1], ps_nbr_pu[NBR_B2]);
+                }
+
+                if(avail_flag && nbr_avail[NBR_B1])
+                {
+                    avail_flag = !ihevcd_compare_pu_t(ps_nbr_pu[NBR_B1], ps_nbr_pu[NBR_B2]);
+                }
+            }
+
+            if(avail_flag)
+            {
+                as_pu_merge_list[candidate_cnt] = *ps_nbr_pu[NBR_B2];
+                candidate_cnt++;
+                if(candidate_cnt == max_num_merge_cand)
+                {
+                    ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+                    ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+                    return;
+                }
+            }
+        }
+    }
+
+    /***********************************************************/
+    /*          Collocated MV prediction                       */
+    /***********************************************************/
+#if 1
+    {
+        mv_t as_mv_col[2];
+        WORD32 avail_col_flag[2] = { 0 }, x_col, y_col;
+        WORD32 avail_col_l0, avail_col_l1;
+//        ihevcd_collocated_mvp(ps_mv_ctxt,ps_pu,part_pos_x,part_pos_y,part_wd,part_ht,as_mv_col,avail_col_flag,0);
+
+        /* Checking Collocated MV availability at Bottom right of PU*/
+        x_col = part_pos_x + part_wd;
+        y_col = part_pos_y + part_ht;
+        ihevcd_collocated_mvp(ps_mv_ctxt, ps_pu, as_mv_col, avail_col_flag, 0, x_col, y_col);
+
+        avail_col_l0 = avail_col_flag[0];
+        avail_col_l1 = avail_col_flag[1];
+
+        if(avail_col_l0 || avail_col_l1)
+        {
+            as_pu_merge_list[candidate_cnt].mv.s_l0_mv = as_mv_col[0];
+            as_pu_merge_list[candidate_cnt].mv.s_l1_mv = as_mv_col[1];
+        }
+
+        if(avail_col_l0 == 0 || avail_col_l1 == 0)
+        {
+            /* Checking Collocated MV availability at Center of PU */
+            x_col = part_pos_x + (part_wd >> 1);
+            y_col = part_pos_y + (part_ht >> 1);
+            ihevcd_collocated_mvp(ps_mv_ctxt, ps_pu, as_mv_col, avail_col_flag, 0, x_col, y_col);
+
+            if(avail_col_l0 == 0)
+            {
+                as_pu_merge_list[candidate_cnt].mv.s_l0_mv = as_mv_col[0];
+            }
+            if(avail_col_l1 == 0)
+            {
+                as_pu_merge_list[candidate_cnt].mv.s_l1_mv = as_mv_col[1];
+            }
+
+            avail_col_l0 |= avail_col_flag[0];
+            avail_col_l1 |= avail_col_flag[1];
+        }
+
+        as_pu_merge_list[candidate_cnt].mv.i1_l0_ref_idx = 0;
+        as_pu_merge_list[candidate_cnt].mv.i1_l1_ref_idx = 0;
+        as_pu_merge_list[candidate_cnt].b2_pred_mode = avail_col_l0 ? (avail_col_l1 ? PRED_BI : PRED_L0) : PRED_L1;
+
+        candidate_cnt += (avail_col_l0 || avail_col_l1);
+
+        if(candidate_cnt == max_num_merge_cand)
+        {
+            ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+            ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+            return;
+        }
+
+    }
+#endif
+    {
+        WORD32 slice_type;
+
+        slice_type = ps_slice_hdr->i1_slice_type;
+        /* Colocated mv has to be added to list, if available */
+
+        /******************************************************/
+        /*      Bi pred merge candidates                      */
+        /******************************************************/
+        if(slice_type == BSLICE)
+        {
+            if((candidate_cnt > 1) && (candidate_cnt < MAX_NUM_MERGE_CAND))
+            {
+                WORD32 priority_list0[12] =
+                    { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 };
+                WORD32 priority_list1[12] =
+                    { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 };
+                WORD32 l0_cand, l1_cand;
+                WORD32 bi_pred_idx = 0;
+                WORD32 total_bi_pred_cand =
+                                candidate_cnt * (candidate_cnt - 1);
+
+                while(bi_pred_idx < total_bi_pred_cand)
+                {
+                    l0_cand = priority_list0[bi_pred_idx];
+                    l1_cand = priority_list1[bi_pred_idx];
+
+                    if((as_pu_merge_list[l0_cand].b2_pred_mode != PRED_L1)
+                                    && (as_pu_merge_list[l1_cand].b2_pred_mode
+                                                    != PRED_L0))
+                    {
+                        WORD8 i1_l0_ref_idx, i1_l1_ref_idx;
+                        mv_t s_l0_mv, s_l1_mv;
+                        pic_buf_t *ps_pic_buf_l0, *ps_pic_buf_l1;
+
+                        i1_l0_ref_idx = as_pu_merge_list[l0_cand].mv.i1_l0_ref_idx;
+                        i1_l1_ref_idx = as_pu_merge_list[l1_cand].mv.i1_l1_ref_idx;
+                        ps_pic_buf_l0 = (pic_buf_t *)((ps_ref_list[0][i1_l0_ref_idx].pv_pic_buf));
+                        ps_pic_buf_l1 = (pic_buf_t *)((ps_ref_list[1][i1_l1_ref_idx].pv_pic_buf));
+                        s_l0_mv = as_pu_merge_list[l0_cand].mv.s_l0_mv;
+                        s_l1_mv = as_pu_merge_list[l1_cand].mv.s_l1_mv;
+
+                        if((ps_pic_buf_l0->i4_abs_poc != ps_pic_buf_l1->i4_abs_poc)
+                                        || (s_l0_mv.i2_mvx != s_l1_mv.i2_mvx)
+                                        || (s_l0_mv.i2_mvy != s_l1_mv.i2_mvy))
+                        {
+                            candidate_cnt++;
+                            if(candidate_cnt == max_num_merge_cand)
+                            {
+                                ps_pu[0].mv.s_l0_mv = s_l0_mv;
+                                ps_pu[0].mv.s_l1_mv = s_l1_mv;
+                                ps_pu[0].mv.i1_l0_ref_idx = i1_l0_ref_idx;
+                                ps_pu[0].mv.i1_l1_ref_idx = i1_l1_ref_idx;
+                                ps_pu[0].b2_pred_mode = PRED_BI;
+                                return;
+                            }
+                        }
+                    }
+
+                    bi_pred_idx++;
+
+                    if((bi_pred_idx == total_bi_pred_cand)
+                                    || (candidate_cnt == MAX_NUM_MERGE_CAND))
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+
+        /******************************************************/
+        /*      Zero merge candidates                         */
+        /******************************************************/
+//        if(candidate_cnt < max_num_merge_cand)
+        {
+            WORD32 num_ref_idx;
+            WORD32 zero_idx;
+
+            zero_idx = max_num_merge_cand - candidate_cnt - 1;
+
+            if(slice_type == PSLICE)
+                num_ref_idx = ps_slice_hdr->i1_num_ref_idx_l0_active;
+            else
+                /* Slice type B */
+                num_ref_idx = MIN(ps_slice_hdr->i1_num_ref_idx_l0_active, ps_slice_hdr->i1_num_ref_idx_l1_active);
+
+            if(zero_idx >= num_ref_idx)
+                zero_idx = 0;
+
+            ps_pu[0].mv.i1_l0_ref_idx = zero_idx;
+            if(slice_type == PSLICE)
+            {
+                ps_pu[0].mv.i1_l1_ref_idx = 0;
+                ps_pu[0].b2_pred_mode = PRED_L0;
+            }
+            else /* Slice type B */
+            {
+                ps_pu[0].mv.i1_l1_ref_idx = zero_idx;
+                ps_pu[0].b2_pred_mode = PRED_BI;
+            }
+
+            ps_pu[0].mv.s_l0_mv.i2_mvx = 0;
+            ps_pu[0].mv.s_l0_mv.i2_mvy = 0;
+            ps_pu[0].mv.s_l1_mv.i2_mvx = 0;
+            ps_pu[0].mv.s_l1_mv.i2_mvy = 0;
+
+            candidate_cnt++;
+        }
+    }
+
+    return;
+}
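+
+/* Candidate order implemented above: spatial A1, B1, B0, A0, then B2 (B2 is
+ * skipped when all of A0/A1/B0/B1 made it into the list), then the temporal
+ * candidate, then combined bi-predictive candidates (B slices only), and
+ * finally zero-MV candidates to fill the list. */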
+
+
diff --git a/decoder/ihevcd_mv_merge.h b/decoder/ihevcd_mv_merge.h
new file mode 100644
index 0000000..52a7e98
--- /dev/null
+++ b/decoder/ihevcd_mv_merge.h
@@ -0,0 +1,111 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_mv_merge.h
+*
+* @brief
+*    This file contains function prototypes of the MV merge candidate list
+*    derivation functions and the corresponding structure and macro definitions
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_MV_MERGE_H_
+#define _IHEVCD_MV_MERGE_H_
+
+/*****************************************************************************/
+/* Constant Macros                                                           */
+/*****************************************************************************/
+
+#define MAX_NUM_MV_NBR     5
+
+/*****************************************************************************/
+/* Function Macros                                                           */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Typedefs                                                                  */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Enums                                                                     */
+/*****************************************************************************/
+typedef enum
+{
+    NBR_A0 = 0,
+    NBR_A1 = 1,
+    NBR_B0 = 2,
+    NBR_B1 = 3,
+    NBR_B2 = 4,
+
+    /* should be last */
+    MAX_NUM_NBRS
+}MV_MERGE_NBRS_T;
+
+/*****************************************************************************/
+/* Structure                                                                 */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Extern Variable Declarations                                              */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Extern Function Declarations                                              */
+/*****************************************************************************/
+void ihevcd_mv_merge(mv_ctxt_t *ps_mv_ctxt,
+                     UWORD32 *pu4_top_pu_idx,
+                     UWORD32 *pu4_left_pu_idx,
+                     WORD32 left_nbr_4x4_strd,
+                     pu_t *ps_pu,
+                     WORD32 part_mode,
+                     WORD32 part_idx,
+                     WORD32 part_wd,
+                     WORD32 part_ht,
+                     WORD32 part_pos_x,
+                     WORD32 part_pos_y,
+                     WORD32 single_mcl_flag,
+                     WORD32 lb_avail,
+                     WORD32 l_avail,
+                     WORD32 tr_avail,
+                     WORD32 t_avail,
+                     WORD32 tl_avail);
+void ihevcd_collocated_mvp(mv_ctxt_t *ps_mv_ctxt,
+                           pu_t *ps_pu,
+                           mv_t *ps_mv_col,
+                           WORD32 *pu4_avail_col_flag,
+                           WORD32 use_pu_ref_idx,
+                           WORD32 x_col,
+                           WORD32 y_col);
+
+void ihevcd_scale_collocated_mv(mv_t *ps_mv,
+                                WORD32 cur_ref_poc,
+                                WORD32 col_ref_poc,
+                                WORD32 col_poc,
+                                WORD32 cur_poc);
+#endif  /* _IHEVCD_MV_MERGE_H_ */
diff --git a/decoder/ihevcd_mv_pred.c b/decoder/ihevcd_mv_pred.c
new file mode 100644
index 0000000..e811198
--- /dev/null
+++ b/decoder/ihevcd_mv_pred.c
@@ -0,0 +1,874 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_mv_pred.c
+ *
+ * @brief
+ *  Contains functions for motion vector prediction
+ *
+ * @author
+ *  Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_scale_mv()
+ * - ihevcd_scale_collocated_mv()
+ * - ihevcd_mv_pred()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_mv_merge.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief Function for scaling a motion vector
+ *
+ *
+ * @par Description:
+ *   Scales the mv by the ratio of the POC distance between the current
+ *   picture and the current reference to the POC distance between the
+ *   current picture and the neighbour's reference
+ *
+ * @param[inout] ps_mv
+ *   motion vector to be scaled
+ *
+ * @param[in] cur_ref_poc
+ *   Current PU reference pic poc
+ *
+ * @param[in] nbr_ref_poc
+ *   Neighbor PU reference pic poc
+ *
+ * @param[in] cur_poc
+ *   Picture order count of current pic
+ *
+ * @returns
+ *  None
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+void ihevcd_scale_mv(mv_t *ps_mv,
+                     WORD32 cur_ref_poc,
+                     WORD32 nbr_ref_poc,
+                     WORD32 cur_poc)
+{
+    WORD32 td, tb, tx;
+    WORD32 dist_scale_factor;
+    WORD32 mvx, mvy;
+
+    td = CLIP_S8(cur_poc - nbr_ref_poc);
+    tb = CLIP_S8(cur_poc - cur_ref_poc);
+
+    if(0 != td)
+    {
+        tx = (16384 + (abs(td) >> 1)) / td;
+
+        dist_scale_factor = (tb * tx + 32) >> 6;
+        dist_scale_factor = CLIP3(dist_scale_factor, -4096, 4095);
+
+        mvx = ps_mv->i2_mvx;
+        mvy = ps_mv->i2_mvy;
+
+        mvx = SIGN(dist_scale_factor * mvx)
+                        * ((abs(dist_scale_factor * mvx) + 127) >> 8);
+        mvy = SIGN(dist_scale_factor * mvy)
+                        * ((abs(dist_scale_factor * mvy) + 127) >> 8);
+
+        ps_mv->i2_mvx = CLIP_S16(mvx);
+        ps_mv->i2_mvy = CLIP_S16(mvy);
+    }
+}
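+
+/* Worked example (added commentary, values illustrative): with cur_poc = 8,
+ * cur_ref_poc = 4 and nbr_ref_poc = 2:
+ *   td = 6, tb = 4, tx = (16384 + 3) / 6 = 2731
+ *   dist_scale_factor = (4 * 2731 + 32) >> 6 = 171, roughly tb/td in Q8
+ *   an input mvx of 16 scales to SIGN(171 * 16) * ((2736 + 127) >> 8) = 11
+ */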
+
+/**
+ *******************************************************************************
+ *
+ * @brief Function for scaling a collocated (temporal) motion vector
+ *
+ *
+ * @par Description:
+ *   Scales the mv by the ratio of the POC distance between the current
+ *   picture and the current reference to the POC distance between the
+ *   collocated picture and the collocated reference
+ *
+ * @param[inout] ps_mv
+ *   motion vector to be scaled
+ *
+ * @param[in] cur_ref_poc
+ *   Current PU reference pic poc
+ *
+ * @param[in] col_ref_poc
+ *   Collocated PU reference pic poc
+ *
+ * @param[in] col_poc
+ *   Picture order count of collocated pic
+ *
+ * @param[in] cur_poc
+ *   Picture order count of current pic
+ *
+ * @returns
+ *  None
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+void ihevcd_scale_collocated_mv(mv_t *ps_mv,
+                                WORD32 cur_ref_poc,
+                                WORD32 col_ref_poc,
+                                WORD32 col_poc,
+                                WORD32 cur_poc)
+{
+    WORD32 td, tb, tx;
+    WORD32 dist_scale_factor;
+    WORD32 mvx, mvy;
+
+    td = CLIP_S8(col_poc - col_ref_poc);
+    tb = CLIP_S8(cur_poc - cur_ref_poc);
+
+    tx = (16384 + (abs(td) >> 1)) / td;
+
+    dist_scale_factor = (tb * tx + 32) >> 6;
+    dist_scale_factor = CLIP3(dist_scale_factor, -4096, 4095);
+
+    mvx = ps_mv->i2_mvx;
+    mvy = ps_mv->i2_mvy;
+
+    mvx = SIGN(dist_scale_factor * mvx)
+                    * ((abs(dist_scale_factor * mvx) + 127) >> 8);
+    mvy = SIGN(dist_scale_factor * mvy)
+                    * ((abs(dist_scale_factor * mvy) + 127) >> 8);
+
+    ps_mv->i2_mvx = CLIP_S16(mvx);
+    ps_mv->i2_mvy = CLIP_S16(mvy);
+}
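+
+/* Note (added commentary): unlike ihevcd_scale_mv(), td is used here
+ * without a zero check; this presumes col_poc != col_ref_poc, i.e. the
+ * collocated MV does not refer to the collocated picture itself
+ */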
+
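+/* Note (added commentary): the CHECK_NBR_MV_ST/CHECK_NBR_MV_LT macros below
+ * end with a 'break' and are therefore only valid when expanded inside a
+ * loop, as done in GET_MV_NBR_ST()/GET_MV_NBR_LT(); the break stops the
+ * neighbour scan at the first matching candidate
+ */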
+#if 1
+#define CHECK_NBR_MV_ST(pi4_avail_flag, cur_ref_poc, u1_nbr_pred_flag, nbr_ref_poc,         \
+                        ps_mv, ps_nbr_mv )                                                  \
+{                                                                                           \
+    if((u1_nbr_pred_flag) && (cur_ref_poc == nbr_ref_poc))                                  \
+    {                                                                                       \
+        *pi4_avail_flag = 1;                                                                \
+        *ps_mv = *ps_nbr_mv;                                                                \
+        break;                                                                              \
+    }                                                                                       \
+}
+#define CHECK_NBR_MV_LT(pi4_avail_flag, u1_cur_ref_lt, cur_poc, cur_ref_poc,                 \
+                      u1_nbr_pred_flag, u1_nbr_ref_lt, nbr_ref_poc,                          \
+                      ps_mv, ps_nbr_mv )                                                     \
+{                                                                                            \
+    WORD32 cur_lt, nbr_lt;                                                                   \
+    cur_lt = (LONG_TERM_REF == (u1_cur_ref_lt));                                             \
+    nbr_lt = (LONG_TERM_REF == (u1_nbr_ref_lt));                                             \
+    if((u1_nbr_pred_flag) && (cur_lt == nbr_lt))                                             \
+    {                                                                                        \
+        *pi4_avail_flag = 1;                                                                 \
+        *ps_mv = *ps_nbr_mv;                                                                 \
+        if(SHORT_TERM_REF == u1_nbr_ref_lt)                                                  \
+        {                                                                                    \
+            ihevcd_scale_mv(ps_mv, cur_ref_poc, nbr_ref_poc,                                 \
+                                                cur_poc);                                    \
+        }                                                                                    \
+        break;                                                                               \
+    }                                                                                        \
+}
+
+#else
+
+void CHECK_NBR_MV_ST(WORD32 *pi4_avail_flag, WORD32 cur_ref_poc, UWORD8 u1_nbr_pred_flag, WORD32 nbr_ref_poc,
+                     mv_t *ps_mv, mv_t *ps_nbr_mv )
+{
+    if((u1_nbr_pred_flag) && (cur_ref_poc == nbr_ref_poc))
+    {
+        *pi4_avail_flag = 1;
+        *ps_mv = *ps_nbr_mv;
+    }
+}
+void CHECK_NBR_MV_LT(WORD32 *pi4_avail_flag, UWORD8 u1_cur_ref_lt, WORD32 cur_poc, WORD32 cur_ref_poc,
+                     UWORD8 u1_nbr_pred_flag, UWORD8 u1_nbr_ref_lt, WORD32 nbr_ref_poc,
+                     mv_t *ps_mv, mv_t *ps_nbr_mv )
+{
+    WORD32 cur_lt, nbr_lt;
+    cur_lt = (LONG_TERM_REF == u1_cur_ref_lt);
+    nbr_lt = (LONG_TERM_REF == u1_nbr_ref_lt);
+
+    if((u1_nbr_pred_flag) && (cur_lt == nbr_lt))
+    {
+        *pi4_avail_flag = 1;
+        *ps_mv = *ps_nbr_mv;
+        if(SHORT_TERM_REF == u1_nbr_ref_lt)
+        {
+            ihevcd_scale_mv(ps_mv, cur_ref_poc, nbr_ref_poc,
+                            cur_poc);
+        }
+    }
+}
+#endif
+
+#if 1
+void GET_MV_NBR_ST(ref_list_t **ps_ref_pic_list,
+                   WORD32 *pi4_avail_flag,
+                   pic_buf_t *ps_cur_pic_buf_lx,
+                   pu_t **aps_nbr_pu,
+                   mv_t *ps_mv,
+                   WORD32 num_nbrs,
+                   WORD32 lx)
+{
+    WORD32 i, nbr_pred_lx;
+    pic_buf_t *ps_nbr_pic_buf_lx;
+    /* Short Term */
+    /* L0 */
+    if(0 == lx)
+    {
+        for(i = 0; i < num_nbrs; i++)
+        {
+            nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);
+            ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));
+            CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,
+                            ps_nbr_pic_buf_lx->i4_abs_poc, ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv);
+
+            nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);
+            ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));
+            CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,
+                            ps_nbr_pic_buf_lx->i4_abs_poc, ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv);
+        }
+    }
+    /* L1 */
+    else
+    {
+        for(i = 0; i < num_nbrs; i++)
+        {
+            nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);
+            ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));
+            CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,
+                            ps_nbr_pic_buf_lx->i4_abs_poc, ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv);
+
+            nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);
+            ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));
+            CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,
+                            ps_nbr_pic_buf_lx->i4_abs_poc, ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv);
+        }
+    }
+}
+
+void GET_MV_NBR_LT(ref_list_t **ps_ref_pic_list,
+                   slice_header_t *ps_slice_hdr,
+                   WORD32 *pi4_avail_flag,
+                   pic_buf_t *ps_cur_pic_buf_lx,
+                   pu_t **aps_nbr_pu,
+                   mv_t *ps_mv,
+                   WORD32 num_nbrs,
+                   WORD32 lx)
+{
+    WORD32 i, nbr_pred_lx;
+    pic_buf_t *ps_nbr_pic_buf_lx;
+    /* Long Term*/
+    /* L0 */
+    if(0 == lx)
+    {
+        for(i = 0; i < num_nbrs; i++)
+        {
+            nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);
+            ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));
+            CHECK_NBR_MV_LT(pi4_avail_flag, ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,
+                            nbr_pred_lx,
+                            ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,
+                            ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv);
+
+            nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);
+            ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));
+            CHECK_NBR_MV_LT(pi4_avail_flag, ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,
+                            nbr_pred_lx,
+                            ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,
+                            ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv);
+        }
+    }
+    /* L1 */
+    else
+    {
+        for(i = 0; i < num_nbrs; i++)
+        {
+            nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);
+            ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));
+            CHECK_NBR_MV_LT(pi4_avail_flag, ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,
+                            nbr_pred_lx,
+                            ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,
+                            ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv);
+
+            nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);
+            ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));
+            CHECK_NBR_MV_LT(pi4_avail_flag, ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,
+                            nbr_pred_lx,
+                            ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,
+                            ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv);
+        }
+    }
+}
+#else
+
+#define GET_MV_NBR_ST(ps_ref_pic_list, pi4_avail_flag, ps_cur_pic_buf_lx, aps_nbr_pu, ps_mv, num_nbrs, lx) \
+{                                                                                                                                                                                \
+    WORD32 i, nbr_pred_lx;                                                                                                                                                       \
+    pic_buf_t *ps_nbr_pic_buf_lx;                                                                                                                                                \
+    /* Short Term */                                                                                                                                                             \
+    /* L0 */                                                                                                                                                                     \
+    if(0 == lx)                                                                                                                                                                  \
+    {                                                                                                                                                                            \
+        for(i=0; i< num_nbrs; i++)                                                                                                                                               \
+        {                                                                                                                                                                        \
+            nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);                                                                                                              \
+            ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));                                                                  \
+            CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,                                                                                         \
+                            ps_nbr_pic_buf_lx->i4_abs_poc, ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv);                                                                                  \
+                                                                                                                                                                                 \
+            nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);                                                                                                              \
+            ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));                                                                  \
+            CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,                                                                                          \
+                            ps_nbr_pic_buf_lx->i4_abs_poc,ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv );                                                                                   \
+        }                                                                                                                                                                        \
+    }                                                                                                                                                                            \
+    /* L1 */                                                                                                                                                                     \
+    else                                                                                                                                                                         \
+    {                                                                                                                                                                            \
+        for(i=0; i< num_nbrs; i++)                                                                                                                                               \
+        {                                                                                                                                                                        \
+            nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);                                                                                                              \
+            ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));                                                                  \
+            CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,                                                                                          \
+                            ps_nbr_pic_buf_lx->i4_abs_poc,ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv );                                                                                   \
+                                                                                                                                                                                 \
+            nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);                                                                                                              \
+            ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));                                                                  \
+            CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,                                                                                          \
+                            ps_nbr_pic_buf_lx->i4_abs_poc,ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv );                                                                                   \
+        }                                                                                                                                                                        \
+    }                                                                                                                                                                            \
+}
+
+#define GET_MV_NBR_LT(ps_ref_pic_list, ps_slice_hdr, pi4_avail_flag, ps_cur_pic_buf_lx, aps_nbr_pu, ps_mv, num_nbrs, lx)                                              \
+{                                                                                                                                                                                \
+    WORD32 i, nbr_pred_lx;                                                                                                                                                       \
+    pic_buf_t *ps_nbr_pic_buf_lx;                                                                                                                                                \
+    /* Long Term*/                                                                                                                                                               \
+    /* L0 */                                                                                                                                                                     \
+    if(0 == lx)                                                                                                                                                                  \
+    {                                                                                                                                                                            \
+        for(i=0; i< num_nbrs; i++)                                                                                                                                               \
+        {                                                                                                                                                                        \
+            nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);                                                                                                              \
+            ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));                                                                  \
+            CHECK_NBR_MV_LT(pi4_avail_flag,ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,                                 \
+                            nbr_pred_lx,                                                                                                                                         \
+                            ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,                                                                                    \
+                            ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv);                                                                                                                  \
+                                                                                                                                                                                 \
+            nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);                                                                                                              \
+            ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));                                                                  \
+            CHECK_NBR_MV_LT(pi4_avail_flag,ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,                                 \
+                            nbr_pred_lx,                                                                                                                                         \
+                            ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,                                                                                    \
+                            ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv);                                                                                                                  \
+        }                                                                                                                                                                        \
+    }                                                                                                                                                                            \
+    /* L1 */                                                                                                                                                                     \
+    else                                                                                                                                                                         \
+    {                                                                                                                                                                            \
+        for(i=0; i< num_nbrs; i++)                                                                                                                                               \
+        {                                                                                                                                                                        \
+            nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);                                                                                                              \
+            ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));                                                                  \
+            CHECK_NBR_MV_LT(pi4_avail_flag,ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,                                 \
+                            nbr_pred_lx,                                                                                                                                         \
+                            ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,                                                                                    \
+                            ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv);                                                                                                                  \
+                                                                                                                                                                                 \
+            nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);                                                                                                              \
+            ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));                                                                  \
+            CHECK_NBR_MV_LT(pi4_avail_flag,ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,                                 \
+                            nbr_pred_lx,                                                                                                                                         \
+                            ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,                                                                                    \
+                            ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv);                                                                                                                  \
+        }                                                                                                                                                                        \
+    }                                                                                                                                                                            \
+}
+#endif
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs motion vector prediction and returns a list of MVs
+ *
+ * @par Description:
+ *  MV predictor list is computed using neighbor MVs and the collocated MV
+ *
+ * @param[in] ps_mv_ctxt
+ * pointer to mv predictor context
+ *
+ * @param[in] pu4_top_pu_idx
+ * pointer to indices of the top 4x4 nbr PUs
+ *
+ * @param[in] pu4_left_pu_idx
+ * pointer to indices of the left 4x4 nbr PUs
+ *
+ * @param[in] pu4_top_left_pu_idx
+ * pointer to the index of the top left 4x4 nbr PU
+ *
+ * @param[in] left_nbr_4x4_strd
+ * left nbr buffer stride in terms of 4x4 units
+ *
+ * @param[in] ps_pu
+ * Current Partition PU structure pointer
+ *
+ * @param[in] lb_avail, l_avail, tr_avail, t_avail, tl_avail
+ * Left-bottom, left, top-right, top and top-left neighbor availability flags
+ *
+ * @param[inout] ps_pred_mv
+ * pointer to store predicted MV list
+ *
+ * @returns
+ * None
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+void ihevcd_mv_pred(mv_ctxt_t *ps_mv_ctxt,
+                    UWORD32 *pu4_top_pu_idx,
+                    UWORD32 *pu4_left_pu_idx,
+                    UWORD32 *pu4_top_left_pu_idx,
+                    WORD32 left_nbr_4x4_strd,
+                    pu_t *ps_pu,
+                    WORD32 lb_avail,
+                    WORD32 l_avail,
+                    WORD32 tr_avail,
+                    WORD32 t_avail,
+                    WORD32 tl_avail,
+                    pu_mv_t *ps_pred_mv)
+{
+    slice_header_t *ps_slice_hdr;
+    ref_list_t *ps_ref_pic_list[2];
+    pu_t *ps_pic_pu;
+    WORD32 max_l0_mvp_cand, max_l1_mvp_cand;
+    WORD32 l0_done_flag, l1_done_flag;
+    WORD32 num_l0_mvp_cand, num_l1_mvp_cand;
+    WORD32 is_scaled_flag_list; /* Indicates whether A0 or A1 is available */
+    WORD32 avail_a_flag[2];
+    mv_t as_mv_a[2];
+    WORD32 part_pos_x;
+    WORD32 part_pos_y;
+    WORD32 part_wd;
+    WORD32 part_ht;
+    pic_buf_t *ps_cur_pic_buf_l0, *ps_cur_pic_buf_l1;
+    WORD32 nbr_avail[3]; /*[A0/A1] */ /* [B0/B1/B2] */
+    pu_t *aps_nbr_pu[3];  /*[A0/A1] */ /* [B0/B1/B2] */
+    WORD32 num_nbrs = 0;
+
+    /*******************************************/
+    /* Neighbor location: Graphical indication */
+    /*                                         */
+    /*          B2 _____________B1 B0          */
+    /*            |               |            */
+    /*            |               |            */
+    /*            |               |            */
+    /*            |      PU     ht|            */
+    /*            |               |            */
+    /*            |               |            */
+    /*          A1|______wd_______|            */
+    /*          A0                             */
+    /*                                         */
+    /*******************************************/
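+
+    /* Added commentary: the candidate order below follows the HEVC AMVP
+     * derivation: spatial candidate A (from A0/A1), spatial candidate B
+     * (from B0/B1/B2), the temporal collocated candidate, and finally
+     * zero MVs when fewer candidates than required are found
+     */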
+
+    ps_slice_hdr = ps_mv_ctxt->ps_slice_hdr;
+    ps_pic_pu = ps_mv_ctxt->ps_pic_pu;
+    max_l0_mvp_cand = ps_pu->b1_l0_mvp_idx + 1;
+    max_l1_mvp_cand = ps_pu->b1_l1_mvp_idx + 1;
+    num_l0_mvp_cand = 0;
+    num_l1_mvp_cand = 0;
+
+    /* Initializing reference list */
+    ps_ref_pic_list[0] = ps_slice_hdr->as_ref_pic_list0;
+    ps_ref_pic_list[1] = ps_slice_hdr->as_ref_pic_list1;
+    if(PSLICE == ps_slice_hdr->i1_slice_type)
+        ps_ref_pic_list[1] = ps_slice_hdr->as_ref_pic_list0;
+
+    ps_cur_pic_buf_l0 = (pic_buf_t *)((ps_ref_pic_list[0][ps_pu->mv.i1_l0_ref_idx].pv_pic_buf));
+    ps_cur_pic_buf_l1 = (pic_buf_t *)((ps_ref_pic_list[1][ps_pu->mv.i1_l1_ref_idx].pv_pic_buf));
+
+    is_scaled_flag_list = 0;
+
+    part_pos_x = ps_pu->b4_pos_x << 2;
+    part_pos_y = ps_pu->b4_pos_y << 2;
+    part_wd = (ps_pu->b4_wd + 1) << 2;
+    part_ht = (ps_pu->b4_ht + 1) << 2;
+
+    /************************************************************/
+    /* Calculation of motion vector A from neighbors A0 and A1  */
+    /************************************************************/
+    {
+        nbr_avail[0] = 0;
+        nbr_avail[1] = 0;
+
+        /* Pointers to A0 and A1 */
+        {
+            WORD32 y_a0, y_a1;
+            WORD32 pu_idx_a0, pu_idx_a1;
+
+            /* TODO: y_a0, y_a1 are coded assuming the left nbr pointer starts at the PU */
+            y_a0 = (part_ht >> 2);
+            y_a1 = ((part_ht - 1) >> 2);
+
+            pu_idx_a0 = *(pu4_left_pu_idx + (y_a0 * left_nbr_4x4_strd));
+            pu_idx_a1 = *(pu4_left_pu_idx + (y_a1 * left_nbr_4x4_strd));
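+
+            /* Added commentary, e.g. for a 16-pixel tall PU: y_a0 = 4 is
+             * the 4x4 row just below the PU (the A0 position) and
+             * y_a1 = 3 is the bottom-most 4x4 row of the PU (A1)
+             */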
+
+            if(lb_avail && (!ps_pic_pu[pu_idx_a0].b1_intra_flag))
+            {
+                aps_nbr_pu[num_nbrs] = &ps_pic_pu[pu_idx_a0];
+                num_nbrs++;
+                nbr_avail[0] = 1;
+            }
+            if(l_avail && (!ps_pic_pu[pu_idx_a1].b1_intra_flag))
+            {
+                aps_nbr_pu[num_nbrs] = &ps_pic_pu[pu_idx_a1];
+                num_nbrs++;
+                nbr_avail[1] = 1;
+            }
+        }
+        /* Setting is scaled flag based on availability of A0 and A1 */
+        if(nbr_avail[0] || nbr_avail[1])
+        {
+            is_scaled_flag_list = 1;
+        }
+
+        avail_a_flag[0] = 0;
+        avail_a_flag[1] = 0;
+
+        /* L0 */
+        GET_MV_NBR_ST(ps_ref_pic_list, &avail_a_flag[0], ps_cur_pic_buf_l0, aps_nbr_pu, &as_mv_a[0], num_nbrs, 0);
+        if(0 == avail_a_flag[0])
+        {
+            GET_MV_NBR_LT(ps_ref_pic_list, ps_slice_hdr, &avail_a_flag[0], ps_cur_pic_buf_l0, aps_nbr_pu, &as_mv_a[0], num_nbrs, 0);
+        }
+
+        /* L1 */
+        if(PRED_L0 != ps_pu->b2_pred_mode)
+        {
+            GET_MV_NBR_ST(ps_ref_pic_list, &avail_a_flag[1], ps_cur_pic_buf_l1, aps_nbr_pu, &as_mv_a[1], num_nbrs, 1);
+            if(0 == avail_a_flag[1])
+            {
+                GET_MV_NBR_LT(ps_ref_pic_list, ps_slice_hdr, &avail_a_flag[1], ps_cur_pic_buf_l1, aps_nbr_pu, &as_mv_a[1], num_nbrs, 1);
+            }
+        }
+
+        l0_done_flag = (PRED_L1 == ps_pu->b2_pred_mode);
+        l1_done_flag = (PRED_L0 == ps_pu->b2_pred_mode);
+
+        if(avail_a_flag[0])
+        {
+            num_l0_mvp_cand++;
+            if(max_l0_mvp_cand == num_l0_mvp_cand)
+            {
+                ps_pred_mv->s_l0_mv = as_mv_a[0];
+                l0_done_flag = 1;
+            }
+        }
+        if(avail_a_flag[1])
+        {
+            num_l1_mvp_cand++;
+            if(max_l1_mvp_cand == num_l1_mvp_cand)
+            {
+                ps_pred_mv->s_l1_mv = as_mv_a[1];
+                l1_done_flag = 1;
+            }
+        }
+        if(l0_done_flag && l1_done_flag)
+            return;
+    }
+
+    /************************************************************/
+    /* Calculation of motion vector B from neighbors B0, B1 and B2 */
+    /************************************************************/
+    {
+        WORD32 avail_b_flag[2];
+        mv_t as_mv_b[2];
+
+        /* Pointers to B0, B1 and B2 */
+        {
+            WORD32 x_b0, x_b1, x_b2;
+            WORD32 pu_idx_b0, pu_idx_b1, pu_idx_b2;
+
+            /* Relative co-ordinate of Xp,Yp w.r.t CTB start will work */
+            /* as long as minCTB = 16                                  */
+            x_b0 = (part_pos_x + part_wd);
+            x_b1 = (part_pos_x + part_wd - 1);
+            x_b2 = (part_pos_x - 1);
+            /* Getting offset back to given pointer */
+            x_b0 = x_b0 - part_pos_x;
+            x_b1 = x_b1 - part_pos_x;
+            x_b2 = x_b2 - part_pos_x;
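+            /* Added commentary: after subtracting part_pos_x the offsets
+             * are relative to the top pointer: x_b0 = part_wd (right of
+             * the top-right corner), x_b1 = part_wd - 1 (above the
+             * top-right pixel) and x_b2 = -1 (above-left of the PU)
+             */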
+
+            /* Below derivations are based on the top pointer */
+            /* pointing to the first pixel of the PU          */
+            pu_idx_b0 = *(pu4_top_pu_idx + (x_b0 >> 2));
+            pu_idx_b0 = pu_idx_b0 * tr_avail;
+            pu_idx_b1 = *(pu4_top_pu_idx + (x_b1 >> 2));
+            pu_idx_b1 = pu_idx_b1 * t_avail;
+            /* PU does not start at the top row of the CTB: */
+            /* use the top-left index passed in             */
+            if(part_pos_y)
+            {
+                pu_idx_b2 = *pu4_top_left_pu_idx;
+            }
+            else
+            {
+                /* PU starts at the top row of the CTB: use */
+                /* the top row and step back one 4x4 unit   */
+                /* to reach the top-left                    */
+                pu_idx_b2 = *((pu4_top_pu_idx)+(x_b2 >> 2));
+            }
+            pu_idx_b2 = pu_idx_b2 * tl_avail;
+
+            num_nbrs = 0;
+            nbr_avail[0] = 0;
+            nbr_avail[1] = 0;
+            nbr_avail[2] = 0;
+
+            if(tr_avail && (!ps_pic_pu[pu_idx_b0].b1_intra_flag))
+            {
+                aps_nbr_pu[num_nbrs] = &ps_pic_pu[pu_idx_b0];
+                num_nbrs++;
+                nbr_avail[0] = 1;
+            }
+            if(t_avail && (!ps_pic_pu[pu_idx_b1].b1_intra_flag))
+            {
+                aps_nbr_pu[num_nbrs] = &ps_pic_pu[pu_idx_b1];
+                num_nbrs++;
+                nbr_avail[1] = 1;
+            }
+            if(tl_avail && (!ps_pic_pu[pu_idx_b2].b1_intra_flag))
+            {
+                aps_nbr_pu[num_nbrs] = &ps_pic_pu[pu_idx_b2];
+                num_nbrs++;
+                nbr_avail[2] = 1;
+            }
+        }
+
+        /* L0 */
+        avail_b_flag[0] = 0;
+        avail_b_flag[1] = 0;
+
+        GET_MV_NBR_ST(ps_ref_pic_list, &avail_b_flag[0], ps_cur_pic_buf_l0, aps_nbr_pu, &as_mv_b[0], num_nbrs, 0);
+
+        /* L1 */
+        if(PRED_L0 != ps_pu->b2_pred_mode)
+        {
+            /* B0 Short Term */
+            GET_MV_NBR_ST(ps_ref_pic_list, &avail_b_flag[1], ps_cur_pic_buf_l1, aps_nbr_pu, &as_mv_b[1], num_nbrs, 1);
+        }
+
+        if(avail_b_flag[0])
+        {
+            if(((0 == num_l0_mvp_cand)
+                            || (as_mv_a[0].i2_mvx != as_mv_b[0].i2_mvx)
+                            || (as_mv_a[0].i2_mvy != as_mv_b[0].i2_mvy)))
+            {
+                num_l0_mvp_cand++;
+                if(max_l0_mvp_cand == num_l0_mvp_cand)
+                {
+                    ps_pred_mv->s_l0_mv = as_mv_b[0];
+                    l0_done_flag = 1;
+                }
+            }
+        }
+        if(avail_b_flag[1])
+        {
+            if(((0 == num_l1_mvp_cand)
+                            || (as_mv_a[1].i2_mvx != as_mv_b[1].i2_mvx)
+                            || (as_mv_a[1].i2_mvy != as_mv_b[1].i2_mvy)))
+            {
+                num_l1_mvp_cand++;
+                if(max_l1_mvp_cand == num_l1_mvp_cand)
+                {
+                    ps_pred_mv->s_l1_mv = as_mv_b[1];
+                    l1_done_flag = 1;
+                }
+            }
+        }
+        if(l0_done_flag && l1_done_flag)
+            return;
+
+        if((is_scaled_flag_list == 0) && (avail_b_flag[0] == 1))
+        {
+            avail_a_flag[0] = 1;
+            as_mv_a[0] = as_mv_b[0];
+        }
+        if((is_scaled_flag_list == 0) && (avail_b_flag[1] == 1))
+        {
+            avail_a_flag[1] = 1;
+            as_mv_a[1] = as_mv_b[1];
+        }
+
+        if(0 == is_scaled_flag_list)
+        {
+            avail_b_flag[0] = avail_b_flag[1] = 0;
+
+            GET_MV_NBR_LT(ps_ref_pic_list, ps_slice_hdr, &avail_b_flag[0], ps_cur_pic_buf_l0, aps_nbr_pu, &as_mv_b[0], num_nbrs, 0);
+
+            if(PRED_L0 != ps_pu->b2_pred_mode)
+            {
+                GET_MV_NBR_LT(ps_ref_pic_list, ps_slice_hdr, &avail_b_flag[1], ps_cur_pic_buf_l1, aps_nbr_pu, &as_mv_b[1], num_nbrs, 1);
+            }
+
+            if(avail_b_flag[0])
+            {
+                if(((0 == num_l0_mvp_cand)
+                                || (as_mv_a[0].i2_mvx != as_mv_b[0].i2_mvx)
+                                || (as_mv_a[0].i2_mvy != as_mv_b[0].i2_mvy)))
+                {
+                    num_l0_mvp_cand++;
+                    if(max_l0_mvp_cand == num_l0_mvp_cand)
+                    {
+                        ps_pred_mv->s_l0_mv = as_mv_b[0];
+                        l0_done_flag = 1;
+                    }
+                }
+            }
+            if(avail_b_flag[1])
+            {
+                if(((0 == num_l1_mvp_cand)
+                                || (as_mv_a[1].i2_mvx != as_mv_b[1].i2_mvx)
+                                || (as_mv_a[1].i2_mvy != as_mv_b[1].i2_mvy)))
+                {
+                    num_l1_mvp_cand++;
+                    if(max_l1_mvp_cand == num_l1_mvp_cand)
+                    {
+                        ps_pred_mv->s_l1_mv = as_mv_b[1];
+                        l1_done_flag = 1;
+                    }
+                }
+            }
+            if(l0_done_flag && l1_done_flag)
+                return;
+        }
+        /***********************************************************/
+        /*          Collocated MV prediction                       */
+        /***********************************************************/
+#if 1
+        if((2 != num_l0_mvp_cand) || (2 != num_l1_mvp_cand))
+        {
+            mv_t as_mv_col[2], s_mv_col_l0, s_mv_col_l1;
+            WORD32 avail_col_flag[2] = { 0 };
+            WORD32 x_col, y_col, avail_col_l0, avail_col_l1;
+//            ihevcd_collocated_mvp((mv_ctxt_t *)ps_mv_ctxt,ps_pu,part_pos_x,part_pos_y,part_wd,part_ht,as_mv_col,avail_col_flag,1);
+            x_col = part_pos_x + part_wd;
+            y_col = part_pos_y + part_ht;
+            ihevcd_collocated_mvp(ps_mv_ctxt, ps_pu, as_mv_col, avail_col_flag, 1, x_col, y_col);
+
+            avail_col_l0 = avail_col_flag[0];
+            avail_col_l1 = avail_col_flag[1];
+            if(avail_col_l0 || avail_col_l1)
+            {
+                s_mv_col_l0 = as_mv_col[0];
+                s_mv_col_l1 = as_mv_col[1];
+            }
+
+            if(avail_col_l0 == 0 || avail_col_l1 == 0)
+            {
+                /* Checking Collocated MV availability at Center of PU */
+                x_col = part_pos_x + (part_wd >> 1);
+                y_col = part_pos_y + (part_ht >> 1);
+                ihevcd_collocated_mvp(ps_mv_ctxt, ps_pu, as_mv_col, avail_col_flag, 1, x_col, y_col);
+
+                if(avail_col_l0 == 0)
+                {
+                    s_mv_col_l0 = as_mv_col[0];
+                }
+                if(avail_col_l1 == 0)
+                {
+                    s_mv_col_l1 = as_mv_col[1];
+                }
+
+                avail_col_l0 |= avail_col_flag[0];
+                avail_col_l1 |= avail_col_flag[1];
+            }
+
+            /* Checking if mvp index matches collocated mv */
+            if(avail_col_l0)
+            {
+                if(2 != num_l0_mvp_cand)
+                {
+                    num_l0_mvp_cand++;
+                    if(max_l0_mvp_cand == num_l0_mvp_cand)
+                    {
+                        ps_pred_mv->s_l0_mv = s_mv_col_l0;
+                        l0_done_flag = 1;
+                    }
+                }
+            }
+            if(avail_col_l1)
+            {
+                if(2 != num_l1_mvp_cand)
+                {
+                    num_l1_mvp_cand++;
+                    if(max_l1_mvp_cand == num_l1_mvp_cand)
+                    {
+                        ps_pred_mv->s_l1_mv = s_mv_col_l1;
+                        l1_done_flag = 1;
+                    }
+                }
+            }
+            if(l0_done_flag && l1_done_flag)
+                return;
+        }
+#endif
+
+        if(0 == l0_done_flag)
+        {
+            ps_pred_mv->s_l0_mv.i2_mvx = 0;
+            ps_pred_mv->s_l0_mv.i2_mvy = 0;
+        }
+        if(0 == l1_done_flag)
+        {
+            ps_pred_mv->s_l1_mv.i2_mvx = 0;
+            ps_pred_mv->s_l1_mv.i2_mvy = 0;
+        }
+    }
+}
diff --git a/decoder/ihevcd_mv_pred.h b/decoder/ihevcd_mv_pred.h
new file mode 100644
index 0000000..b349e58
--- /dev/null
+++ b/decoder/ihevcd_mv_pred.h
@@ -0,0 +1,58 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_mv_pred.h
+ *
+ * @brief
+ *  Header for motion vector prediction functions
+ *
+ * @author
+ *  Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#ifndef IHEVCD_MV_PRED_H_
+#define IHEVCD_MV_PRED_H_
+void ihevcd_mv_pred(mv_ctxt_t *ps_mv_ctxt,
+                    UWORD32 *pu4_top_pu_idx,
+                    UWORD32 *pu4_left_pu_idx,
+                    UWORD32 *pu4_top_left_pu_idx,
+                    WORD32 left_nbr_4x4_strd,
+                    pu_t *ps_pu,
+                    WORD32 lb_avail,
+                    WORD32 l_avail,
+                    WORD32 tr_avail,
+                    WORD32 t_avail,
+                    WORD32 tl_avail,
+                    pu_mv_t *ps_pred_mv);
+void ihevcd_scale_mv(mv_t *ps_mv,
+                     WORD32 cur_ref_poc,
+                     WORD32 nbr_ref_poc,
+                     WORD32 cur_poc);
+
+
+#endif /* IHEVCD_MV_PRED_H_ */
diff --git a/decoder/ihevcd_nal.c b/decoder/ihevcd_nal.c
new file mode 100644
index 0000000..cf2208f
--- /dev/null
+++ b/decoder/ihevcd_nal.c
@@ -0,0 +1,458 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_nal.c
+*
+* @brief
+*  Contains NAL level functions such as start code search
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_parse_headers.h"
+#include "ihevcd_parse_slice.h"
+#include "ihevcd_debug.h"
+/*****************************************************************************/
+/* Function Prototypes                                                       */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Search start code from the given buffer pointer
+*
+* @par Description:
+*  Search for a start code and return its offset if one is found. If no
+* start code is found till the end of the given bitstream, treat it as an
+* invalid NAL and return the end of the buffer as the offset
+*
+* @param[in] pu1_buf
+*  Pointer to bitstream
+*
+* @param[in] bytes_remaining
+*  Number of bytes remaining in the buffer
+*
+* @returns  Offset to the first byte in NAL after start code
+*
+* @remarks
+*  Incomplete start code at the end of the input bitstream is not handled.
+* This has to be taken care of outside this function
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_nal_search_start_code(UWORD8 *pu1_buf, WORD32 bytes_remaining)
+{
+    WORD32 ofst;
+
+    WORD32 zero_byte_cnt;
+    WORD32 start_code_found;
+
+    ofst = -1;
+
+    zero_byte_cnt = 0;
+    start_code_found = 0;
+    while(ofst < (bytes_remaining - 1))
+    {
+        ofst++;
+        if(pu1_buf[ofst] != 0)
+        {
+            zero_byte_cnt = 0;
+            continue;
+        }
+
+        zero_byte_cnt++;
+        if((pu1_buf[ofst + 1] == START_CODE_PREFIX_BYTE) &&
+           (zero_byte_cnt >= NUM_ZEROS_BEFORE_START_CODE))
+        {
+            /* Found the start code */
+            ofst++;
+            start_code_found = 1;
+            break;
+        }
+    }
+    if(0 == start_code_found)
+    {
+        if((START_CODE_PREFIX_BYTE == pu1_buf[ofst]) &&
+           (zero_byte_cnt >= NUM_ZEROS_BEFORE_START_CODE))
+        {
+            /* Found a start code at the end*/
+            ofst++;
+        }
+    }
+    /* Increment ofst so that it gives the offset of the first byte after */
+    /* the start code (or the end of the buffer if no start code found)   */
+    ofst++;
+
+    return ofst;
+}
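+
+/* Worked example (added commentary): for the buffer {0x00, 0x00, 0x01, 0x40}
+ * with bytes_remaining = 4, the function returns 3, the offset of the first
+ * byte after the 00 00 01 start code; when no start code is present the
+ * return value is bytes_remaining
+ */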
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Removes the emulation prevention bytes present in the bitstream till the
+* next start code is found. The data with emulation prevention bytes removed
+* is stored in a different buffer
+*
+* @par Description:
+*  Assumes the first start code has already been found and pu1_src points to
+* a byte after the start code. Searches for the next NAL's start code and
+* returns once it is found, removing any emulation prevention bytes present
+* while copying the data to the new buffer. If no start code is found, the
+* complete buffer is treated as one NAL.
+*
+* @param[in] pu1_src
+*  Pointer to bitstream (excludes the initial start code)
+*
+* @param[in] pu1_dst
+*  Pointer to destination buffer
+*
+* @param[in] bytes_remaining
+*  Number of bytes remaining
+*
+* @param[out] pi4_nal_len
+*  NAL length (length of bitstream parsed)
+*
+* @param[out] pi4_dst_len
+*  Destination bitstream size (length of bitstream parsed with emulation bytes
+* removed)
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*  Incomplete start code at the end of the input bitstream is not handled.
+* This has to be taken care of outside this function
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_nal_remv_emuln_bytes(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 bytes_remaining,
+                                           WORD32 *pi4_nal_len,
+                                           WORD32 *pi4_dst_len)
+{
+    WORD32 src_cnt;
+    WORD32 dst_cnt;
+    WORD32 zero_byte_cnt;
+    WORD32 start_code_found;
+    UWORD8 u1_src;
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+    src_cnt = 0;
+    dst_cnt = 0;
+    zero_byte_cnt = 0;
+    start_code_found = 0;
+    while(src_cnt < (bytes_remaining - 1))
+    {
+        u1_src = pu1_src[src_cnt++];
+
+        pu1_dst[dst_cnt++] = u1_src;
+        if(u1_src != 0)
+        {
+            zero_byte_cnt = 0;
+            continue;
+        }
+
+        zero_byte_cnt++;
+        if(zero_byte_cnt >= NUM_ZEROS_BEFORE_START_CODE)
+        {
+            u1_src = pu1_src[src_cnt];
+            if(START_CODE_PREFIX_BYTE == u1_src)
+            {
+                /* Found the start code */
+                src_cnt -= zero_byte_cnt;
+                dst_cnt -= zero_byte_cnt;
+                start_code_found = 1;
+                break;
+            }
+            else if(EMULATION_PREVENT_BYTE == u1_src)
+            {
+                /* Found the emulation prevention byte */
+                src_cnt++;
+                zero_byte_cnt = 0;
+
+                /* The emulation prevention byte is only looked ahead
+                 * and never copied to dst, so dst_cnt needs no
+                 * adjustment here
+                 */
+            }
+        }
+
+    }
+
+    if(0 == start_code_found)
+    {
+        u1_src = pu1_src[src_cnt++];
+        if(zero_byte_cnt >= NUM_ZEROS_BEFORE_START_CODE)
+        {
+
+            if(START_CODE_PREFIX_BYTE == u1_src)
+            {
+                /* Found a start code at the end*/
+                src_cnt -= zero_byte_cnt;
+            }
+            else if(EMULATION_PREVENT_BYTE == u1_src)
+            {
+                /* Found the emulation prevention byte at the end*/
+                src_cnt++;
+                /* The emulation prevention byte itself was never copied
+                 * to dst; decrement dst_cnt to drop the last zero byte
+                 * copied above
+                 */
+                dst_cnt--;
+            }
+        }
+        else
+        {
+            pu1_dst[dst_cnt++] = u1_src;
+        }
+
+
+    }
+    *pi4_nal_len = src_cnt;
+    *pi4_dst_len = dst_cnt;
+    return ret;
+}
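+
+/* Worked example (added commentary): for the input
+ * {0x00, 0x00, 0x03, 0x01, 0x25} with bytes_remaining = 5 and no trailing
+ * start code, the 0x03 that follows two zero bytes is skipped, so the
+ * output is {0x00, 0x00, 0x01, 0x25}: *pi4_dst_len = 4, *pi4_nal_len = 5
+ */
+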
+/**
+*******************************************************************************
+*
+* @brief
+*  Decode given NAL unit's header
+*
+* @par Description:
+*  Parses the NAL unit header. Section: 7.3.1.2
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream context
+*
+* @param[out] ps_nal
+*  Pointer to NAL header
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_nal_unit_header(bitstrm_t *ps_bitstrm, nal_header_t *ps_nal)
+{
+    WORD32 unused;
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    UNUSED(unused);
+    /* Syntax : forbidden_zero_bit */
+    unused = ihevcd_bits_get(ps_bitstrm, 1);
+
+    /* Syntax : nal_unit_type */
+    ps_nal->i1_nal_unit_type = ihevcd_bits_get(ps_bitstrm, 6);
+
+    /* Syntax : nuh_reserved_zero_6bits */
+    unused = ihevcd_bits_get(ps_bitstrm, 6);
+
+    /* Syntax : nuh_temporal_id_plus1 */
+    ps_nal->i1_nuh_temporal_id = ihevcd_bits_get(ps_bitstrm, 3) - 1;
+
+    return ret;
+
+}
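+
+/* Worked example (added commentary): header bytes 0x40 0x01 parse as
+ * forbidden_zero_bit = 0, nal_unit_type = 32 (NAL_VPS),
+ * nuh_reserved_zero_6bits = 0 and nuh_temporal_id_plus1 = 1, giving
+ * i1_nuh_temporal_id = 0
+ */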
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Decode given NAL
+*
+* @par Description:
+*  Based on the NAL type, calls the appropriate decode function. Section: 7.3.1.1
+*
+*
+* @param[in,out] ps_codec
+*  Pointer to codec context (Functions called within will modify contents of
+* ps_codec)
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_nal_unit(codec_t *ps_codec)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+    /* NAL Header */
+    nal_header_t s_nal;
+
+    ret = ihevcd_nal_unit_header(&ps_codec->s_parse.s_bitstrm, &s_nal);
+    RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+    if(ps_codec->i4_slice_error)
+        s_nal.i1_nal_unit_type = ps_codec->s_parse.ps_slice_hdr->i1_nal_unit_type;
+
+    /* Setting RASL Output flag */
+    switch(s_nal.i1_nal_unit_type)
+    {
+        case NAL_BLA_W_LP    :
+        case NAL_BLA_W_DLP   :
+        case NAL_BLA_N_LP    :
+            ps_codec->i4_rasl_output_flag = 0;
+            break;
+
+        // TODO: After an IDR, there is no case of an open GOP.
+        // To be fixed appropriately by ignoring RASL pictures only
+        // if the required references are not found
+        case NAL_IDR_W_LP    :
+        case NAL_IDR_N_LP    :
+            ps_codec->i4_rasl_output_flag = 1;
+            break;
+
+        case NAL_CRA         :
+            ps_codec->i4_rasl_output_flag = (0 == ps_codec->u4_pic_cnt) ? 0 : 1;
+            break;
+
+        default:
+            break;
+    }
+
+    switch(s_nal.i1_nal_unit_type)
+    {
+        case NAL_BLA_W_LP    :
+        case NAL_BLA_W_DLP   :
+        case NAL_BLA_N_LP    :
+        case NAL_IDR_W_LP    :
+        case NAL_IDR_N_LP    :
+        case NAL_CRA         :
+        case NAL_TRAIL_N     :
+        case NAL_TRAIL_R     :
+        case NAL_TSA_N       :
+        case NAL_TSA_R       :
+        case NAL_STSA_N      :
+        case NAL_STSA_R      :
+        case NAL_RADL_N      :
+        case NAL_RADL_R      :
+        case NAL_RASL_N      :
+        case NAL_RASL_R      :
+            if(ps_codec->i4_header_mode)
+                return IHEVCD_SLICE_IN_HEADER_MODE;
+
+            if((0 == ps_codec->i4_sps_done) ||
+                            (0 == ps_codec->i4_pps_done))
+            {
+                return IHEVCD_INVALID_HEADER;
+            }
+
+            ps_codec->i4_header_in_slice_mode = 0;
+
+            ret = ihevcd_parse_slice_header(ps_codec, &s_nal);
+            DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
+            if(ret == (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+            {
+                if((s_nal.i1_nal_unit_type != NAL_RASL_N && s_nal.i1_nal_unit_type != NAL_RASL_R) ||
+                                ps_codec->i4_rasl_output_flag ||
+                                ps_codec->i4_slice_error)
+                    ret = ihevcd_parse_slice_data(ps_codec);
+            }
+            break;
+
+        case NAL_VPS        :
+            // ret = ihevcd_parse_vps(ps_codec);
+            DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
+            break;
+
+        case NAL_SPS        :
+            if(0 == ps_codec->i4_header_mode)
+            {
+                ps_codec->i4_header_in_slice_mode = 1;
+                if(ps_codec->i4_sps_done &&
+                                ps_codec->i4_pic_present)
+                    break;
+            }
+
+            ret = ihevcd_parse_sps(ps_codec);
+            if(ret == (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+            {
+                sps_t *ps_sps = ps_codec->ps_sps_base + MAX_SPS_CNT - 1;
+                ihevcd_copy_sps(ps_codec, ps_sps->i1_sps_id, MAX_SPS_CNT - 1);
+            }
+
+            DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
+            break;
+
+        case NAL_PPS        :
+            if(0 == ps_codec->i4_header_mode)
+            {
+                ps_codec->i4_header_in_slice_mode = 1;
+                if(ps_codec->i4_pps_done &&
+                                ps_codec->i4_pic_present)
+                    break;
+            }
+
+            ret = ihevcd_parse_pps(ps_codec);
+            if(ret == (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+            {
+                pps_t *ps_pps = ps_codec->ps_pps_base + MAX_PPS_CNT - 1;
+                ihevcd_copy_pps(ps_codec, ps_pps->i1_pps_id, MAX_PPS_CNT - 1);
+            }
+
+            DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
+            break;
+
+        default:
+            DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
+            break;
+    }
+
+    return ret;
+}
+
diff --git a/decoder/ihevcd_nal.h b/decoder/ihevcd_nal.h
new file mode 100644
index 0000000..b7d09d0
--- /dev/null
+++ b/decoder/ihevcd_nal.h
@@ -0,0 +1,69 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_nal.h
+*
+* @brief
+*  Header for NAL related functions
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_NAL_H_
+#define _IHEVCD_NAL_H_
+/**
+ * Minimum size of start code including NAL type
+ */
+#define MIN_START_CODE_LEN          4
+/**
+ * Start code prefix byte - 1
+ */
+#define START_CODE_PREFIX_BYTE      1
+
+/**
+ * Emulation prevention byte - 3
+ */
+#define EMULATION_PREVENT_BYTE      3
+/**
+ * Minimum number of zeros before start code
+ */
+#define NUM_ZEROS_BEFORE_START_CODE 2
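+
+/* Disabled sketch, for reference only: a NAL unit begins after at least
+ * NUM_ZEROS_BEFORE_START_CODE zero bytes followed by a byte equal to
+ * START_CODE_PREFIX_BYTE, i.e. the pattern 00 00 01. A minimal scan along
+ * the lines of ihevcd_nal_search_start_code (the name and return convention
+ * here are illustrative assumptions, not the actual implementation): */
+#if 0
+static WORD32 example_find_start_code(const UWORD8 *pu1_buf, WORD32 bytes_remaining)
+{
+    WORD32 i;
+    WORD32 zeros = 0;
+    for(i = 0; i < bytes_remaining; i++)
+    {
+        if(0 == pu1_buf[i])
+            zeros++;
+        else
+        {
+            if((zeros >= NUM_ZEROS_BEFORE_START_CODE) &&
+               (START_CODE_PREFIX_BYTE == pu1_buf[i]))
+                return i + 1; /* offset of the first NAL header byte */
+            zeros = 0;
+        }
+    }
+    return -1; /* no start code found */
+}
+#endif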
+
+
+WORD32 ihevcd_nal_search_start_code(UWORD8 *pu1_buf, WORD32 bytes_remaining);
+
+IHEVCD_ERROR_T ihevcd_nal_remv_emuln_bytes(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 bytes_remaining,
+                                           WORD32 *pi4_nal_len,
+                                           WORD32 *pi4_dst_len);
+
+IHEVCD_ERROR_T ihevcd_nal_unit(codec_t *ps_codec);
+#endif /* _IHEVCD_NAL_H_ */
diff --git a/decoder/ihevcd_parse_headers.c b/decoder/ihevcd_parse_headers.c
new file mode 100644
index 0000000..76240f9
--- /dev/null
+++ b/decoder/ihevcd_parse_headers.c
@@ -0,0 +1,2267 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_parse_headers.c
+*
+* @brief
+*  Contains functions for parsing headers
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_quant_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_parse_headers.h"
+#include "ihevcd_ref_list.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+
+#define COPY_DEFAULT_SCALING_LIST(pi2_scaling_mat)                                                                                      \
+{                                                                                                                                       \
+    WORD32 scaling_mat_offset[]={0, 16, 32, 48, 64, 80, 96, 160, 224, 288, 352, 416, 480, 736, 992, 1248, 1504, 1760, 2016, 3040};      \
+                                                                                                                                        \
+    /* scaling matrix for 4x4 */                                                                                                        \
+    memcpy(pi2_scaling_mat, gi2_flat_scale_mat_32x32, 6*16*sizeof(WORD16));                                                             \
+    /* scaling matrix for 8x8 */                                                                                                            \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[6], gi2_intra_default_scale_mat_8x8, 64*sizeof(WORD16));                                \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[7], gi2_intra_default_scale_mat_8x8, 64*sizeof(WORD16));                                \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[8], gi2_intra_default_scale_mat_8x8, 64*sizeof(WORD16));                                \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[9], gi2_inter_default_scale_mat_8x8, 64*sizeof(WORD16));                                \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[10], gi2_inter_default_scale_mat_8x8, 64*sizeof(WORD16));                               \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[11], gi2_inter_default_scale_mat_8x8, 64*sizeof(WORD16));                               \
+    /* scaling matrix for 16x16 */                                                                                                      \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[12], gi2_intra_default_scale_mat_16x16, 256*sizeof(WORD16));                            \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[13], gi2_intra_default_scale_mat_16x16, 256*sizeof(WORD16));                            \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[14], gi2_intra_default_scale_mat_16x16, 256*sizeof(WORD16));                            \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[15], gi2_inter_default_scale_mat_16x16, 256*sizeof(WORD16));                            \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[16], gi2_inter_default_scale_mat_16x16, 256*sizeof(WORD16));                            \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[17], gi2_inter_default_scale_mat_16x16, 256*sizeof(WORD16));                            \
+    /* scaling matrix for 32x32 */                                                                                                      \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[18], gi2_intra_default_scale_mat_32x32, 1024*sizeof(WORD16));                           \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[19], gi2_inter_default_scale_mat_32x32, 1024*sizeof(WORD16));                           \
+}
+
+#define COPY_FLAT_SCALING_LIST(pi2_scaling_mat)                                                                                         \
+{                                                                                                                                       \
+    WORD32 scaling_mat_offset[]={0, 16, 32, 48, 64, 80, 96, 160, 224, 288, 352, 416, 480, 736, 992, 1248, 1504, 1760, 2016, 3040};      \
+                                                                                                                                        \
+    /* scaling matrix for 4x4 */                                                                                                        \
+    memcpy(pi2_scaling_mat, gi2_flat_scale_mat_32x32, 6*16*sizeof(WORD16));                                                             \
+    /* scaling matrix for 8x8 */                                                                                                        \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[6], gi2_flat_scale_mat_32x32, 6*64*sizeof(WORD16));                                     \
+    /* scaling matrix for 16x16 */                                                                                                      \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[12], gi2_flat_scale_mat_32x32, 3*256*sizeof(WORD16));                                   \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[15], gi2_flat_scale_mat_32x32, 3*256*sizeof(WORD16));                                   \
+    /* scaling matrix for 32x32 */                                                                                                      \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[18], gi2_flat_scale_mat_32x32, 1024*sizeof(WORD16));                                    \
+    memcpy(pi2_scaling_mat + scaling_mat_offset[19], gi2_flat_scale_mat_32x32, 1024*sizeof(WORD16));                                    \
+}
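+
+/* The scaling_mat_offset[] table used above and in ihevcd_scaling_list_data
+ * is simply a running sum of matrix sizes: six 4x4 lists of 16 entries in
+ * [0, 96), six 8x8 lists of 64 entries in [96, 480), six 16x16 lists of 256
+ * entries in [480, 2016) and two 32x32 lists of 1024 entries in [2016, 4064).
+ * For example, offset[7] = 96 + 64 = 160 is the start of the second 8x8 list. */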
+
+/* Function declarations */
+
+#if 0
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses VPS operation point
+*
+* @par   Description
+* Parses VPS operation point as per section 7.3.5
+*
+* @param[out] ps_vps
+*  Pointer to VPS structure
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[in] ops_idx
+*  Operating point index
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+IHEVCD_ERROR_T ihevcd_operation_point_set( vps_t *ps_vps, bitstrm_t *ps_bitstrm, WORD32 ops_idx)
+{
+    WORD32 i;
+    WORD32 value;
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    for( i = 0; i <= ps_vps->i1_vps_max_nuh_reserved_zero_layer_id; i++ )
+    {
+        BITS_PARSE("list_entry_l0[ i ]", value, ps_bitstrm, 1);
+        //ps_vps->ai1_layer_id_included_flag[ops_idx][i] = value;
+
+    }
+    return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses reference picture list modification syntax. Section: 7.3.8.3
+* Reference picture list modification syntax
+*
+* @par Description:
+*  Parses the picture list modification syntax and updates the rplm_t struct
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @returns  Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_ref_pic_list_modification(bitstrm_t *ps_bitstrm,
+                                        slice_header_t *ps_slice_hdr,
+                                        WORD32 num_poc_total_curr)
+{
+    WORD32 ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 value;
+    WORD32 i;
+    rplm_t *ps_rplm;
+    WORD32 num_bits_list_entry;
+
+    ps_rplm = &(ps_slice_hdr->s_rplm);
+
+    /* Calculate Ceil(Log2(num_poc_total_curr)) */
+    {
+        num_bits_list_entry = 32 - CLZ(num_poc_total_curr);
+        /* Check if num_poc_total_curr is power of 2 */
+        if(0 == (num_poc_total_curr & (num_poc_total_curr - 1)))
+        {
+            num_bits_list_entry--;
+        }
+    }
+
+    if(ps_slice_hdr->i1_slice_type  == PSLICE || ps_slice_hdr->i1_slice_type  == BSLICE)
+    {
+        BITS_PARSE("ref_pic_list_modification_flag_l0", value, ps_bitstrm, 1);
+        ps_rplm->i1_ref_pic_list_modification_flag_l0 = value;
+
+        if(ps_rplm->i1_ref_pic_list_modification_flag_l0)
+            for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+            {
+                BITS_PARSE("list_entry_l0", value, ps_bitstrm, num_bits_list_entry);
+                ps_rplm->i1_list_entry_l0[i] = value;
+
+                ps_rplm->i1_list_entry_l0[i] = CLIP3(ps_rplm->i1_list_entry_l0[i], 0, num_poc_total_curr - 1);
+            }
+    }
+
+    if(ps_slice_hdr->i1_slice_type  == BSLICE)
+    {
+        BITS_PARSE("ref_pic_list_modification_flag_l1", value, ps_bitstrm, 1);
+        ps_rplm->i1_ref_pic_list_modification_flag_l1 = value;
+
+        if(ps_rplm->i1_ref_pic_list_modification_flag_l1)
+            for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+            {
+                BITS_PARSE("list_entry_l1", value, ps_bitstrm, num_bits_list_entry);
+                ps_rplm->i1_list_entry_l1[i] = value;
+
+                ps_rplm->i1_list_entry_l1[i] = CLIP3(ps_rplm->i1_list_entry_l1[i], 0, num_poc_total_curr - 1);
+            }
+
+    }
+
+    return ret;
+}
+#endif
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses Prediction weight table syntax
+*
+* @par Description:
+*  Parse Prediction weight table syntax as per Section: 7.3.8.4
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream context
+*
+* @param[in] ps_sps
+*  Current SPS
+*
+* @param[in] ps_pps
+*  Current PPS
+*
+* @param[in] ps_slice_hdr
+*  Current Slice header
+*
+* @returns  Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_parse_pred_wt_ofst(bitstrm_t *ps_bitstrm,
+                                 sps_t *ps_sps,
+                                 pps_t *ps_pps,
+                                 slice_header_t *ps_slice_hdr)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 value;
+    WORD32 i;
+    pred_wt_ofst_t *ps_wt_ofst = &ps_slice_hdr->s_wt_ofst;
+    UNUSED(ps_pps);
+
+    UEV_PARSE("luma_log2_weight_denom", value, ps_bitstrm);
+    ps_wt_ofst->i1_luma_log2_weight_denom = value;
+
+    if(ps_sps->i1_chroma_format_idc != 0)
+    {
+        SEV_PARSE("delta_chroma_log2_weight_denom", value, ps_bitstrm);
+        ps_wt_ofst->i1_chroma_log2_weight_denom = ps_wt_ofst->i1_luma_log2_weight_denom + value;
+    }
+
+    for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+    {
+        BITS_PARSE("luma_weight_l0_flag[ i ]", value, ps_bitstrm, 1);
+        ps_wt_ofst->i1_luma_weight_l0_flag[i] = value;
+    }
+
+
+
+    if(ps_sps->i1_chroma_format_idc != 0)
+    {
+        for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+        {
+            BITS_PARSE("chroma_weight_l0_flag[ i ]", value, ps_bitstrm, 1);
+            ps_wt_ofst->i1_chroma_weight_l0_flag[i] = value;
+        }
+    }
+    else
+    {
+        for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+        {
+            ps_wt_ofst->i1_chroma_weight_l0_flag[i] = 0;
+        }
+    }
+
+
+    for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+    {
+        if(ps_wt_ofst->i1_luma_weight_l0_flag[i])
+        {
+            SEV_PARSE("delta_luma_weight_l0[ i ]", value, ps_bitstrm);
+
+
+            ps_wt_ofst->i2_luma_weight_l0[i] = (1 << ps_wt_ofst->i1_luma_log2_weight_denom) + value;
+
+            SEV_PARSE("luma_offset_l0[ i ]", value, ps_bitstrm);
+            ps_wt_ofst->i2_luma_offset_l0[i] = value;
+
+        }
+        else
+        {
+            ps_wt_ofst->i2_luma_weight_l0[i] = (1 << ps_wt_ofst->i1_luma_log2_weight_denom);
+            ps_wt_ofst->i2_luma_offset_l0[i] = 0;
+        }
+        if(ps_wt_ofst->i1_chroma_weight_l0_flag[i])
+        {
+            WORD32 ofst;
+            WORD32 shift = (1 << (BIT_DEPTH_CHROMA - 1));
+            SEV_PARSE("delta_chroma_weight_l0[ i ][ j ]", value, ps_bitstrm);
+            ps_wt_ofst->i2_chroma_weight_l0_cb[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom) + value;
+
+
+            SEV_PARSE("delta_chroma_offset_l0[ i ][ j ]", value, ps_bitstrm);
+            ofst = ((shift * ps_wt_ofst->i2_chroma_weight_l0_cb[i]) >> ps_wt_ofst->i1_chroma_log2_weight_denom);
+            ofst = value - ofst + shift;
+
+            ps_wt_ofst->i2_chroma_offset_l0_cb[i] = CLIP_S8(ofst);
+
+            SEV_PARSE("delta_chroma_weight_l0[ i ][ j ]", value, ps_bitstrm);
+            ps_wt_ofst->i2_chroma_weight_l0_cr[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom) + value;
+
+
+            SEV_PARSE("delta_chroma_offset_l0[ i ][ j ]", value, ps_bitstrm);
+            ofst = ((shift * ps_wt_ofst->i2_chroma_weight_l0_cr[i]) >> ps_wt_ofst->i1_chroma_log2_weight_denom);
+            ofst = value - ofst + shift;
+
+            ps_wt_ofst->i2_chroma_offset_l0_cr[i] = CLIP_S8(ofst);
+
+        }
+        else
+        {
+            ps_wt_ofst->i2_chroma_weight_l0_cb[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom);
+            ps_wt_ofst->i2_chroma_weight_l0_cr[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom);
+
+            ps_wt_ofst->i2_chroma_offset_l0_cb[i] = 0;
+            ps_wt_ofst->i2_chroma_offset_l0_cr[i] = 0;
+        }
+    }
+    if(BSLICE == ps_slice_hdr->i1_slice_type)
+    {
+        for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+        {
+            BITS_PARSE("luma_weight_l1_flag[ i ]", value, ps_bitstrm, 1);
+            ps_wt_ofst->i1_luma_weight_l1_flag[i] = value;
+        }
+
+        if(ps_sps->i1_chroma_format_idc != 0)
+        {
+            for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+            {
+                BITS_PARSE("chroma_weight_l1_flag[ i ]", value, ps_bitstrm, 1);
+                ps_wt_ofst->i1_chroma_weight_l1_flag[i] = value;
+            }
+        }
+        else
+        {
+            for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+            {
+                ps_wt_ofst->i1_chroma_weight_l1_flag[i] = 0;
+            }
+        }
+
+        for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+        {
+            if(ps_wt_ofst->i1_luma_weight_l1_flag[i])
+            {
+                SEV_PARSE("delta_luma_weight_l1[ i ]", value, ps_bitstrm);
+
+
+                ps_wt_ofst->i2_luma_weight_l1[i] = (1 << ps_wt_ofst->i1_luma_log2_weight_denom) + value;
+
+                SEV_PARSE("luma_offset_l1[ i ]", value, ps_bitstrm);
+                ps_wt_ofst->i2_luma_offset_l1[i] = value;
+
+            }
+            else
+            {
+                ps_wt_ofst->i2_luma_weight_l1[i] = (1 << ps_wt_ofst->i1_luma_log2_weight_denom);
+                ps_wt_ofst->i2_luma_offset_l1[i] = 0;
+            }
+
+            if(ps_wt_ofst->i1_chroma_weight_l1_flag[i])
+            {
+                WORD32 ofst;
+                WORD32 shift = (1 << (BIT_DEPTH_CHROMA - 1));
+                SEV_PARSE("delta_chroma_weight_l1[ i ][ j ]", value, ps_bitstrm);
+                ps_wt_ofst->i2_chroma_weight_l1_cb[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom) + value;
+
+
+                SEV_PARSE("delta_chroma_offset_l1[ i ][ j ]", value, ps_bitstrm);
+                ofst = ((shift * ps_wt_ofst->i2_chroma_weight_l1_cb[i]) >> ps_wt_ofst->i1_chroma_log2_weight_denom);
+                ofst = value - ofst + shift;
+
+                ps_wt_ofst->i2_chroma_offset_l1_cb[i] = CLIP_S8(ofst);
+
+                SEV_PARSE("delta_chroma_weight_l1[ i ][ j ]", value, ps_bitstrm);
+                ps_wt_ofst->i2_chroma_weight_l1_cr[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom) + value;
+
+
+                SEV_PARSE("delta_chroma_offset_l1[ i ][ j ]", value, ps_bitstrm);
+                ofst = ((shift * ps_wt_ofst->i2_chroma_weight_l1_cr[i]) >> ps_wt_ofst->i1_chroma_log2_weight_denom);
+                ofst = value - ofst + shift;
+
+                ps_wt_ofst->i2_chroma_offset_l1_cr[i] = CLIP_S8(ofst);
+
+            }
+            else
+            {
+                ps_wt_ofst->i2_chroma_weight_l1_cb[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom);
+                ps_wt_ofst->i2_chroma_weight_l1_cr[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom);
+
+                ps_wt_ofst->i2_chroma_offset_l1_cb[i] = 0;
+                ps_wt_ofst->i2_chroma_offset_l1_cr[i] = 0;
+
+            }
+        }
+    }
+    return ret;
+}
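+
+/* Worked example of the chroma offset derivation above, assuming 8-bit
+ * chroma (shift = 1 << 7 = 128), chroma_log2_weight_denom = 6,
+ * delta_chroma_weight = -2 and delta_chroma_offset = 10:
+ *     weight = (1 << 6) + (-2)         = 62
+ *     ofst   = (128 * 62) >> 6         = 124
+ *     offset = CLIP_S8(10 - 124 + 128) = 14
+ * i.e. the coded delta is relative to the offset implied by the weight, so
+ * a unit weight with zero delta yields a zero offset. */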
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses short term reference picture set
+*
+* @par   Description
+*  Parses short term reference picture set as per section 7.3.8.2.
+* Can be called by either SPS or Slice header parsing modules.
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[out] ps_stref_picset_base
+*  Pointer to first short term ref pic set structure
+*
+* @param[in] num_short_term_ref_pic_sets
+*  Number of short term reference pic sets
+*
+* @param[in] idx
+*  Current short term ref pic set id
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_short_term_ref_pic_set(bitstrm_t *ps_bitstrm,
+                                             stref_picset_t *ps_stref_picset_base,
+                                             WORD32 num_short_term_ref_pic_sets,
+                                             WORD32 idx,
+                                             stref_picset_t *ps_stref_picset)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 value;
+    stref_picset_t *ps_stref_picset_ref;
+    WORD32 delta_idx, delta_rps;
+    WORD32 r_idx;
+    WORD32 i;
+    WORD32 j, k, temp;
+    if(idx > 0)
+    {
+        BITS_PARSE("inter_ref_pic_set_prediction_flag", value, ps_bitstrm, 1);
+        ps_stref_picset->i1_inter_ref_pic_set_prediction_flag = value;
+    }
+    else
+        ps_stref_picset->i1_inter_ref_pic_set_prediction_flag = 0;
+
+    if(ps_stref_picset->i1_inter_ref_pic_set_prediction_flag)
+    {
+        WORD32 delta_rps_sign;
+        WORD32 abs_delta_rps;
+        WORD32 num_neg_pics = 0;
+        WORD32 num_pos_pics = 0;
+        WORD32 num_pics = 0;
+
+        if(idx == num_short_term_ref_pic_sets)
+        {
+            UEV_PARSE("delta_idx_minus1", value, ps_bitstrm);
+            delta_idx = value + 1;
+        }
+        else
+        {
+            delta_idx = 1;
+        }
+        r_idx = idx - delta_idx;
+        r_idx = CLIP3(r_idx, 0, idx - 1);
+
+        ps_stref_picset_ref = ps_stref_picset_base + r_idx;
+
+        BITS_PARSE("delta_rps_sign", value, ps_bitstrm, 1);
+        delta_rps_sign = value;
+
+        UEV_PARSE("abs_delta_rps_minus1", value, ps_bitstrm);
+        abs_delta_rps = value + 1;
+
+        delta_rps = (1 - 2 * delta_rps_sign) * (abs_delta_rps);
+
+
+
+        for(i = 0; i <= ps_stref_picset_ref->i1_num_delta_pocs; i++)
+        {
+            WORD32 ref_idc;
+
+            /*****************************************************************/
+            /* ref_idc is parsed as below                                    */
+            /* bits "1" ref_idc 1                                            */
+            /* bits "01" ref_idc 2                                           */
+            /* bits "00" ref_idc 0                                           */
+            /*****************************************************************/
+            BITS_PARSE("used_by_curr_pic_flag", value, ps_bitstrm, 1);
+            ref_idc = value;
+            ps_stref_picset->ai1_used[num_pics] = value;
+            /* If ref_idc is zero check for next bit */
+            if(0 == ref_idc)
+            {
+                BITS_PARSE("use_delta_flag", value, ps_bitstrm, 1);
+                ps_stref_picset->ai1_used[i] = value;
+                ref_idc = value << 1;
+            }
+            if((ref_idc == 1) || (ref_idc == 2))
+            {
+                WORD32 delta_poc;
+                delta_poc = delta_rps;
+                delta_poc +=
+                                ((i < ps_stref_picset_ref->i1_num_delta_pocs) ?
+                                ps_stref_picset_ref->ai2_delta_poc[i] :
+                                0);
+
+                ps_stref_picset->ai2_delta_poc[num_pics] = delta_poc;
+
+                if(delta_poc < 0)
+                {
+                    num_neg_pics++;
+                }
+                else
+                {
+                    num_pos_pics++;
+                }
+                num_pics++;
+            }
+            ps_stref_picset->ai1_ref_idc[i] = ref_idc;
+        }
+
+        num_neg_pics = CLIP3(num_neg_pics, 0, MAX_DPB_SIZE - 1);
+        num_pos_pics = CLIP3(num_pos_pics, 0, (MAX_DPB_SIZE - 1 - num_neg_pics));
+        num_pics = num_neg_pics + num_pos_pics;
+
+        ps_stref_picset->i1_num_ref_idc =
+                        ps_stref_picset_ref->i1_num_delta_pocs + 1;
+        ps_stref_picset->i1_num_delta_pocs = num_pics;
+        ps_stref_picset->i1_num_pos_pics = num_pos_pics;
+        ps_stref_picset->i1_num_neg_pics = num_neg_pics;
+
+
+        for(j = 1; j < num_pics; j++)
+        {
+            WORD32 delta_poc = ps_stref_picset->ai2_delta_poc[j];
+            WORD8 i1_used = ps_stref_picset->ai1_used[j];
+            for(k = j - 1; k >= 0; k--)
+            {
+                temp = ps_stref_picset->ai2_delta_poc[k];
+                if(delta_poc < temp)
+                {
+                    ps_stref_picset->ai2_delta_poc[k + 1] = temp;
+                    ps_stref_picset->ai1_used[k + 1] = ps_stref_picset->ai1_used[k];
+                    ps_stref_picset->ai2_delta_poc[k] = delta_poc;
+                    ps_stref_picset->ai1_used[k] = i1_used;
+                }
+            }
+        }
+        // flip the negative values to largest first
+        for(j = 0, k = num_neg_pics - 1; j < num_neg_pics >> 1; j++, k--)
+        {
+            WORD32 delta_poc = ps_stref_picset->ai2_delta_poc[j];
+            WORD8 i1_used = ps_stref_picset->ai1_used[j];
+            ps_stref_picset->ai2_delta_poc[j] = ps_stref_picset->ai2_delta_poc[k];
+            ps_stref_picset->ai1_used[j] = ps_stref_picset->ai1_used[k];
+            ps_stref_picset->ai2_delta_poc[k] = delta_poc;
+            ps_stref_picset->ai1_used[k] = i1_used;
+        }
+
+    }
+    else
+    {
+        WORD32 prev_poc = 0;
+        WORD32 poc;
+
+        UEV_PARSE("num_negative_pics", value, ps_bitstrm);
+        ps_stref_picset->i1_num_neg_pics = value;
+        ps_stref_picset->i1_num_neg_pics = CLIP3(ps_stref_picset->i1_num_neg_pics,
+                                                 0,
+                                                 MAX_DPB_SIZE - 1);
+
+        UEV_PARSE("num_positive_pics", value, ps_bitstrm);
+        ps_stref_picset->i1_num_pos_pics = value;
+        ps_stref_picset->i1_num_pos_pics = CLIP3(ps_stref_picset->i1_num_pos_pics,
+                                                 0,
+                                                 (MAX_DPB_SIZE - 1 - ps_stref_picset->i1_num_neg_pics));
+
+        ps_stref_picset->i1_num_delta_pocs =
+                        ps_stref_picset->i1_num_neg_pics +
+                        ps_stref_picset->i1_num_pos_pics;
+
+
+        for(i = 0; i < ps_stref_picset->i1_num_neg_pics; i++)
+        {
+            UEV_PARSE("delta_poc_s0_minus1", value, ps_bitstrm);
+            poc = prev_poc - (value + 1);
+            prev_poc = poc;
+            ps_stref_picset->ai2_delta_poc[i] = poc;
+
+            BITS_PARSE("used_by_curr_pic_s0_flag", value, ps_bitstrm, 1);
+            ps_stref_picset->ai1_used[i] = value;
+
+        }
+        prev_poc = 0;
+        for(i = ps_stref_picset->i1_num_neg_pics;
+                        i < ps_stref_picset->i1_num_delta_pocs;
+                        i++)
+        {
+            UEV_PARSE("delta_poc_s1_minus1", value, ps_bitstrm);
+            poc = prev_poc + (value + 1);
+            prev_poc = poc;
+            ps_stref_picset->ai2_delta_poc[i] = poc;
+
+            BITS_PARSE("used_by_curr_pic_s1_flag", value, ps_bitstrm, 1);
+            ps_stref_picset->ai1_used[i] = value;
+
+        }
+
+    }
+
+    return ret;
+}
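+
+/* Example of the explicit (non-predicted) path above: with
+ * num_negative_pics = 2, num_positive_pics = 1 and coded values
+ * delta_poc_s0_minus1 = {0, 1}, delta_poc_s1_minus1 = {0}, the derived
+ * ai2_delta_poc[] is {-1, -3, +1}: negative deltas accumulate backwards
+ * from the current POC, positive deltas accumulate forwards, and ai1_used[]
+ * holds the corresponding used_by_curr_pic flags. */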
+
+
+static WORD32 ihevcd_parse_sub_layer_hrd_parameters(bitstrm_t *ps_bitstrm,
+                                                    sub_lyr_hrd_params_t *ps_sub_layer_hrd_params,
+                                                    WORD32 cpb_cnt,
+                                                    WORD32 sub_pic_cpb_params_present_flag)
+{
+    WORD32 ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 i;
+
+    for(i = 0; i <= cpb_cnt; i++)
+    {
+        UEV_PARSE("bit_rate_value_minus1[ i ]", ps_sub_layer_hrd_params->au4_bit_rate_value_minus1[i], ps_bitstrm);
+        UEV_PARSE("cpb_size_value_minus1[ i ]", ps_sub_layer_hrd_params->au4_cpb_size_value_minus1[i], ps_bitstrm);
+
+        if(sub_pic_cpb_params_present_flag)
+        {
+            UEV_PARSE("cpb_size_du_value_minus1[ i ]", ps_sub_layer_hrd_params->au4_cpb_size_du_value_minus1[i], ps_bitstrm);
+            UEV_PARSE("bit_rate_du_value_minus1[ i ]", ps_sub_layer_hrd_params->au4_bit_rate_du_value_minus1[i], ps_bitstrm);
+        }
+        BITS_PARSE("cbr_flag[ i ]", ps_sub_layer_hrd_params->au1_cbr_flag[i], ps_bitstrm, 1);
+    }
+
+    return ret;
+}
+
+
+static WORD32 ihevcd_parse_hrd_parameters(bitstrm_t *ps_bitstrm,
+                                          hrd_params_t *ps_hrd,
+                                          WORD32 common_info_present_flag,
+                                          WORD32 max_num_sub_layers_minus1)
+{
+    WORD32 ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 i;
+
+    ps_hrd->u1_nal_hrd_parameters_present_flag = 0;
+    ps_hrd->u1_vcl_hrd_parameters_present_flag = 0;
+
+    ps_hrd->u1_sub_pic_cpb_params_present_flag = 0;
+
+    ps_hrd->u1_tick_divisor_minus2 = 0;
+    ps_hrd->u1_du_cpb_removal_delay_increment_length_minus1 = 0;
+    ps_hrd->u1_sub_pic_cpb_params_in_pic_timing_sei_flag = 0;
+    ps_hrd->u1_dpb_output_delay_du_length_minus1 = 0;
+
+    ps_hrd->u4_bit_rate_scale = 0;
+    ps_hrd->u4_cpb_size_scale = 0;
+    ps_hrd->u4_cpb_size_du_scale = 0;
+
+    ps_hrd->u1_initial_cpb_removal_delay_length_minus1 = 23;
+    ps_hrd->u1_au_cpb_removal_delay_length_minus1 = 23;
+    ps_hrd->u1_dpb_output_delay_length_minus1 = 23;
+
+    if(common_info_present_flag)
+    {
+        BITS_PARSE("nal_hrd_parameters_present_flag", ps_hrd->u1_nal_hrd_parameters_present_flag, ps_bitstrm, 1);
+        BITS_PARSE("vcl_hrd_parameters_present_flag", ps_hrd->u1_vcl_hrd_parameters_present_flag, ps_bitstrm, 1);
+
+        if(ps_hrd->u1_nal_hrd_parameters_present_flag  ||  ps_hrd->u1_vcl_hrd_parameters_present_flag)
+        {
+            BITS_PARSE("sub_pic_cpb_params_present_flag", ps_hrd->u1_sub_pic_cpb_params_present_flag, ps_bitstrm, 1);
+            if(ps_hrd->u1_sub_pic_cpb_params_present_flag)
+            {
+                BITS_PARSE("tick_divisor_minus2", ps_hrd->u1_tick_divisor_minus2, ps_bitstrm, 8);
+                BITS_PARSE("du_cpb_removal_delay_increment_length_minus1", ps_hrd->u1_du_cpb_removal_delay_increment_length_minus1, ps_bitstrm, 5);
+                BITS_PARSE("sub_pic_cpb_params_in_pic_timing_sei_flag", ps_hrd->u1_sub_pic_cpb_params_in_pic_timing_sei_flag, ps_bitstrm, 1);
+                BITS_PARSE("dpb_output_delay_du_length_minus1", ps_hrd->u1_dpb_output_delay_du_length_minus1, ps_bitstrm, 5);
+            }
+
+            BITS_PARSE("bit_rate_scale", ps_hrd->u4_bit_rate_scale, ps_bitstrm, 4);
+            BITS_PARSE("cpb_size_scale", ps_hrd->u4_cpb_size_scale, ps_bitstrm, 4);
+            if(ps_hrd->u1_sub_pic_cpb_params_present_flag)
+                BITS_PARSE("cpb_size_du_scale", ps_hrd->u4_cpb_size_du_scale, ps_bitstrm, 4);
+
+            BITS_PARSE("initial_cpb_removal_delay_length_minus1", ps_hrd->u1_initial_cpb_removal_delay_length_minus1, ps_bitstrm, 5);
+            BITS_PARSE("au_cpb_removal_delay_length_minus1", ps_hrd->u1_au_cpb_removal_delay_length_minus1, ps_bitstrm, 5);
+            BITS_PARSE("dpb_output_delay_length_minus1", ps_hrd->u1_dpb_output_delay_length_minus1, ps_bitstrm, 5);
+        }
+    }
+
+
+    for(i = 0; i <= max_num_sub_layers_minus1; i++)
+    {
+        BITS_PARSE("fixed_pic_rate_general_flag[ i ]", ps_hrd->au1_fixed_pic_rate_general_flag[i], ps_bitstrm, 1);
+
+        ps_hrd->au1_fixed_pic_rate_within_cvs_flag[i] = 1;
+        ps_hrd->au1_elemental_duration_in_tc_minus1[i] = 0;
+        ps_hrd->au1_low_delay_hrd_flag[i] = 0;
+        ps_hrd->au1_cpb_cnt_minus1[i] = 0;
+
+        if(!ps_hrd->au1_fixed_pic_rate_general_flag[i])
+            BITS_PARSE("fixed_pic_rate_within_cvs_flag[ i ]", ps_hrd->au1_fixed_pic_rate_within_cvs_flag[i], ps_bitstrm, 1);
+
+        if(ps_hrd->au1_fixed_pic_rate_within_cvs_flag[i])
+        {
+            UEV_PARSE("elemental_duration_in_tc_minus1[ i ]", ps_hrd->au1_elemental_duration_in_tc_minus1[i], ps_bitstrm);
+        }
+        else
+        {
+            BITS_PARSE("low_delay_hrd_flag[ i ]", ps_hrd->au1_low_delay_hrd_flag[i], ps_bitstrm, 1);
+        }
+
+        if(!ps_hrd->au1_low_delay_hrd_flag[i])
+            UEV_PARSE("cpb_cnt_minus1[ i ]", ps_hrd->au1_cpb_cnt_minus1[i], ps_bitstrm);
+
+        if(ps_hrd->u1_nal_hrd_parameters_present_flag)
+            ihevcd_parse_sub_layer_hrd_parameters(ps_bitstrm,
+                                                  &ps_hrd->as_sub_layer_hrd_params[i],
+                                                  ps_hrd->au1_cpb_cnt_minus1[i],
+                                                  ps_hrd->u1_sub_pic_cpb_params_present_flag);
+
+        if(ps_hrd->u1_vcl_hrd_parameters_present_flag)
+            ihevcd_parse_sub_layer_hrd_parameters(ps_bitstrm,
+                                                  &ps_hrd->as_sub_layer_hrd_params[i],
+                                                  ps_hrd->au1_cpb_cnt_minus1[i],
+                                                  ps_hrd->u1_sub_pic_cpb_params_present_flag);
+    }
+
+    return ret;
+}
+
+
+static WORD32 ihevcd_parse_vui_parameters(bitstrm_t *ps_bitstrm,
+                                          vui_t *ps_vui,
+                                          WORD32 sps_max_sub_layers_minus1)
+{
+    WORD32 ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+    BITS_PARSE("aspect_ratio_info_present_flag", ps_vui->u1_aspect_ratio_info_present_flag, ps_bitstrm, 1);
+
+    ps_vui->u1_aspect_ratio_idc = SAR_UNUSED;
+    ps_vui->u2_sar_width = 0;
+    ps_vui->u2_sar_height = 0;
+    if(ps_vui->u1_aspect_ratio_info_present_flag)
+    {
+        BITS_PARSE("aspect_ratio_idc", ps_vui->u1_aspect_ratio_idc, ps_bitstrm, 8);
+        if(ps_vui->u1_aspect_ratio_idc  ==  EXTENDED_SAR)
+        {
+            BITS_PARSE("sar_width", ps_vui->u2_sar_width, ps_bitstrm, 16);
+            BITS_PARSE("sar_height", ps_vui->u2_sar_height, ps_bitstrm, 16);
+        }
+    }
+
+    BITS_PARSE("overscan_info_present_flag", ps_vui->u1_overscan_info_present_flag, ps_bitstrm, 1);
+    ps_vui->u1_overscan_appropriate_flag = 0;
+    if(ps_vui->u1_overscan_info_present_flag)
+        BITS_PARSE("overscan_appropriate_flag", ps_vui->u1_overscan_appropriate_flag, ps_bitstrm, 1);
+
+    BITS_PARSE("video_signal_type_present_flag", ps_vui->u1_video_signal_type_present_flag, ps_bitstrm, 1);
+    ps_vui->u1_video_format = VID_FMT_UNSPECIFIED;
+    ps_vui->u1_video_full_range_flag = 0;
+    ps_vui->u1_colour_description_present_flag = 0;
+    if(ps_vui->u1_video_signal_type_present_flag)
+    {
+        BITS_PARSE("video_format", ps_vui->u1_video_format, ps_bitstrm, 3);
+        BITS_PARSE("video_full_range_flag", ps_vui->u1_video_full_range_flag, ps_bitstrm, 1);
+        BITS_PARSE("colour_description_present_flag", ps_vui->u1_colour_description_present_flag, ps_bitstrm, 1);
+        ps_vui->u1_colour_primaries = 2;
+        ps_vui->u1_transfer_characteristics = 2;
+        if(ps_vui->u1_colour_description_present_flag)
+        {
+            BITS_PARSE("colour_primaries", ps_vui->u1_colour_primaries, ps_bitstrm, 8);
+            BITS_PARSE("transfer_characteristics", ps_vui->u1_transfer_characteristics, ps_bitstrm, 8);
+            BITS_PARSE("matrix_coeffs", ps_vui->u1_matrix_coefficients, ps_bitstrm, 8);
+        }
+    }
+
+    BITS_PARSE("chroma_loc_info_present_flag", ps_vui->u1_chroma_loc_info_present_flag, ps_bitstrm, 1);
+    ps_vui->u1_chroma_sample_loc_type_top_field = 0;
+    ps_vui->u1_chroma_sample_loc_type_bottom_field = 0;
+    if(ps_vui->u1_chroma_loc_info_present_flag)
+    {
+        UEV_PARSE("chroma_sample_loc_type_top_field", ps_vui->u1_chroma_sample_loc_type_top_field, ps_bitstrm);
+        UEV_PARSE("chroma_sample_loc_type_bottom_field", ps_vui->u1_chroma_sample_loc_type_bottom_field, ps_bitstrm);
+    }
+
+    BITS_PARSE("neutral_chroma_indication_flag", ps_vui->u1_neutral_chroma_indication_flag, ps_bitstrm, 1);
+    BITS_PARSE("field_seq_flag", ps_vui->u1_field_seq_flag, ps_bitstrm, 1);
+    BITS_PARSE("frame_field_info_present_flag", ps_vui->u1_frame_field_info_present_flag, ps_bitstrm, 1);
+    BITS_PARSE("default_display_window_flag", ps_vui->u1_default_display_window_flag, ps_bitstrm, 1);
+    ps_vui->u4_def_disp_win_left_offset = 0;
+    ps_vui->u4_def_disp_win_right_offset = 0;
+    ps_vui->u4_def_disp_win_top_offset = 0;
+    ps_vui->u4_def_disp_win_bottom_offset = 0;
+    if(ps_vui->u1_default_display_window_flag)
+    {
+        UEV_PARSE("def_disp_win_left_offset", ps_vui->u4_def_disp_win_left_offset, ps_bitstrm);
+        UEV_PARSE("def_disp_win_right_offset", ps_vui->u4_def_disp_win_right_offset, ps_bitstrm);
+        UEV_PARSE("def_disp_win_top_offset", ps_vui->u4_def_disp_win_top_offset, ps_bitstrm);
+        UEV_PARSE("def_disp_win_bottom_offset", ps_vui->u4_def_disp_win_bottom_offset, ps_bitstrm);
+    }
+
+    BITS_PARSE("vui_timing_info_present_flag", ps_vui->u1_vui_timing_info_present_flag, ps_bitstrm, 1);
+    if(ps_vui->u1_vui_timing_info_present_flag)
+    {
+        BITS_PARSE("vui_num_units_in_tick", ps_vui->u4_vui_num_units_in_tick, ps_bitstrm, 32);
+        BITS_PARSE("vui_time_scale", ps_vui->u4_vui_time_scale, ps_bitstrm, 32);
+        BITS_PARSE("vui_poc_proportional_to_timing_flag", ps_vui->u1_poc_proportional_to_timing_flag, ps_bitstrm, 1);
+        if(ps_vui->u1_poc_proportional_to_timing_flag)
+            UEV_PARSE("vui_num_ticks_poc_diff_one_minus1", ps_vui->u1_num_ticks_poc_diff_one_minus1, ps_bitstrm);
+
+        BITS_PARSE("vui_hrd_parameters_present_flag", ps_vui->u1_vui_hrd_parameters_present_flag, ps_bitstrm, 1);
+        if(ps_vui->u1_vui_hrd_parameters_present_flag)
+            ihevcd_parse_hrd_parameters(ps_bitstrm, &ps_vui->s_vui_hrd_parameters, 1, sps_max_sub_layers_minus1);
+    }
+
+    BITS_PARSE("bitstream_restriction_flag", ps_vui->u1_bitstream_restriction_flag, ps_bitstrm, 1);
+    ps_vui->u1_tiles_fixed_structure_flag = 0;
+    ps_vui->u1_motion_vectors_over_pic_boundaries_flag = 1;
+    ps_vui->u1_restricted_ref_pic_lists_flag = 0;
+    ps_vui->u4_min_spatial_segmentation_idc = 0;
+    ps_vui->u1_max_bytes_per_pic_denom = 2;
+    ps_vui->u1_max_bits_per_mincu_denom = 1;
+    ps_vui->u1_log2_max_mv_length_horizontal = 15;
+    ps_vui->u1_log2_max_mv_length_vertical = 15;
+    if(ps_vui->u1_bitstream_restriction_flag)
+    {
+        BITS_PARSE("tiles_fixed_structure_flag", ps_vui->u1_tiles_fixed_structure_flag, ps_bitstrm, 1);
+        BITS_PARSE("motion_vectors_over_pic_boundaries_flag", ps_vui->u1_motion_vectors_over_pic_boundaries_flag, ps_bitstrm, 1);
+        BITS_PARSE("restricted_ref_pic_lists_flag", ps_vui->u1_restricted_ref_pic_lists_flag, ps_bitstrm, 1);
+
+        UEV_PARSE("min_spatial_segmentation_idc", ps_vui->u4_min_spatial_segmentation_idc, ps_bitstrm);
+        UEV_PARSE("max_bytes_per_pic_denom", ps_vui->u1_max_bytes_per_pic_denom, ps_bitstrm);
+        UEV_PARSE("max_bits_per_min_cu_denom", ps_vui->u1_max_bits_per_mincu_denom, ps_bitstrm);
+        UEV_PARSE("log2_max_mv_length_horizontal", ps_vui->u1_log2_max_mv_length_horizontal, ps_bitstrm);
+        UEV_PARSE("log2_max_mv_length_vertical", ps_vui->u1_log2_max_mv_length_vertical, ps_bitstrm);
+    }
+
+    return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses profile, tier and level info for either the general layer or a sub_layer
+*
+* @par   Description
+*  Parses profile, tier and level info for either the general layer or a
+* sub_layer as per section 7.3.3
+*
+* Since the same function is called for parsing general_profile,
+* sub_layer_profile etc., variable names do not specify whether the syntax
+* is for the general layer or a sub_layer; the trace strings likewise do
+* not differentiate.
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[out] ps_ptl
+*  Pointer to profile, tier level structure
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+static IHEVCD_ERROR_T ihevcd_parse_profile_tier_level_layer(bitstrm_t *ps_bitstrm,
+                                                            profile_tier_lvl_t *ps_ptl)
+{
+    WORD32 value;
+    WORD32 i;
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+    BITS_PARSE("XXX_profile_space[]", value, ps_bitstrm, 2);
+    ps_ptl->i1_profile_space = value;
+
+    BITS_PARSE("XXX_tier_flag[]", value, ps_bitstrm, 1);
+    ps_ptl->i1_tier_flag = value;
+
+    BITS_PARSE("XXX_profile_idc[]", value, ps_bitstrm, 5);
+    ps_ptl->i1_profile_idc = value;
+
+    for(i = 0; i < MAX_PROFILE_COMPATBLTY; i++)
+    {
+        BITS_PARSE("XXX_profile_compatibility_flag[][j]", value, ps_bitstrm, 1);
+        ps_ptl->ai1_profile_compatibility_flag[i] = value;
+    }
+
+    BITS_PARSE("general_progressive_source_flag", value, ps_bitstrm, 1);
+    ps_ptl->i1_general_progressive_source_flag = value;
+
+    BITS_PARSE("general_interlaced_source_flag", value, ps_bitstrm, 1);
+    ps_ptl->i1_general_progressive_source_flag = value;
+
+    BITS_PARSE("general_non_packed_constraint_flag", value, ps_bitstrm, 1);
+    ps_ptl->i1_general_progressive_source_flag = value;
+
+    BITS_PARSE("general_frame_only_constraint_flag", value, ps_bitstrm, 1);
+    ps_ptl->i1_general_progressive_source_flag = value;
+
+    BITS_PARSE("XXX_reserved_zero_44bits[0..15]", value, ps_bitstrm, 16);
+
+    BITS_PARSE("XXX_reserved_zero_44bits[16..31]", value, ps_bitstrm, 16);
+
+    BITS_PARSE("XXX_reserved_zero_44bits[32..43]", value, ps_bitstrm, 12);
+    return ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses profile tier and level info
+*
+* @par   Description
+*  Parses profile tier and level info as per section 7.3.3
+* Called during VPS and SPS parsing
+* calls ihevcd_parse_profile_tier_level_layer() for the general layer and each sub_layer
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[out] ps_ptl
+*  Pointer to structure that contains profile, tier level for each layers
+*
+* @param[in] profile_present
+*  Flag to indicate if profile data is present
+*
+* @param[in] max_num_sub_layers
+*  Number of sub layers present
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+static IHEVCD_ERROR_T ihevcd_profile_tier_level(bitstrm_t *ps_bitstrm,
+                                                profile_tier_lvl_info_t *ps_ptl,
+                                                WORD32 profile_present,
+                                                WORD32 max_num_sub_layers)
+{
+    WORD32 value;
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 i;
+
+    if(profile_present)
+    {
+        ret = ihevcd_parse_profile_tier_level_layer(ps_bitstrm, &ps_ptl->s_ptl_gen);
+    }
+
+    BITS_PARSE("general_level_idc", value, ps_bitstrm, 8);
+    ps_ptl->s_ptl_gen.u1_level_idc = value;
+
+
+    for(i = 0; i < max_num_sub_layers; i++)
+    {
+        BITS_PARSE("sub_layer_profile_present_flag[i]", value, ps_bitstrm, 1);
+        ps_ptl->ai1_sub_layer_profile_present_flag[i] = value;
+
+        BITS_PARSE("sub_layer_level_present_flag[i]", value, ps_bitstrm, 1);
+        ps_ptl->ai1_sub_layer_level_present_flag[i] = value;
+    }
+
+    if(max_num_sub_layers > 0)
+    {
+        for(i = max_num_sub_layers; i < 8; i++)
+        {
+            BITS_PARSE("reserved_zero_2bits", value, ps_bitstrm, 2);
+        }
+    }
+
+    for(i = 0; i < max_num_sub_layers; i++)
+    {
+        if(ps_ptl->ai1_sub_layer_profile_present_flag[i])
+        {
+            ret = ihevcd_parse_profile_tier_level_layer(ps_bitstrm,
+                                                        &ps_ptl->as_ptl_sub[i]);
+        }
+        if(ps_ptl->ai1_sub_layer_level_present_flag[i])
+        {
+            BITS_PARSE("sub_layer_level_idc[i]", value, ps_bitstrm, 8);
+            ps_ptl->as_ptl_sub[i].u1_level_idc = value;
+
+        }
+    }
+
+
+
+    return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses Scaling List Data syntax
+*
+* @par Description:
+*  Parses Scaling List Data syntax as per Section: 7.3.6
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @returns  Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T  ihevcd_scaling_list_data(codec_t *ps_codec, WORD16 *pi2_scaling_mat)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 size_id;
+    WORD32 matrix_id;
+    WORD32 value, dc_value = 0;
+    WORD32 next_coef;
+    WORD32 coef_num;
+    WORD32 i, j, offset;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    WORD16 *pi2_scaling_mat_offset;
+    WORD32 scaling_mat_offset[] = { 0, 16, 32, 48, 64, 80, 96, 160, 224, 288, 352, 416, 480, 736, 992, 1248, 1504, 1760, 2016, 3040 };
+    UWORD8 *scan_table;
+
+    for(size_id = 0; size_id < 4; size_id++)
+    {
+        for(matrix_id = 0; matrix_id < ((size_id == 3) ? 2 : 6); matrix_id++)
+        {
+            WORD32 scaling_list_pred_mode_flag;
+            WORD32 scaling_list_delta_coef;
+            BITS_PARSE("scaling_list_pred_mode_flag", scaling_list_pred_mode_flag, ps_bitstrm, 1);
+
+            offset = size_id * 6 + matrix_id;
+            pi2_scaling_mat_offset = pi2_scaling_mat + scaling_mat_offset[offset];
+
+            if(!scaling_list_pred_mode_flag)
+            {
+                WORD32 num_elements;
+                UEV_PARSE("scaling_list_pred_matrix_id_delta", value,
+                          ps_bitstrm);
+                value = CLIP3(value, 0, matrix_id);
+
+                num_elements = (1 << (4 + (size_id << 1)));
+                if(0 != value)
+                    memcpy(pi2_scaling_mat_offset, pi2_scaling_mat_offset - value * num_elements, num_elements * sizeof(WORD16));
+            }
+            else
+            {
+                next_coef = 8;
+                coef_num = MIN(64, (1 << (4 + (size_id << 1))));
+
+                if(size_id > 1)
+                {
+                    SEV_PARSE("scaling_list_dc_coef_minus8", value,
+                              ps_bitstrm);
+
+                    next_coef = value + 8;
+                    dc_value = next_coef;
+                }
+                if(size_id < 2)
+                {
+                    scan_table = (UWORD8 *)gapv_ihevc_invscan[size_id + 1];
+
+                    for(i = 0; i < coef_num; i++)
+                    {
+                        SEV_PARSE("scaling_list_delta_coef",
+                                  scaling_list_delta_coef, ps_bitstrm);
+                        next_coef = (next_coef + scaling_list_delta_coef + 256)
+                                        % 256;
+                        pi2_scaling_mat_offset[scan_table[i]] = next_coef;
+                    }
+                }
+                else if(size_id == 2)
+                {
+                    scan_table = (UWORD8 *)gapv_ihevc_invscan[2];
+
+                    for(i = 0; i < coef_num; i++)
+                    {
+                        SEV_PARSE("scaling_list_delta_coef",
+                                  scaling_list_delta_coef, ps_bitstrm);
+                        next_coef = (next_coef + scaling_list_delta_coef + 256)
+                                        % 256;
+
+                        offset = scan_table[i];
+                        offset = (offset >> 3) * 16 * 2 + (offset & 0x7) * 2;
+                        pi2_scaling_mat_offset[offset] = next_coef;
+                        pi2_scaling_mat_offset[offset + 1] = next_coef;
+                        pi2_scaling_mat_offset[offset + 16] = next_coef;
+                        pi2_scaling_mat_offset[offset + 16 + 1] = next_coef;
+                    }
+                    pi2_scaling_mat_offset[0] = dc_value;
+                }
+                else
+                {
+                    scan_table = (UWORD8 *)gapv_ihevc_invscan[2];
+
+                    for(i = 0; i < coef_num; i++)
+                    {
+                        SEV_PARSE("scaling_list_delta_coef",
+                                  scaling_list_delta_coef, ps_bitstrm);
+                        next_coef = (next_coef + scaling_list_delta_coef + 256)
+                                        % 256;
+
+                        offset = scan_table[i];
+                        offset = (offset >> 3) * 32 * 4 + (offset & 0x7) * 4;
+
+                        for(j = 0; j < 4; j++)
+                        {
+                            pi2_scaling_mat_offset[offset + j * 32] = next_coef;
+                            pi2_scaling_mat_offset[offset + 1 + j * 32] = next_coef;
+                            pi2_scaling_mat_offset[offset + 2 + j * 32] = next_coef;
+                            pi2_scaling_mat_offset[offset + 3 + j * 32] = next_coef;
+                        }
+                    }
+                    pi2_scaling_mat_offset[0] = dc_value;
+                }
+            }
+        }
+    }
+
+    return ret;
+}
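+
+/* Example of the replication in the size_id == 2 branch above: only 64
+ * coefficients are coded, each filling a 2x2 block of the 16x16 matrix.
+ * A scan position with raster value 9 (row 1, column 1 of the 8x8 grid)
+ * maps to offset = (9 >> 3) * 16 * 2 + (9 & 0x7) * 2 = 34, i.e. row 2,
+ * column 2 of the 16x16 matrix, and is written to offsets 34, 35, 50 and
+ * 51. The 32x32 case replicates each coefficient into a 4x4 block the same
+ * way, and the DC term is then overwritten from scaling_list_dc_coef_minus8. */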
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses VPS (Video Parameter Set)
+*
+* @par Description:
+*  Parses Video Parameter Set as per Section 7.3.2.1 and updates the vps
+* structure corresponding to the VPS ID. Until the VPS ID is parsed, the
+* elements are stored in local variables and copied later
+*
+* @param[in] ps_codec
+*  Pointer to codec context.
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_parse_vps(codec_t *ps_codec)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 i;
+    WORD32 value;
+    WORD32 vps_id;
+    vps_t *ps_vps;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    BITS_PARSE("vps_video_parameter_set_id", value, ps_bitstrm, 4);
+    vps_id = value;
+
+    if(vps_id >= MAX_VPS_CNT)
+    {
+        ps_codec->s_parse.i4_error_code = IHEVCD_UNSUPPORTED_VPS_ID;
+        return IHEVCD_UNSUPPORTED_VPS_ID;
+    }
+
+
+    ps_vps = (ps_codec->s_parse.ps_vps_base + vps_id);
+
+    ps_vps->i1_vps_id = vps_id;
+
+    BITS_PARSE("vps_reserved_three_2bits", value, ps_bitstrm, 2);
+    ASSERT(value == 3);
+
+    BITS_PARSE("vps_max_layers_minus1", value, ps_bitstrm, 6);
+    //ps_vps->i1_vps_max_layers = value + 1;
+
+
+
+    BITS_PARSE("vps_max_sub_layers_minus1", value, ps_bitstrm, 3);
+    ps_vps->i1_vps_max_sub_layers = value + 1;
+
+    ASSERT(ps_vps->i1_vps_max_sub_layers < VPS_MAX_SUB_LAYERS);
+
+    BITS_PARSE("vps_temporal_id_nesting_flag", value, ps_bitstrm, 1);
+    ps_vps->i1_vps_temporal_id_nesting_flag = value;
+
+    BITS_PARSE("vps_reserved_ffff_16bits", value, ps_bitstrm, 16);
+    ASSERT(value == 0xFFFF);
+    // profile_and_level( 1, vps_max_sub_layers_minus1 )
+    ret = ihevcd_profile_tier_level(ps_bitstrm, &(ps_vps->s_ptl),
+                                    1, (ps_vps->i1_vps_max_sub_layers - 1));
+
+    BITS_PARSE("vps_sub_layer_ordering_info_present_flag", value, ps_bitstrm, 1);
+    ps_vps->i1_sub_layer_ordering_info_present_flag = value;
+    i = (ps_vps->i1_sub_layer_ordering_info_present_flag ?
+                    0 : (ps_vps->i1_vps_max_sub_layers - 1));
+    for(; i < ps_vps->i1_vps_max_sub_layers; i++)
+    {
+        UEV_PARSE("vps_max_dec_pic_buffering[i]", value, ps_bitstrm);
+        ps_vps->ai1_vps_max_dec_pic_buffering[i] = value;
+
+        /* The trace uses "vps_num_reorder_pics" (without "max") to match HM's output */
+        UEV_PARSE("vps_num_reorder_pics[i]", value, ps_bitstrm);
+        ps_vps->ai1_vps_max_num_reorder_pics[i] = value;
+
+        UEV_PARSE("vps_max_latency_increase[i]", value, ps_bitstrm);
+        ps_vps->ai1_vps_max_latency_increase[i] = value;
+    }
+
+
+
+    BITS_PARSE("vps_max_layer_id", value, ps_bitstrm, 6);
+    //ps_vps->i1_vps_max_layer_id  = value;
+
+    UEV_PARSE("vps_num_layer_sets_minus1", value, ps_bitstrm);
+    //ps_vps->i1_vps_num_layer_sets  = value + 1;
+
+    BITS_PARSE("vps_timing_info_present_flag", value, ps_bitstrm, 1);
+    //ps_vps->i1_vps_timing_info_present_flag  = value;
+
+
+
+    return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses SPS (Sequence Parameter Set)
+* sequence_parameter_set_rbsp()
+*
+* @par Description:
+*  Parse Sequence Parameter Set as per section  Section: 7.3.2.2
+* The sps is written to a temporary buffer and copied later to the
+* appropriate location
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_parse_sps(codec_t *ps_codec)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 value;
+
+    WORD32 i;
+    WORD32 vps_id;
+    WORD32 sps_max_sub_layers;
+    WORD32 sps_id;
+    WORD32 sps_temporal_id_nesting_flag;
+    sps_t *ps_sps;
+    profile_tier_lvl_info_t s_ptl;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+
+
+    BITS_PARSE("video_parameter_set_id", value, ps_bitstrm, 4);
+    vps_id = value;
+    vps_id = CLIP3(vps_id, 0, MAX_VPS_CNT - 1);
+
+    BITS_PARSE("sps_max_sub_layers_minus1", value, ps_bitstrm, 3);
+    sps_max_sub_layers = value + 1;
+    sps_max_sub_layers = CLIP3(sps_max_sub_layers, 1, 7);
+
+    BITS_PARSE("sps_temporal_id_nesting_flag", value, ps_bitstrm, 1);
+    sps_temporal_id_nesting_flag = value;
+
+    //profile_and_level( 1, sps_max_sub_layers_minus1 )
+    ret = ihevcd_profile_tier_level(ps_bitstrm, &(s_ptl), 1,
+                                    (sps_max_sub_layers - 1));
+
+    UEV_PARSE("seq_parameter_set_id", value, ps_bitstrm);
+    sps_id = value;
+
+    if((sps_id >= MAX_SPS_CNT) || (sps_id < 0))
+    {
+        if(ps_codec->i4_sps_done)
+            return IHEVCD_UNSUPPORTED_SPS_ID;
+        else
+            sps_id = 0;
+    }
+
+
+    ps_sps = (ps_codec->s_parse.ps_sps_base + MAX_SPS_CNT - 1);
+    ps_sps->i1_sps_id = sps_id;
+    ps_sps->i1_vps_id = vps_id;
+    ps_sps->i1_sps_max_sub_layers = sps_max_sub_layers;
+    ps_sps->i1_sps_temporal_id_nesting_flag = sps_temporal_id_nesting_flag;
+    /* This is used only during initialization to get reorder count etc */
+    ps_codec->i4_sps_id = sps_id;
+    memcpy(&ps_sps->s_ptl, &s_ptl, sizeof(profile_tier_lvl_info_t));
+
+    UEV_PARSE("chroma_format_idc", value, ps_bitstrm);
+    ps_sps->i1_chroma_format_idc = value;
+
+    if(ps_sps->i1_chroma_format_idc != CHROMA_FMT_IDC_YUV420)
+    {
+        ps_codec->s_parse.i4_error_code = IHEVCD_UNSUPPORTED_CHROMA_FMT_IDC;
+        return (IHEVCD_ERROR_T)IHEVCD_UNSUPPORTED_CHROMA_FMT_IDC;
+    }
+
+    if(CHROMA_FMT_IDC_YUV444_PLANES == ps_sps->i1_chroma_format_idc)
+    {
+        BITS_PARSE("separate_colour_plane_flag", value, ps_bitstrm, 1);
+        ps_sps->i1_separate_colour_plane_flag = value;
+    }
+    else
+    {
+        ps_sps->i1_separate_colour_plane_flag = 0;
+    }
+
+    UEV_PARSE("pic_width_in_luma_samples", value, ps_bitstrm);
+    ps_sps->i2_pic_width_in_luma_samples = value;
+
+    UEV_PARSE("pic_height_in_luma_samples", value, ps_bitstrm);
+    ps_sps->i2_pic_height_in_luma_samples = value;
+
+    if((0 >= ps_sps->i2_pic_width_in_luma_samples) || (0 >= ps_sps->i2_pic_height_in_luma_samples))
+        return IHEVCD_INVALID_PARAMETER;
+
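+    /* Reject dimensions that exceed the configured maximums; the height is
+     * checked against MAX(max_wd, max_ht) so that, for instance, a portrait
+     * stream may use the larger dimension as long as the total area fits */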
+    if((ps_sps->i2_pic_width_in_luma_samples > ps_codec->i4_max_wd) ||
+       (ps_sps->i2_pic_width_in_luma_samples * ps_sps->i2_pic_height_in_luma_samples >
+                       ps_codec->i4_max_wd * ps_codec->i4_max_ht) ||
+       (ps_sps->i2_pic_height_in_luma_samples > MAX(ps_codec->i4_max_wd, ps_codec->i4_max_ht)))
+    {
+        return (IHEVCD_ERROR_T)IHEVCD_UNSUPPORTED_DIMENSIONS;
+    }
+
+    BITS_PARSE("pic_cropping_flag", value, ps_bitstrm, 1);
+    ps_sps->i1_pic_cropping_flag = value;
+
+    if(ps_sps->i1_pic_cropping_flag)
+    {
+
+        UEV_PARSE("pic_crop_left_offset", value, ps_bitstrm);
+        ps_sps->i2_pic_crop_left_offset = value;
+
+        UEV_PARSE("pic_crop_right_offset", value, ps_bitstrm);
+        ps_sps->i2_pic_crop_right_offset = value;
+
+        UEV_PARSE("pic_crop_top_offset", value, ps_bitstrm);
+        ps_sps->i2_pic_crop_top_offset = value;
+
+        UEV_PARSE("pic_crop_bottom_offset", value, ps_bitstrm);
+        ps_sps->i2_pic_crop_bottom_offset = value;
+    }
+    else
+    {
+        ps_sps->i2_pic_crop_left_offset = 0;
+        ps_sps->i2_pic_crop_right_offset = 0;
+        ps_sps->i2_pic_crop_top_offset = 0;
+        ps_sps->i2_pic_crop_bottom_offset = 0;
+    }
+
+
+    UEV_PARSE("bit_depth_luma_minus8", value, ps_bitstrm);
+    if(0 != value)
+        return IHEVCD_UNSUPPORTED_BIT_DEPTH;
+
+    UEV_PARSE("bit_depth_chroma_minus8", value, ps_bitstrm);
+    if(0 != value)
+        return IHEVCD_UNSUPPORTED_BIT_DEPTH;
+
+    UEV_PARSE("log2_max_pic_order_cnt_lsb_minus4", value, ps_bitstrm);
+    ps_sps->i1_log2_max_pic_order_cnt_lsb = value + 4;
+
+    BITS_PARSE("sps_sub_layer_ordering_info_present_flag", value, ps_bitstrm, 1);
+    ps_sps->i1_sps_sub_layer_ordering_info_present_flag = value;
+
+
+    i = (ps_sps->i1_sps_sub_layer_ordering_info_present_flag ? 0 : (ps_sps->i1_sps_max_sub_layers - 1));
+    for(; i < ps_sps->i1_sps_max_sub_layers; i++)
+    {
+        UEV_PARSE("max_dec_pic_buffering", value, ps_bitstrm);
+        ps_sps->ai1_sps_max_dec_pic_buffering[i] = value + 1;
+
+        UEV_PARSE("num_reorder_pics", value, ps_bitstrm);
+        ps_sps->ai1_sps_max_num_reorder_pics[i] = value;
+
+        UEV_PARSE("max_latency_increase", value, ps_bitstrm);
+        ps_sps->ai1_sps_max_latency_increase[i] = value;
+    }
+    UEV_PARSE("log2_min_coding_block_size_minus3", value, ps_bitstrm);
+    ps_sps->i1_log2_min_coding_block_size = value + 3;
+
+    UEV_PARSE("log2_diff_max_min_coding_block_size", value, ps_bitstrm);
+    ps_sps->i1_log2_diff_max_min_coding_block_size = value;
+
+    UEV_PARSE("log2_min_transform_block_size_minus2", value, ps_bitstrm);
+    ps_sps->i1_log2_min_transform_block_size = value + 2;
+
+    UEV_PARSE("log2_diff_max_min_transform_block_size", value, ps_bitstrm);
+    ps_sps->i1_log2_diff_max_min_transform_block_size = value;
+
+    ps_sps->i1_log2_max_transform_block_size = ps_sps->i1_log2_min_transform_block_size +
+                    ps_sps->i1_log2_diff_max_min_transform_block_size;
+
+    ps_sps->i1_log2_ctb_size = ps_sps->i1_log2_min_coding_block_size +
+                    ps_sps->i1_log2_diff_max_min_coding_block_size;
+
+    if((ps_sps->i1_log2_min_coding_block_size < 3) ||
+                    (ps_sps->i1_log2_min_transform_block_size < 2) ||
+                    (ps_sps->i1_log2_diff_max_min_transform_block_size < 0) ||
+                    (ps_sps->i1_log2_max_transform_block_size > ps_sps->i1_log2_ctb_size) ||
+                    (ps_sps->i1_log2_ctb_size < 4) ||
+                    (ps_sps->i1_log2_ctb_size > 6))
+    {
+        return IHEVCD_INVALID_PARAMETER;
+    }
+
+    ps_sps->i1_log2_min_pcm_coding_block_size = 0;
+    ps_sps->i1_log2_diff_max_min_pcm_coding_block_size = 0;
+
+    UEV_PARSE("max_transform_hierarchy_depth_inter", value, ps_bitstrm);
+    ps_sps->i1_max_transform_hierarchy_depth_inter = value;
+
+    UEV_PARSE("max_transform_hierarchy_depth_intra", value, ps_bitstrm);
+    ps_sps->i1_max_transform_hierarchy_depth_intra = value;
+
+    /* The trace string uses 'enabled' (with a 'd') in order to match HM */
+    BITS_PARSE("scaling_list_enabled_flag", value, ps_bitstrm, 1);
+    ps_sps->i1_scaling_list_enable_flag = value;
+
+    if(ps_sps->i1_scaling_list_enable_flag)
+    {
+        COPY_DEFAULT_SCALING_LIST(ps_sps->pi2_scaling_mat);
+        BITS_PARSE("sps_scaling_list_data_present_flag", value, ps_bitstrm, 1);
+        ps_sps->i1_sps_scaling_list_data_present_flag = value;
+
+        if(ps_sps->i1_sps_scaling_list_data_present_flag)
+            ihevcd_scaling_list_data(ps_codec, ps_sps->pi2_scaling_mat);
+    }
+    else
+    {
+        COPY_FLAT_SCALING_LIST(ps_sps->pi2_scaling_mat);
+    }
+    /* String is asymmetric_motion_partitions_enabled_flag instead of amp_enabled_flag in order to match with HM */
+    BITS_PARSE("asymmetric_motion_partitions_enabled_flag", value, ps_bitstrm, 1);
+    ps_sps->i1_amp_enabled_flag = value;
+
+    BITS_PARSE("sample_adaptive_offset_enabled_flag", value, ps_bitstrm, 1);
+    ps_sps->i1_sample_adaptive_offset_enabled_flag = value;
+
+    BITS_PARSE("pcm_enabled_flag", value, ps_bitstrm, 1);
+    ps_sps->i1_pcm_enabled_flag = value;
+
+    if(ps_sps->i1_pcm_enabled_flag)
+    {
+        BITS_PARSE("pcm_sample_bit_depth_luma", value, ps_bitstrm, 4);
+        ps_sps->i1_pcm_sample_bit_depth_luma = value + 1;
+
+        BITS_PARSE("pcm_sample_bit_depth_chroma", value, ps_bitstrm, 4);
+        ps_sps->i1_pcm_sample_bit_depth_chroma = value + 1;
+
+        UEV_PARSE("log2_min_pcm_coding_block_size_minus3", value, ps_bitstrm);
+        ps_sps->i1_log2_min_pcm_coding_block_size = value + 3;
+
+        UEV_PARSE("log2_diff_max_min_pcm_coding_block_size", value, ps_bitstrm);
+        ps_sps->i1_log2_diff_max_min_pcm_coding_block_size = value;
+        BITS_PARSE("pcm_loop_filter_disable_flag", value, ps_bitstrm, 1);
+        ps_sps->i1_pcm_loop_filter_disable_flag = value;
+
+    }
+    UEV_PARSE("num_short_term_ref_pic_sets", value, ps_bitstrm);
+    ps_sps->i1_num_short_term_ref_pic_sets = value;
+
+    ps_sps->i1_num_short_term_ref_pic_sets = CLIP3(ps_sps->i1_num_short_term_ref_pic_sets, 0, MAX_STREF_PICS_SPS);
+
+    for(i = 0; i < ps_sps->i1_num_short_term_ref_pic_sets; i++)
+        ihevcd_short_term_ref_pic_set(ps_bitstrm, &ps_sps->as_stref_picset[0],
+                                      ps_sps->i1_num_short_term_ref_pic_sets,
+                                      i, &ps_sps->as_stref_picset[i]);
+
+    BITS_PARSE("long_term_ref_pics_present_flag", value, ps_bitstrm, 1);
+    ps_sps->i1_long_term_ref_pics_present_flag = value;
+
+    if(ps_sps->i1_long_term_ref_pics_present_flag)
+    {
+        UEV_PARSE("num_long_term_ref_pics_sps", value, ps_bitstrm);
+        ps_sps->i1_num_long_term_ref_pics_sps = value;
+
+        for(i = 0; i < ps_sps->i1_num_long_term_ref_pics_sps; i++)
+        {
+            BITS_PARSE("lt_ref_pic_poc_lsb_sps[ i ]", value, ps_bitstrm, ps_sps->i1_log2_max_pic_order_cnt_lsb);
+            ps_sps->ai1_lt_ref_pic_poc_lsb_sps[i] = value;
+
+            BITS_PARSE("used_by_curr_pic_lt_sps_flag[ i ]", value, ps_bitstrm, 1);
+            ps_sps->ai1_used_by_curr_pic_lt_sps_flag[i] = value;
+        }
+    }
+
+    BITS_PARSE("sps_temporal_mvp_enable_flag", value, ps_bitstrm, 1);
+    ps_sps->i1_sps_temporal_mvp_enable_flag = value;
+
+    /* Print matches HM 8-2 */
+    BITS_PARSE("sps_strong_intra_smoothing_enable_flag", value, ps_bitstrm, 1);
+    ps_sps->i1_strong_intra_smoothing_enable_flag = value;
+
+    BITS_PARSE("vui_parameters_present_flag", value, ps_bitstrm, 1);
+    ps_sps->i1_vui_parameters_present_flag = value;
+
+    if(ps_sps->i1_vui_parameters_present_flag)
+        ihevcd_parse_vui_parameters(ps_bitstrm,
+                                    &ps_sps->s_vui_parameters,
+                                    ps_sps->i1_sps_max_sub_layers - 1);
+
+    BITS_PARSE("sps_extension_flag", value, ps_bitstrm, 1);
+
+
+    {
+        WORD32 numerator;
+        WORD32 ceil_offset;
+
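+        /* Ceil-divide the picture dimensions by the CTB size: e.g. a
+         * 1920x1080 picture with i1_log2_ctb_size = 6 (64x64 CTBs) gives
+         * pic_wd_in_ctb = (1920 + 63) / 64 = 30 and
+         * pic_ht_in_ctb = (1080 + 63) / 64 = 17 */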
+        ceil_offset = (1 << ps_sps->i1_log2_ctb_size) - 1;
+        numerator = ps_sps->i2_pic_width_in_luma_samples;
+
+        ps_sps->i2_pic_wd_in_ctb = ((numerator + ceil_offset) /
+                        (1 << ps_sps->i1_log2_ctb_size));
+
+        numerator = ps_sps->i2_pic_height_in_luma_samples;
+        ps_sps->i2_pic_ht_in_ctb = ((numerator + ceil_offset) /
+                        (1 << ps_sps->i1_log2_ctb_size));
+
+        ps_sps->i4_pic_size_in_ctb = ps_sps->i2_pic_ht_in_ctb *
+                        ps_sps->i2_pic_wd_in_ctb;
+
+        if(0 == ps_codec->i4_sps_done)
+            ps_codec->s_parse.i4_next_ctb_indx = ps_sps->i4_pic_size_in_ctb;
+
+        numerator = ps_sps->i2_pic_width_in_luma_samples;
+        ps_sps->i2_pic_wd_in_min_cb = numerator  /
+                        (1 << ps_sps->i1_log2_min_coding_block_size);
+
+        numerator = ps_sps->i2_pic_height_in_luma_samples;
+        ps_sps->i2_pic_ht_in_min_cb = numerator  /
+                        (1 << ps_sps->i1_log2_min_coding_block_size);
+    }
+    if((0 != ps_codec->i4_first_pic_done) &&
+                    ((ps_codec->i4_wd != ps_sps->i2_pic_width_in_luma_samples) ||
+                    (ps_codec->i4_ht != ps_sps->i2_pic_height_in_luma_samples)))
+    {
+        ps_codec->i4_reset_flag = 1;
+        ps_codec->i4_error_code = IVD_RES_CHANGED;
+        return (IHEVCD_ERROR_T)IHEVCD_FAIL;
+    }
+
+    /* Update display width and display height */
+    {
+        WORD32 disp_wd, disp_ht;
+        WORD32 crop_unit_x, crop_unit_y;
+        crop_unit_x = 1;
+        crop_unit_y = 1;
+
+        if(CHROMA_FMT_IDC_YUV420 == ps_sps->i1_chroma_format_idc)
+        {
+            crop_unit_x = 2;
+            crop_unit_y = 2;
+        }
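+        /* e.g. a 1920x1088 coded picture with pic_crop_bottom_offset = 4 in
+         * 4:2:0 (crop_unit_y = 2) displays as 1920x1080 */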
+
+        disp_wd = ps_sps->i2_pic_width_in_luma_samples;
+        disp_wd -= ps_sps->i2_pic_crop_left_offset * crop_unit_x;
+        disp_wd -= ps_sps->i2_pic_crop_right_offset * crop_unit_x;
+
+
+        disp_ht = ps_sps->i2_pic_height_in_luma_samples;
+        disp_ht -= ps_sps->i2_pic_crop_top_offset * crop_unit_y;
+        disp_ht -= ps_sps->i2_pic_crop_bottom_offset * crop_unit_y;
+
+        if((0 >= disp_wd) || (0 >= disp_ht))
+            return IHEVCD_INVALID_PARAMETER;
+
+        ps_codec->i4_disp_wd = disp_wd;
+        ps_codec->i4_disp_ht = disp_ht;
+
+
+        ps_codec->i4_wd = ps_sps->i2_pic_width_in_luma_samples;
+        ps_codec->i4_ht = ps_sps->i2_pic_height_in_luma_samples;
+
+        {
+            WORD32 ref_strd;
+            ref_strd = ALIGN32(ps_sps->i2_pic_width_in_luma_samples + PAD_WD);
+            if(ps_codec->i4_strd < ref_strd)
+            {
+                ps_codec->i4_strd = ref_strd;
+            }
+        }
+
+        if(0 == ps_codec->i4_share_disp_buf)
+        {
+            if(ps_codec->i4_disp_strd < ps_codec->i4_disp_wd)
+            {
+                ps_codec->i4_disp_strd = ps_codec->i4_disp_wd;
+            }
+        }
+        else
+        {
+            if(ps_codec->i4_disp_strd < ps_codec->i4_strd)
+            {
+                ps_codec->i4_disp_strd = ps_codec->i4_strd;
+            }
+        }
+    }
+
+    ps_codec->i4_sps_done = 1;
+    return ret;
+}
+
+
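+/* Marks as invalid every PPS that refers to the given SPS id */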
+void ihevcd_unmark_pps(codec_t *ps_codec, WORD32 sps_id)
+{
+    WORD32 pps_id = 0;
+    pps_t *ps_pps = ps_codec->ps_pps_base;
+
+    for(pps_id = 0; pps_id < MAX_PPS_CNT - 1; pps_id++, ps_pps++)
+    {
+        if((ps_pps->i1_pps_valid) &&
+                        (ps_pps->i1_sps_id == sps_id))
+            ps_pps->i1_pps_valid = 0;
+    }
+}
+
+
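+/* Copies the SPS at sps_id_ref into the slot at sps_id. The destination's
+ * scaling matrix pointer is preserved and its contents copied; if the CTB
+ * size or picture dimensions change, dependent PPS are unmarked first */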
+void ihevcd_copy_sps(codec_t *ps_codec, WORD32 sps_id, WORD32 sps_id_ref)
+{
+    sps_t *ps_sps, *ps_sps_ref;
+    WORD16 *pi2_scaling_mat_backup;
+    WORD32 scaling_mat_size;
+
+    SCALING_MAT_SIZE(scaling_mat_size);
+    ps_sps_ref = ps_codec->ps_sps_base + sps_id_ref;
+    ps_sps = ps_codec->ps_sps_base + sps_id;
+
+    if(ps_sps->i1_sps_valid)
+    {
+        if((ps_sps->i1_log2_ctb_size != ps_sps_ref->i1_log2_ctb_size) ||
+                        (ps_sps->i2_pic_wd_in_ctb != ps_sps_ref->i2_pic_wd_in_ctb) ||
+                        (ps_sps->i2_pic_ht_in_ctb != ps_sps_ref->i2_pic_ht_in_ctb))
+        {
+            ihevcd_unmark_pps(ps_codec, sps_id);
+        }
+    }
+
+    pi2_scaling_mat_backup = ps_sps->pi2_scaling_mat;
+
+    memcpy(ps_sps, ps_sps_ref, sizeof(sps_t));
+    ps_sps->pi2_scaling_mat = pi2_scaling_mat_backup;
+    memcpy(ps_sps->pi2_scaling_mat, ps_sps_ref->pi2_scaling_mat, scaling_mat_size * sizeof(WORD16));
+    ps_sps->i1_sps_valid = 1;
+
+    ps_codec->s_parse.ps_sps = ps_sps;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses PPS (Picture Parameter Set)
+*
+* @par Description:
+*  Parse Picture Parameter Set as per Section 7.3.2.3
+* The pps is written to a temporary buffer and copied later to the
+* appropriate location
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_parse_pps(codec_t *ps_codec)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 value;
+    WORD32 pps_id;
+
+    pps_t *ps_pps;
+    sps_t *ps_sps;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+
+
+    if(0 == ps_codec->i4_sps_done)
+        return IHEVCD_INVALID_HEADER;
+
+    UEV_PARSE("pic_parameter_set_id", value, ps_bitstrm);
+
+    pps_id = value;
+    if((pps_id >= MAX_PPS_CNT) || (pps_id < 0))
+    {
+        if(ps_codec->i4_pps_done)
+            return IHEVCD_UNSUPPORTED_PPS_ID;
+        else
+            pps_id = 0;
+    }
+
+
+    ps_pps = (ps_codec->s_parse.ps_pps_base + MAX_PPS_CNT - 1);
+
+    ps_pps->i1_pps_id = pps_id;
+
+    UEV_PARSE("seq_parameter_set_id", value, ps_bitstrm);
+    ps_pps->i1_sps_id = value;
+    ps_pps->i1_sps_id = CLIP3(ps_pps->i1_sps_id, 0, MAX_SPS_CNT - 2);
+
+    ps_sps = (ps_codec->s_parse.ps_sps_base + ps_pps->i1_sps_id);
+
+    /* If the SPS being referred to has not been parsed, return an error;
+     * an earlier approach of copying an existing SPS is kept commented out below */
+    if(0 == ps_sps->i1_sps_valid)
+    {
+        return IHEVCD_INVALID_HEADER;
+
+/*
+        sps_t *ps_sps_ref = ps_codec->ps_sps_base;
+        while(0 == ps_sps_ref->i1_sps_valid)
+            ps_sps_ref++;
+        ihevcd_copy_sps(ps_codec, ps_pps->i1_sps_id, ps_sps_ref->i1_sps_id);
+*/
+    }
+
+    BITS_PARSE("dependent_slices_enabled_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_dependent_slice_enabled_flag = value;
+
+    BITS_PARSE("output_flag_present_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_output_flag_present_flag = value;
+
+    BITS_PARSE("num_extra_slice_header_bits", value, ps_bitstrm, 3);
+    ps_pps->i1_num_extra_slice_header_bits = value;
+
+
+    BITS_PARSE("sign_data_hiding_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_sign_data_hiding_flag = value;
+
+    BITS_PARSE("cabac_init_present_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_cabac_init_present_flag = value;
+
+    UEV_PARSE("num_ref_idx_l0_default_active_minus1", value, ps_bitstrm);
+    ps_pps->i1_num_ref_idx_l0_default_active = value + 1;
+
+    UEV_PARSE("num_ref_idx_l1_default_active_minus1", value, ps_bitstrm);
+    ps_pps->i1_num_ref_idx_l1_default_active = value + 1;
+
+    SEV_PARSE("pic_init_qp_minus26", value, ps_bitstrm);
+    ps_pps->i1_pic_init_qp = value + 26;
+
+    BITS_PARSE("constrained_intra_pred_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_constrained_intra_pred_flag = value;
+
+    BITS_PARSE("transform_skip_enabled_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_transform_skip_enabled_flag = value;
+
+    BITS_PARSE("cu_qp_delta_enabled_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_cu_qp_delta_enabled_flag = value;
+
+    if(ps_pps->i1_cu_qp_delta_enabled_flag)
+    {
+        UEV_PARSE("diff_cu_qp_delta_depth", value, ps_bitstrm);
+        ps_pps->i1_diff_cu_qp_delta_depth = value;
+    }
+    else
+    {
+        ps_pps->i1_diff_cu_qp_delta_depth = 0;
+    }
+    ps_pps->i1_log2_min_cu_qp_delta_size = ps_sps->i1_log2_ctb_size - ps_pps->i1_diff_cu_qp_delta_depth;
+    /* Trace name differs from the spec syntax element in order to match HM */
+    SEV_PARSE("cb_qp_offset", value, ps_bitstrm);
+    ps_pps->i1_pic_cb_qp_offset = value;
+
+    /* Trace name differs from the spec syntax element in order to match HM */
+    SEV_PARSE("cr_qp_offset", value, ps_bitstrm);
+    ps_pps->i1_pic_cr_qp_offset = value;
+
+    /* Trace name differs from the spec syntax element in order to match HM */
+    BITS_PARSE("slicelevel_chroma_qp_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_pic_slice_level_chroma_qp_offsets_present_flag = value;
+
+    BITS_PARSE("weighted_pred_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_weighted_pred_flag = value;
+
+    BITS_PARSE("weighted_bipred_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_weighted_bipred_flag = value;
+
+    BITS_PARSE("transquant_bypass_enable_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_transquant_bypass_enable_flag = value;
+
+    BITS_PARSE("tiles_enabled_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_tiles_enabled_flag = value;
+
+    BITS_PARSE("entropy_coding_sync_enabled_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_entropy_coding_sync_enabled_flag = value;
+
+    ps_pps->i1_loop_filter_across_tiles_enabled_flag = 0;
+    if(ps_pps->i1_tiles_enabled_flag)
+    {
+        UEV_PARSE("num_tile_columns_minus1", value, ps_bitstrm);
+        ps_pps->i1_num_tile_columns = value + 1;
+
+        UEV_PARSE("num_tile_rows_minus1", value, ps_bitstrm);
+        ps_pps->i1_num_tile_rows = value + 1;
+
+        if((ps_pps->i1_num_tile_columns < 1) ||
+                        (ps_pps->i1_num_tile_columns > ps_sps->i2_pic_wd_in_ctb) ||
+                        (ps_pps->i1_num_tile_rows < 1) ||
+                        (ps_pps->i1_num_tile_rows > ps_sps->i2_pic_ht_in_ctb))
+            return IHEVCD_INVALID_HEADER;
+
+        BITS_PARSE("uniform_spacing_flag", value, ps_bitstrm, 1);
+        ps_pps->i1_uniform_spacing_flag = value;
+
+
+        {
+
+            WORD32 start;
+            WORD32 i, j;
+
+
+            start = 0;
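+            /* Column widths: with uniform spacing, column i spans
+             * ((i + 1) * W) / N - (i * W) / N CTBs, so e.g. W = 10 CTBs
+             * split into N = 3 columns gives widths 3, 3 and 4. Otherwise
+             * the first N - 1 widths are parsed explicitly and the last
+             * column takes the remaining CTBs */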
+            for(i = 0; i < ps_pps->i1_num_tile_columns; i++)
+            {
+                tile_t *ps_tile;
+                if(!ps_pps->i1_uniform_spacing_flag)
+                {
+                    if(i < (ps_pps->i1_num_tile_columns - 1))
+                    {
+                        UEV_PARSE("column_width_minus1[ i ]", value, ps_bitstrm);
+                        value += 1;
+                    }
+                    else
+                    {
+                        value = ps_sps->i2_pic_wd_in_ctb - start;
+                    }
+                }
+                else
+                {
+                    value = ((i + 1) * ps_sps->i2_pic_wd_in_ctb) / ps_pps->i1_num_tile_columns -
+                                    (i * ps_sps->i2_pic_wd_in_ctb) / ps_pps->i1_num_tile_columns;
+                }
+
+                for(j = 0; j < ps_pps->i1_num_tile_rows; j++)
+                {
+                    ps_tile = ps_pps->ps_tile + j * ps_pps->i1_num_tile_columns + i;
+                    ps_tile->u1_pos_x = start;
+                    ps_tile->u2_wd = value;
+                }
+                start += value;
+
+                if((start > ps_sps->i2_pic_wd_in_ctb) ||
+                                (value <= 0))
+                    return IHEVCD_INVALID_HEADER;
+            }
+
+            start = 0;
+            for(i = 0; i < (ps_pps->i1_num_tile_rows); i++)
+            {
+                tile_t *ps_tile;
+                if(!ps_pps->i1_uniform_spacing_flag)
+                {
+                    if(i < (ps_pps->i1_num_tile_rows - 1))
+                    {
+
+                        UEV_PARSE("row_height_minus1[ i ]", value, ps_bitstrm);
+                        value += 1;
+                    }
+                    else
+                    {
+                        value = ps_sps->i2_pic_ht_in_ctb - start;
+                    }
+                }
+                else
+                {
+                    value = ((i + 1) * ps_sps->i2_pic_ht_in_ctb) / ps_pps->i1_num_tile_rows -
+                                    (i * ps_sps->i2_pic_ht_in_ctb) / ps_pps->i1_num_tile_rows;
+                }
+
+                for(j = 0; j < ps_pps->i1_num_tile_columns; j++)
+                {
+                    ps_tile = ps_pps->ps_tile + i * ps_pps->i1_num_tile_columns + j;
+                    ps_tile->u1_pos_y = start;
+                    ps_tile->u2_ht = value;
+                }
+                start += value;
+
+                if((start > ps_sps->i2_pic_ht_in_ctb) ||
+                                (value <= 0))
+                    return IHEVCD_INVALID_HEADER;
+            }
+        }
+
+
+        BITS_PARSE("loop_filter_across_tiles_enabled_flag", value, ps_bitstrm, 1);
+        ps_pps->i1_loop_filter_across_tiles_enabled_flag = value;
+
+    }
+    else
+    {
+        /* If tiles are not present, set the first tile in each PPS to have
+        width and height equal to the picture width and height in CTBs */
+        ps_pps->i1_num_tile_columns = 1;
+        ps_pps->i1_num_tile_rows = 1;
+        ps_pps->i1_uniform_spacing_flag = 1;
+
+        ps_pps->ps_tile->u1_pos_x = 0;
+        ps_pps->ps_tile->u1_pos_y = 0;
+        ps_pps->ps_tile->u2_wd = ps_sps->i2_pic_wd_in_ctb;
+        ps_pps->ps_tile->u2_ht = ps_sps->i2_pic_ht_in_ctb;
+    }
+
+    BITS_PARSE("loop_filter_across_slices_enabled_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_loop_filter_across_slices_enabled_flag = value;
+
+    BITS_PARSE("deblocking_filter_control_present_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_deblocking_filter_control_present_flag = value;
+
+    /* Default values */
+    ps_pps->i1_pic_disable_deblocking_filter_flag = 0;
+    ps_pps->i1_deblocking_filter_override_enabled_flag = 0;
+    ps_pps->i1_beta_offset_div2 = 0;
+    ps_pps->i1_tc_offset_div2 = 0;
+
+    if(ps_pps->i1_deblocking_filter_control_present_flag)
+    {
+
+        BITS_PARSE("deblocking_filter_override_enabled_flag", value, ps_bitstrm, 1);
+        ps_pps->i1_deblocking_filter_override_enabled_flag = value;
+
+        BITS_PARSE("pic_disable_deblocking_filter_flag", value, ps_bitstrm, 1);
+        ps_pps->i1_pic_disable_deblocking_filter_flag = value;
+
+        if(!ps_pps->i1_pic_disable_deblocking_filter_flag)
+        {
+
+            SEV_PARSE("pps_beta_offset_div2", value, ps_bitstrm);
+            ps_pps->i1_beta_offset_div2 = value;
+
+            SEV_PARSE("pps_tc_offset_div2", value, ps_bitstrm);
+            ps_pps->i1_tc_offset_div2 = value;
+
+        }
+    }
+
+    BITS_PARSE("pps_scaling_list_data_present_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_pps_scaling_list_data_present_flag = value;
+
+    if(ps_pps->i1_pps_scaling_list_data_present_flag)
+    {
+        COPY_DEFAULT_SCALING_LIST(ps_pps->pi2_scaling_mat);
+        ihevcd_scaling_list_data(ps_codec, ps_pps->pi2_scaling_mat);
+    }
+
+    BITS_PARSE("lists_modification_present_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_lists_modification_present_flag = value;
+    UEV_PARSE("log2_parallel_merge_level_minus2", value, ps_bitstrm);
+    ps_pps->i1_log2_parallel_merge_level = value + 2;
+
+    BITS_PARSE("slice_header_extension_present_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_slice_header_extension_present_flag = value;
+    /* Not present in HM */
+#if 0
+    BITS_PARSE("slice_extension_present_flag", value, ps_bitstrm, 1);
+    ps_pps->i1_slice_extension_present_flag = value;
+#endif
+    BITS_PARSE("pps_extension_flag", value, ps_bitstrm, 1);
+
+    ps_codec->i4_pps_done = 1;
+    return ret;
+}
+
+
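+/* Copies the PPS at pps_id_ref into the slot at pps_id, preserving the
+ * destination's scaling matrix and tile array pointers and copying their
+ * contents */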
+void ihevcd_copy_pps(codec_t *ps_codec, WORD32 pps_id, WORD32 pps_id_ref)
+{
+    pps_t *ps_pps, *ps_pps_ref;
+    WORD16 *pi2_scaling_mat_backup;
+    WORD32 scaling_mat_size;
+    tile_t *ps_tile_backup;
+    WORD32 max_tile_cols, max_tile_rows;
+
+    SCALING_MAT_SIZE(scaling_mat_size);
+    max_tile_cols = (ps_codec->i4_max_wd + MIN_TILE_WD - 1) / MIN_TILE_WD;
+    max_tile_rows = (ps_codec->i4_max_ht + MIN_TILE_HT - 1) / MIN_TILE_HT;
+
+    ps_pps_ref = ps_codec->ps_pps_base + pps_id_ref;
+    ps_pps = ps_codec->ps_pps_base + pps_id;
+
+    pi2_scaling_mat_backup = ps_pps->pi2_scaling_mat;
+    ps_tile_backup = ps_pps->ps_tile;
+
+    memcpy(ps_pps, ps_pps_ref, sizeof(pps_t));
+    ps_pps->pi2_scaling_mat = pi2_scaling_mat_backup;
+    ps_pps->ps_tile = ps_tile_backup;
+    memcpy(ps_pps->pi2_scaling_mat, ps_pps_ref->pi2_scaling_mat, scaling_mat_size * sizeof(WORD16));
+    memcpy(ps_pps->ps_tile, ps_pps_ref->ps_tile, max_tile_cols * max_tile_rows * sizeof(tile_t));
+
+    ps_pps->i1_pps_valid = 1;
+
+    ps_codec->s_parse.ps_pps = ps_pps;
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses SEI (Supplemental Enhancement Information)
+*
+* @par Description:
+*  Parses SEI (Supplemental Enhancement Information) as per Section 7.3.7
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_parse_sei(codec_t *ps_codec)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    UNUSED(ps_codec);
+#if 0
+
+    sei_message( )
+    {
+        payloadType = 0
+        while( next_bits(8) == 0xFF )
+        {
+            ff_byte  /* equal to 0xFF */
+            payloadType += 255
+        }
+
+        BITS_PARSE("last_payload_type_byte", value, ps_bitstrm, 1);
+        ps_sei->i1_last_payload_type_byte = value;
+
+        payloadType += last_payload_type_byte
+        payloadSize = 0
+        while(next_bits(8) == 0xFF)
+        {
+            ff_byte  /* equal to 0xFF */
+            payloadSize += 255
+        }
+
+        BITS_PARSE("last_payload_size_byte", value, ps_bitstrm, 1);
+        ps_sei->i1_last_payload_size_byte = value;
+
+        payloadSize += last_payload_size_byte
+        sei_payload( payloadType, payloadSize )
+    }
+
+#endif
+    return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses Access unit delimiter
+*
+* @par Description:
+*  Parses Access unit delimiter as per Section 7.3.2.5
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_parse_aud(codec_t *ps_codec)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    UNUSED(ps_codec);
+#if 0
+
+    access_unit_delimiter_rbsp( )
+    {
+
+        BITS_PARSE("pic_type", value, ps_bitstrm, 3);
+        ps_sei->i1_pic_type = value;
+
+        rbsp_trailing_bits( )
+    }
+
+
+#endif
+    return ret;
+}
+
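+/* Sign-extends a num_bits wide value to a full WORD32; assumes
+ * 0 < num_bits < 32. e.g. value = 6 (0b110) with num_bits = 3 has its
+ * MSB set and extends to -2 */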
+WORD32 ihevcd_extend_sign_bit(WORD32 value, WORD32 num_bits)
+{
+    WORD32 ret_value = value;
+    if(value >> (num_bits - 1))
+    {
+        ret_value |= (0xFFFFFFFF << num_bits);
+    }
+    return ret_value;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Calculate POC of the current slice
+*
+* @par Description:
+*  Calculates the current POC using the previous POC lsb and previous POC msb
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @param[in] ps_nal
+*  Pointer to the current NAL header
+*
+* @param[in] i1_log2_max_poc_lsb
+*  log2 of the max POC lsb
+*
+* @param[in] i2_poc_lsb
+*  Current POC lsb
+*
+* @returns  Current absolute POC
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_calc_poc(codec_t *ps_codec, nal_header_t *ps_nal, WORD8 i1_log2_max_poc_lsb, WORD32 i2_poc_lsb)
+{
+    WORD32 i4_abs_poc, i4_poc_msb;
+    WORD32 max_poc_lsb;
+    WORD8 i1_nal_unit_type = ps_nal->i1_nal_unit_type;
+    max_poc_lsb = (1 << i1_log2_max_poc_lsb);
+
+    if((!ps_codec->i4_first_pic_done) && (!ps_codec->i4_pic_present))
+        ps_codec->i4_prev_poc_msb = -2 * max_poc_lsb;
+
+    if(NAL_IDR_N_LP == i1_nal_unit_type
+                    || NAL_IDR_W_LP == i1_nal_unit_type
+                    || NAL_BLA_N_LP == i1_nal_unit_type
+                    || NAL_BLA_W_DLP == i1_nal_unit_type
+                    || NAL_BLA_W_LP == i1_nal_unit_type
+                    || (NAL_CRA == i1_nal_unit_type && !ps_codec->i4_first_pic_done))
+    {
+        i4_poc_msb = ps_codec->i4_prev_poc_msb + 2 * max_poc_lsb;
+        ps_codec->i4_prev_poc_lsb = 0;
+        ps_codec->i4_max_prev_poc_lsb = 0;
+//        ps_codec->i4_prev_poc_msb = 0;
+    }
+    else
+    {
+
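+        /* POC MSB derivation as per section 8.3.1. For illustration, with
+         * max_poc_lsb = 16: prev lsb = 14 and current lsb = 2 means the lsb
+         * wrapped around, so the msb increases by 16; prev lsb = 2 and
+         * current lsb = 14 means the current picture precedes the last wrap,
+         * so the msb decreases by 16 */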
+        if((i2_poc_lsb < ps_codec->i4_prev_poc_lsb)
+                        && ((ps_codec->i4_prev_poc_lsb - i2_poc_lsb) >= max_poc_lsb / 2))
+        {
+            i4_poc_msb = ps_codec->i4_prev_poc_msb + max_poc_lsb;
+        }
+        else if((i2_poc_lsb > ps_codec->i4_prev_poc_lsb)
+                        && ((i2_poc_lsb - ps_codec->i4_prev_poc_lsb) > max_poc_lsb / 2))
+        {
+            i4_poc_msb = ps_codec->i4_prev_poc_msb - max_poc_lsb;
+        }
+        else
+        {
+            i4_poc_msb = ps_codec->i4_prev_poc_msb;
+        }
+
+
+    }
+
+    i4_abs_poc = i4_poc_msb + i2_poc_lsb;
+    ps_codec->i4_max_prev_poc_lsb = MAX(ps_codec->i4_max_prev_poc_lsb, i2_poc_lsb);
+
+    {
+        WORD32 is_reference_nal = ((i1_nal_unit_type <= NAL_RSV_VCL_R15) &&
+                        (i1_nal_unit_type % 2 != 0)) ||
+                        ((i1_nal_unit_type >= NAL_BLA_W_LP) &&
+                         (i1_nal_unit_type <= NAL_RSV_RAP_VCL23));
+        WORD32 update_prev_poc = (is_reference_nal &&
+                        ((i1_nal_unit_type < NAL_RADL_N) ||
+                         (i1_nal_unit_type > NAL_RASL_R)));
+
+        if((0 == ps_nal->i1_nuh_temporal_id) &&
+                        (update_prev_poc))
+        {
+            ps_codec->i4_prev_poc_lsb = i2_poc_lsb;
+            ps_codec->i4_prev_poc_msb = i4_poc_msb;
+        }
+    }
+
+    return i4_abs_poc;
+}
+
+
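+/* Copies a parsed slice header to another slot, preserving the
+ * destination's entry point offset pointer */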
+void ihevcd_copy_slice_hdr(codec_t *ps_codec, WORD32 slice_idx, WORD32 slice_idx_ref)
+{
+    slice_header_t *ps_slice_hdr, *ps_slice_hdr_ref;
+    WORD32 *pu4_entry_offset_backup;
+
+    ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base + slice_idx;
+    ps_slice_hdr_ref = ps_codec->s_parse.ps_slice_hdr_base + slice_idx_ref;
+
+    pu4_entry_offset_backup = ps_slice_hdr->pu4_entry_point_offset;
+    memcpy(ps_slice_hdr, ps_slice_hdr_ref, sizeof(slice_header_t));
+    ps_slice_hdr->pu4_entry_point_offset = pu4_entry_offset_backup;
+}
+
+
+
diff --git a/decoder/ihevcd_parse_headers.h b/decoder/ihevcd_parse_headers.h
new file mode 100644
index 0000000..2139f64
--- /dev/null
+++ b/decoder/ihevcd_parse_headers.h
@@ -0,0 +1,48 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_parse_headers.h
+*
+* @brief
+*  Parsing of various headers like VPS, SPS, PPS etc
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PARSE_HEADERS_H_
+#define _IHEVCD_PARSE_HEADERS_H_
+
+void ihevcd_copy_sps(codec_t *ps_codec, WORD32 sps_id, WORD32 sps_id_ref);
+void ihevcd_copy_pps(codec_t *ps_codec, WORD32 pps_id, WORD32 pps_id_ref);
+void ihevcd_copy_slice_hdr(codec_t *ps_codec, WORD32 slice_idx, WORD32 slice_idx_ref);
+
+IHEVCD_ERROR_T ihevcd_parse_vps(codec_t *ps_codec);
+IHEVCD_ERROR_T ihevcd_parse_sps(codec_t *ps_codec);
+IHEVCD_ERROR_T ihevcd_parse_pps(codec_t *ps_codec);
+IHEVCD_ERROR_T ihevcd_parse_slice_header(codec_t *ps_codec,
+                                         nal_header_t *ps_nal);
+
+#endif /* _IHEVCD_PARSE_HEADERS_H_ */
diff --git a/decoder/ihevcd_parse_residual.c b/decoder/ihevcd_parse_residual.c
new file mode 100644
index 0000000..fc84fa3
--- /dev/null
+++ b/decoder/ihevcd_parse_residual.c
@@ -0,0 +1,905 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_parse_residual.c
+*
+* @brief
+*  Contains functions for parsing residual data at TU level
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_common_tables.h"
+#include "ihevc_error.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_parse_residual.h"
+#include "ihevcd_cabac.h"
+
+/**
+  *****************************************************************************
+  * @brief  returns context increment for sig coeff based on csbf neighbour
+  *         flags (bottom and right) and current coeff position in 4x4 block.
+  *         See section 9.3.3.1.4 for details on this context increment
+  *
+  * input   : neighbour csbf flags (bit0: right csbf, bit1: bottom csbf),
+  *           coeff idx in raster order (0-15)
+  *
+  * output  : context increment for sig coeff flag
+  *
+  *****************************************************************************
+  */
+const UWORD8 gau1_ihevcd_sigcoeff_ctxtinc[3][4][16] =
+{
+
+    {
+        /* nbr csbf = 0:  sigCtx = (xP+yP == 0) ? 2 : (xP+yP < 3) ? 1: 0 */
+        { 2,    1,    1,    1,    1,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
+        /* nbr csbf = 1:  sigCtx = (yP == 0) ? 2 : (yP == 1) ? 1: 0      */
+        { 2,    1,    2,    0,    1,    2,    0,    0,    1,    2,    0,    0,    1,    0,    0,    0 },
+        /* nbr csbf = 2:  sigCtx = (xP == 0) ? 2 : (xP == 1) ? 1: 0      */
+        { 2,    2,    1,    2,    1,    0,    2,    1,    0,    0,    1,    0,    0,    0,    0,    0 },
+        /* nbr csbf = 3:  sigCtx = 2                                     */
+        { 2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2 },
+    },
+    {
+        /* nbr csbf = 0:  sigCtx = (xP+yP == 0) ? 2 : (xP+yP < 3) ? 1: 0 */
+        { 2,    1,    1,    0,    1,    1,    0,    0,    1,    0,    0,    0,    0,    0,    0,    0 },
+        /* nbr csbf = 1:  sigCtx = (yP == 0) ? 2 : (yP == 1) ? 1: 0      */
+        { 2,    2,    2,    2,    1,    1,    1,    1,    0,    0,    0,    0,    0,    0,    0,    0 },
+        /* nbr csbf = 2:  sigCtx = (xP == 0) ? 2 : (xP == 1) ? 1: 0      */
+        { 2,    1,    0,    0,    2,    1,    0,    0,    2,    1,    0,    0,    2,    1,    0,    0 },
+        /* nbr csbf = 3:  sigCtx = 2                                     */
+        { 2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2 },
+    },
+    {
+        /* nbr csbf = 0:  sigCtx = (xP+yP == 0) ? 2 : (xP+yP < 3) ? 1: 0 */
+        { 2,    1,    1,    0,    1,    1,    0,    0,    1,    0,    0,    0,    0,    0,    0,    0 },
+        /* nbr csbf = 1:  sigCtx = (yP == 0) ? 2 : (yP == 1) ? 1: 0      */
+        { 2,    1,    0,    0,    2,    1,    0,    0,    2,    1,    0,    0,    2,    1,    0,    0 },
+        /* nbr csbf = 2:  sigCtx = (xP == 0) ? 2 : (xP == 1) ? 1: 0      */
+        { 2,    2,    2,    2,    1,    1,    1,    1,    0,    0,    0,    0,    0,    0,    0,    0 },
+        /* nbr csbf = 3:  sigCtx = 2                                     */
+        { 2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2 },
+    },
+
+
+};
+
+
+
+/**
+  *****************************************************************************
+  * @brief  returns context increment for sig coeff for 4x4 transform size as
+  *         per Table 9-39 in section 9.3.3.1.4
+  *
+  * input   : coeff idx in raster order (0-15)
+  *
+  * output  : context increment for sig coeff flag
+  *
+  *****************************************************************************
+  */
+const UWORD8 gau1_ihevcd_sigcoeff_ctxtinc_tr4[3][16] =
+{
+    /* Upright diagonal scan */
+    {
+        0,    2,    1,    6,
+        3,    4,    7,    6,
+        4,    5,    7,    8,
+        5,    8,    8,    8,
+    },
+    /* Horizontal scan */
+    {
+        0,    1,    4,    5,
+        2,    3,    4,    5,
+        6,    6,    8,    8,
+        7,    7,    8,    8,
+    },
+    /* Vertical scan */
+    {
+        0,    2,    6,    7,
+        1,    3,    6,    7,
+        4,    4,    8,    8,
+        5,    5,    8,    8,
+    },
+};
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses Residual coding
+*
+* @par Description:
+*  Parses residual coding as per Section 7.3.13
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @returns  error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_parse_residual_coding(codec_t *ps_codec,
+                                    WORD32 x0, WORD32 y0,
+                                    WORD32 log2_trafo_size,
+                                    WORD32 c_idx,
+                                    WORD32 intra_pred_mode)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 transform_skip_flag;
+    WORD32 value;
+    pps_t *ps_pps;
+    WORD32 last_scan_pos, last_sub_blk;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    WORD32 last_significant_coeff_x_prefix, last_significant_coeff_y_prefix;
+    WORD32 last_significant_coeff_x, last_significant_coeff_y;
+    const UWORD8 *pu1_scan_blk, *pu1_scan_coeff;
+    WORD32 scan_idx;
+    WORD32 i;
+    WORD32 sign_data_hiding_flag;
+    cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+    WORD32 gt1_ctxt = 1;
+    WORD32 c_max;
+    UWORD16 au2_csbf[9];
+    tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
+    WORD8 *pi1_num_coded_subblks;
+    WORD32 num_subblks;
+    WORD32 sig_coeff_base_ctxt, abs_gt1_base_ctxt;
+    UNUSED(x0);
+    UNUSED(y0);
+    ps_pps = ps_codec->s_parse.ps_pps;
+
+    sign_data_hiding_flag = ps_pps->i1_sign_data_hiding_flag;
+    transform_skip_flag = 0;
+    if(ps_pps->i1_transform_skip_enabled_flag &&
+       !ps_codec->s_parse.s_cu.i4_cu_transquant_bypass &&
+       (log2_trafo_size == 2))
+    {
+        WORD32 ctxt_idx;
+
+        if(!c_idx)
+        {
+            ctxt_idx = IHEVC_CAB_TFM_SKIP0;
+        }
+        else
+        {
+            ctxt_idx = IHEVC_CAB_TFM_SKIP12;
+        }
+        TRACE_CABAC_CTXT("transform_skip_flag", ps_cabac->u4_range, ctxt_idx);
+        value = ihevcd_cabac_decode_bin(ps_cabac,
+                                        ps_bitstrm,
+                                        ctxt_idx);
+        AEV_TRACE("transform_skip_flag", value, ps_cabac->u4_range);
+        transform_skip_flag = value;
+    }
+
+    /* decode last_coeff_x_prefix and last_coeff_y_prefix as truncated unary binarized codes */
+    {
+        WORD32 ctxt_idx_x, ctxt_idx_y, ctx_shift;
+        WORD32 ctx_offset;
+        c_max = (log2_trafo_size << 1) - 1;
+
+        if(!c_idx)
+        {
+            ctx_offset = (3 * (log2_trafo_size - 2)) + ((log2_trafo_size - 1) >> 2);
+            ctxt_idx_x = IHEVC_CAB_COEFFX_PREFIX + ctx_offset;
+            ctxt_idx_y = IHEVC_CAB_COEFFY_PREFIX + ctx_offset;
+            ctx_shift  = (log2_trafo_size + 1) >> 2;
+        }
+        else
+        {
+            ctxt_idx_x = IHEVC_CAB_COEFFX_PREFIX + 15;
+            ctxt_idx_y = IHEVC_CAB_COEFFY_PREFIX + 15;
+            ctx_shift  = log2_trafo_size  - 2;
+        }
+
+        TRACE_CABAC_CTXT("last_coeff_x_prefix", ps_cabac->u4_range, ctxt_idx_x);
+        last_significant_coeff_x_prefix = ihevcd_cabac_decode_bins_tunary(ps_cabac,
+                                                                          ps_bitstrm,
+                                                                          c_max,
+                                                                          ctxt_idx_x,
+                                                                          ctx_shift,
+                                                                          c_max);
+
+        AEV_TRACE("last_coeff_x_prefix", last_significant_coeff_x_prefix, ps_cabac->u4_range);
+
+        TRACE_CABAC_CTXT("last_coeff_y_prefix", ps_cabac->u4_range, ctxt_idx_y);
+        last_significant_coeff_y_prefix = ihevcd_cabac_decode_bins_tunary(ps_cabac,
+                                                                          ps_bitstrm,
+                                                                          c_max,
+                                                                          ctxt_idx_y,
+                                                                          ctx_shift,
+                                                                          c_max);
+
+        AEV_TRACE("last_coeff_y_prefix", last_significant_coeff_y_prefix, ps_cabac->u4_range);
+
+
+        last_significant_coeff_x = last_significant_coeff_x_prefix;
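+        /* Prefixes above 3 are refined by a bypass coded suffix: e.g.
+         * prefix = 5 gives one suffix bit and
+         * last_significant_coeff_x = (1 << 1) * 3 + suffix, i.e. 6 or 7 */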
+        if(last_significant_coeff_x_prefix > 3)
+        {
+            WORD32 suf_length = ((last_significant_coeff_x_prefix - 2) >> 1);
+
+            value = ihevcd_cabac_decode_bypass_bins(ps_cabac,
+                                                    ps_bitstrm,
+                                                    suf_length);
+
+            AEV_TRACE("last_coeff_x_suffix", value, ps_cabac->u4_range);
+
+
+            last_significant_coeff_x =
+                            (1 << ((last_significant_coeff_x_prefix >> 1) - 1)) *
+                            (2 + (last_significant_coeff_x_prefix & 1)) + value;
+        }
+
+
+        last_significant_coeff_y = last_significant_coeff_y_prefix;
+        if(last_significant_coeff_y_prefix > 3)
+        {
+            WORD32 suf_length = ((last_significant_coeff_y_prefix - 2) >> 1);
+            value = ihevcd_cabac_decode_bypass_bins(ps_cabac,
+                                                    ps_bitstrm,
+                                                    suf_length);
+
+            AEV_TRACE("last_coeff_y_suffix", value, ps_cabac->u4_range);
+            last_significant_coeff_y =
+                            (1 << ((last_significant_coeff_y_prefix >> 1) - 1)) *
+                            (2 + (last_significant_coeff_y_prefix & 1)) + value;
+        }
+
+    }
+
+    /* Choose a scan matrix based on intra flag, intra pred mode, transform size
+     and luma/chroma */
+    scan_idx = SCAN_DIAG_UPRIGHT;
+    if(PRED_MODE_INTRA == ps_codec->s_parse.s_cu.i4_pred_mode)
+    {
+        if((2 == log2_trafo_size) || ((3 == log2_trafo_size) && (0 == c_idx)))
+        {
+            if((6 <= intra_pred_mode) &&
+               (14 >= intra_pred_mode))
+            {
+                scan_idx = SCAN_VERT;
+            }
+            else if((22 <= intra_pred_mode) &&
+                    (30 >= intra_pred_mode))
+            {
+                scan_idx = SCAN_HORZ;
+            }
+        }
+    }
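+    /* e.g. intra mode 10 (pure horizontal) selects SCAN_VERT, while mode 26
+     * (pure vertical) selects SCAN_HORZ */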
+
+    /* If the scan is vertical, swap the X and Y positions */
+    if(SCAN_VERT == scan_idx)
+    {
+        SWAP(last_significant_coeff_x, last_significant_coeff_y);
+    }
+
+    {
+        WORD8 *pi1_scan_idx;
+        WORD8 *pi1_buf = (WORD8 *)ps_codec->s_parse.pv_tu_coeff_data;
+
+        /* First WORD8 gives number of coded subblocks */
+        pi1_num_coded_subblks = pi1_buf++;
+
+        /* Set number of coded subblocks in the current TU to zero */
+        /* This will be updated later */
+        *pi1_num_coded_subblks = 0;
+
+        /* Second WORD8 gives (scan idx << 1) | trans_skip */
+        pi1_scan_idx = pi1_buf++;
+        *pi1_scan_idx = (scan_idx << 1) | transform_skip_flag;
+
+        /* Store the incremented pointer in pv_tu_coeff_data */
+        ps_codec->s_parse.pv_tu_coeff_data = pi1_buf;
+
+    }
+    /**
+     * Given last_significant_coeff_x and last_significant_coeff_y, find the last sub block.
+     * This is done by ignoring the lower two bits of each coordinate and
+     * using the scan matrix for the lookup.
+     */
+
+    /* If transform is 4x4, last_sub_blk is zero */
+    last_sub_blk = 0;
+
+    /* If transform is larger than 4x4, then based on scan_idx and transform size, choose a scan table */
+
+    if(log2_trafo_size > 2)
+    {
+        WORD32 scan_pos;
+        WORD32 scan_mat_size;
+        pu1_scan_blk = (UWORD8 *)gapv_ihevc_scan[scan_idx * 3 + (log2_trafo_size - 2 - 1)];
+
+
+        /* Divide the current transform to 4x4 subblocks and count number of 4x4 in the first row */
+        /* This will be size of scan matrix to be used for subblock scanning */
+        scan_mat_size = 1 << (log2_trafo_size - 2);
+        scan_pos = ((last_significant_coeff_y >> 2) * scan_mat_size) +
+                        (last_significant_coeff_x >> 2);
+
+        last_sub_blk = pu1_scan_blk[scan_pos];
+    }
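+    /* e.g. for a 16x16 transform (scan_mat_size = 4) with the last coeff at
+     * (x, y) = (9, 6), the containing subblock is (col, row) = (2, 1) and
+     * scan_pos = 1 * 4 + 2 = 6 */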
+    pu1_scan_coeff  = &gau1_ihevc_scan4x4[scan_idx][0];
+
+    {
+        WORD32 scan_pos;
+
+        scan_pos = ((last_significant_coeff_y & 3) << 2) +
+                        (last_significant_coeff_x & 3);
+
+        last_scan_pos = pu1_scan_coeff[scan_pos];
+    }
+    pu1_scan_blk = (UWORD8 *)gapv_ihevc_invscan[scan_idx * 3 + (log2_trafo_size - 2 - 1)];
+    pu1_scan_coeff  = &gau1_ihevc_invscan4x4[scan_idx][0];
+
+    /* Set CSBF array to zero */
+    {
+        UWORD32 *pu4_csbf;
+        pu4_csbf = (void *)au2_csbf;
+        *pu4_csbf++ = 0;
+        *pu4_csbf++ = 0;
+        *pu4_csbf++ = 0;
+        *pu4_csbf = 0;
+        /* To avoid a check for y pos, 9th WORD16 in the array is set to zero */
+        au2_csbf[8] = 0;
+    }
+
+    /*************************************************************************/
+    /* derive base context index for sig coeff as per section 9.3.3.1.4      */
+    /* TODO; convert to look up based on luma/chroma, scan type and tfr size */
+    /*************************************************************************/
+    if(!c_idx)
+    {
+        sig_coeff_base_ctxt = IHEVC_CAB_COEFF_FLAG;
+        abs_gt1_base_ctxt = IHEVC_CAB_COEFABS_GRTR1_FLAG;
+
+        if(3 == log2_trafo_size)
+        {
+            /* 8x8 transform size */
+            sig_coeff_base_ctxt += (scan_idx == SCAN_DIAG_UPRIGHT) ? 9 : 15;
+        }
+        else  if(3 < log2_trafo_size)
+        {
+            /* larger transform sizes */
+            sig_coeff_base_ctxt += 21;
+        }
+    }
+    else
+    {
+        /* chroma context initializations */
+        sig_coeff_base_ctxt = IHEVC_CAB_COEFF_FLAG + 27;
+        abs_gt1_base_ctxt = IHEVC_CAB_COEFABS_GRTR1_FLAG + 16;
+
+        if(3 == log2_trafo_size)
+        {
+            /* 8x8 transform size */
+            sig_coeff_base_ctxt += 9;
+        }
+        else  if(3 < log2_trafo_size)
+        {
+            /* larger transform sizes */
+            sig_coeff_base_ctxt += 12;
+        }
+    }
+    num_subblks = 0;
+    /* Parse each 4x4 subblocks */
+    for(i = last_sub_blk; i >= 0; i--)
+    {
+        WORD32 sub_blk_pos;
+        WORD32 infer_sig_coeff_flag;
+        WORD32 cur_csbf;
+
+        WORD32 n;
+        WORD32 num_coeff;
+        /* Sig coeff map for 16 entries in raster scan order; the upper 16 bits
+         * are used, with the MSB giving the sig coeff flag for the 0th coeff
+         * and so on. UWORD16 would have been enough, but UWORD32 is kept as a
+         * code optimization: on ARM it saves unnecessary masking operations.
+         */
+        UWORD32 u4_sig_coeff_map_raster;
+        WORD32 sign_hidden;
+
+        /* Sig coeff map in scan order */
+        UWORD32 u4_sig_coeff_map;
+        WORD32 coeff_abs_level_greater2_flag;
+        UWORD32 u4_coeff_abs_level_greater1_map;
+        UWORD32 u4_coeff_abs_level_greater2_map;
+        UWORD32 u4_coeff_sign_map;
+        WORD32 first_sig_scan_pos, last_sig_scan_pos, num_greater1_flag, first_greater1_scan_pos;
+        WORD32  num_sig_coeff, sum_abs_level;
+        WORD32 nbr_csbf;
+
+
+        WORD32 ctxt_set;
+        WORD32 rice_param;
+        WORD32 xs, ys;
+
+
+        sub_blk_pos  = 0;
+        if(i && (log2_trafo_size > 2))
+            sub_blk_pos = pu1_scan_blk[i];
+
+        /* Get xs and ys from scan position */
+        /* This is needed for context modelling of significant coeff flag */
+        xs = sub_blk_pos & ((1 << (log2_trafo_size - 2)) - 1);
+        ys = sub_blk_pos >> (log2_trafo_size - 2);
+
+
+        /* Check if neighbor subblocks are coded */
+        {
+
+            nbr_csbf = 0;
+
+            /* Get Bottom sub blocks CSBF */
+            nbr_csbf |= (au2_csbf[ys + 1] >> xs) & 1;
+            nbr_csbf <<= 1;
+
+            /* Get Right sub blocks CSBF */
+            /* Even if xs is equal to (1 << (log2_trafo_size - 2 )) - 1,
+               since au2_csbf is set to zero at the beginning, csbf for
+               neighbor will be read as 0 */
+
+            nbr_csbf |= (au2_csbf[ys] >> (xs + 1)) & 1;
+
+
+        }
+        cur_csbf = 0;
+
+        /* The DC coeff is inferred only if coded_sub_block_flag is explicitly
+           parsed as 1, i.e. it is not inferred for the first and last subblocks */
+        infer_sig_coeff_flag = 0;
+        if((i < last_sub_blk) && (i > 0))
+        {
+            WORD32 ctxt_idx  = IHEVC_CAB_CODED_SUBLK_IDX;
+
+            /* ctxt based on right / bottom avail csbf, section 9.3.3.1.3 */
+            ctxt_idx += (nbr_csbf) ? 1 : 0;
+
+            /* Ctxt based on luma or chroma */
+            ctxt_idx += c_idx  ? 2 : 0;
+            TRACE_CABAC_CTXT("coded_sub_block_flag", ps_cabac->u4_range, ctxt_idx);
+            IHEVCD_CABAC_DECODE_BIN(cur_csbf, ps_cabac, ps_bitstrm, ctxt_idx);
+            AEV_TRACE("coded_sub_block_flag", cur_csbf, ps_cabac->u4_range);
+
+            infer_sig_coeff_flag = 1;
+        }
+        else /* if((i == last_sub_blk) || (sub_blk_pos == 0)) */
+        {
+            /* CSBF is set to 1 for first and last subblock */
+            /* Note for these subblocks sig_coeff_map is not inferred but instead parsed */
+            cur_csbf = 1;
+        }
+
+        /* Set current sub blocks CSBF */
+        {
+            UWORD32 u4_mask = 1 << xs;
+            if(cur_csbf)
+                au2_csbf[ys] |= u4_mask;
+            else
+                au2_csbf[ys] &= ~u4_mask;
+
+        }
+
+        /* If current subblock is not coded, proceed to the next subblock */
+        if(0 == cur_csbf)
+            continue;
+
+        n = 15;
+        u4_sig_coeff_map_raster = 0;
+        u4_sig_coeff_map = 0;
+        num_coeff = 0;
+        if(i == last_sub_blk)
+        {
+            WORD32 pos = ((last_significant_coeff_y & 3) << 2) +
+                            (last_significant_coeff_x & 3);
+            n = (last_scan_pos - 1);
+            /* Set Significant coeff map for last significant coeff flag as 1 */
+            u4_sig_coeff_map_raster = 1 << pos;
+            u4_sig_coeff_map = 1 << last_scan_pos;
+            num_coeff = 1;
+        }
+
+        for(; n >= 0; n--)
+        {
+            WORD32 significant_coeff_flag;
+
+            if(n > 0 || !infer_sig_coeff_flag)
+            {
+                //WORD32 coeff_pos;
+                WORD32 sig_ctxinc;
+                WORD32 ctxt_idx;
+
+                /* Coefficient position is needed for deriving context index for significant_coeff_flag */
+                //coeff_pos = pu1_scan_coeff[n];
+                /* derive the context inc as per section 9.3.3.1.4 */
+                sig_ctxinc = 0;
+                if(2 == log2_trafo_size)
+                {
+
+                    /* 4x4 transform size increment uses lookup */
+                    sig_ctxinc = gau1_ihevcd_sigcoeff_ctxtinc_tr4[scan_idx][n];
+                }
+                else if(n || i)
+                {
+                    /* ctxt for AC coeff depends on curpos and neigbour csbf */
+                    sig_ctxinc = gau1_ihevcd_sigcoeff_ctxtinc[scan_idx][nbr_csbf][n];
+
+                    /* based on luma subblock pos */
+                    sig_ctxinc += (i && (!c_idx)) ? 3 : 0;
+
+                }
+                else
+                {
+                    /* DC coeff has fixed context for luma and chroma */
+                    sig_coeff_base_ctxt = (0 == c_idx) ? IHEVC_CAB_COEFF_FLAG :
+                                                         (IHEVC_CAB_COEFF_FLAG + 27);
+                }
+
+                ctxt_idx = sig_ctxinc + sig_coeff_base_ctxt;
+                TRACE_CABAC_CTXT("significant_coeff_flag", ps_cabac->u4_range, ctxt_idx);
+                IHEVCD_CABAC_DECODE_BIN(significant_coeff_flag, ps_cabac,
+                                        ps_bitstrm,
+                                        ctxt_idx);
+                AEV_TRACE("significant_coeff_flag", significant_coeff_flag, ps_cabac->u4_range);
+
+
+                /* If at least one non-zero coeff is signalled then do not infer sig coeff map */
+                /* for (0,0) coeff in the current sub block */
+                if(significant_coeff_flag)
+                    infer_sig_coeff_flag = 0;
+
+//                u4_sig_coeff_map_raster |= significant_coeff_flag
+//                              << coeff_pos;
+                u4_sig_coeff_map |= significant_coeff_flag << n;
+                num_coeff += significant_coeff_flag;
+            }
+
+
+        }
+        /*********************************************************************/
+        /* If infer_sig_coeff_flag is 1 then treat the 0th coeff as non-zero */
+        /* If infer_sig_coeff_flag is zero, the significant_coeff_flag of    */
+        /* the 0th coeff is parsed in the above loop                         */
+        /*********************************************************************/
+        if(infer_sig_coeff_flag)
+        {
+            u4_sig_coeff_map_raster |= 1;
+            u4_sig_coeff_map |= 1;
+            num_coeff++;
+        }
+
+        /*********************************************************************/
+        /* The first subblock does not get an explicit csbf; it is assumed   */
+        /* to be 1. For this subblock all the parsed sig_coeff_flags can     */
+        /* turn out to be zero. In such a case proceed to the next subblock  */
+        /* (which ends parsing for the current transform block)              */
+        /*********************************************************************/
+
+        if(0 == num_coeff)
+            continue;
+
+        /* Increment number of coded subblocks for the current TU */
+        num_subblks++;
+
+        /* Set sig coeff map and subblock position */
+        ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)ps_codec->s_parse.pv_tu_coeff_data;
+        ps_tu_sblk_coeff_data->u2_sig_coeff_map = u4_sig_coeff_map;
+        ps_tu_sblk_coeff_data->u2_subblk_pos = (ys << 8) | xs;
+
+        first_sig_scan_pos = 16;
+        last_sig_scan_pos = -1;
+        num_greater1_flag = 0;
+        first_greater1_scan_pos = -1;
+        u4_coeff_abs_level_greater1_map = 0;
+
+
+        /* context set based on luma subblock pos */
+        ctxt_set = (i && (!c_idx)) ? 2 : 0;
+
+        /* See section 9.3.3.1.5           */
+        ctxt_set += (0 == gt1_ctxt) ? 1 : 0;
+
+        gt1_ctxt = 1;
+        /* Instead of initializing n to 15, set it to 31-CLZ(sig coeff map) */
+        {
+            UWORD32 u4_sig_coeff_map_shift;
+            UWORD32 clz;
+            clz = CLZ(u4_sig_coeff_map);
+            n = 31 - clz;
+            u4_sig_coeff_map_shift = u4_sig_coeff_map << clz;
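+            /* Worked example of the CLZ-based skip below: if u4_sig_coeff_map */
+            /* is 0x212 (bits 1, 4 and 9 set), clz = 22, so n starts at 9 and  */
+            /* u4_sig_coeff_map_shift = 0x84800000. Each iteration tests bit   */
+            /* 31 and then skips all leading zeros at once, so only scan       */
+            /* positions 9, 4 and 1 are visited                                */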
+            /* The for loop over n is changed to a do-while so it can break early once sig_coeff_map_shift becomes zero */
+            do
+            {
+                //WORD32 coeff_pos;
+                WORD32 ctxt_idx;
+
+                //TODO: Scan lookup will be removed later and instead u4_sig_coeff_map will be used
+                //coeff_pos = pu1_scan_coeff[n];
+
+                if((u4_sig_coeff_map_shift >> 31) & 1)
+                {
+
+                    /* abs_level_greater1_flag is sent for only the first 8 non-zero levels in a subblock */
+                    if(num_greater1_flag < 8)
+                    {
+                        WORD32 coeff_abs_level_greater1_flag;
+
+                        ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
+
+                        TRACE_CABAC_CTXT("coeff_abs_level_greater1_flag", ps_cabac->u4_range, ctxt_idx);
+                        IHEVCD_CABAC_DECODE_BIN(coeff_abs_level_greater1_flag, ps_cabac, ps_bitstrm, ctxt_idx);
+                        AEV_TRACE("coeff_abs_level_greater1_flag", coeff_abs_level_greater1_flag, ps_cabac->u4_range);
+
+                        u4_coeff_abs_level_greater1_map |= coeff_abs_level_greater1_flag << n;
+                        num_greater1_flag++;
+
+                        /* first_greater1_scan_pos is obtained using CLZ on u4_coeff_abs_level_greater1_map */
+                        /* outside the loop instead of the following check inside the loop                  */
+                        /* if( coeff_abs_level_greater1_flag && first_greater1_scan_pos == -1) */
+                        /*    first_greater1_scan_pos = n;                                     */
+
+                        if(coeff_abs_level_greater1_flag)
+                        {
+                            gt1_ctxt = 0;
+                        }
+                        else if(gt1_ctxt && (gt1_ctxt < 3))
+                        {
+                            gt1_ctxt++;
+                        }
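+                        /* gt1_ctxt state machine (per the if/else above): it  */
+                        /* starts at 1 for each context set, increments up to  */
+                        /* 3 while the decoded flags are 0, and drops to 0     */
+                        /* (and stays 0) once a greater1 flag of 1 is seen     */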
+
+                    }
+                    else
+                        break;
+
+                    /* Instead of computing the last and first significant scan positions */
+                    /* using the checks below, they are computed outside the loop         */
+                    /* using CLZ and CTZ on sig_coeff_map                                 */
+                    /* if(last_sig_scan_pos == -1)                          */
+                    /*    last_sig_scan_pos = n;                            */
+                    /*  first_sig_scan_pos = n;                             */
+                }
+                u4_sig_coeff_map_shift <<= 1;
+                n--;
+                /* If there are zero coeffs, then shift by as many zero coeffs and decrement n */
+                clz = CLZ(u4_sig_coeff_map_shift);
+                u4_sig_coeff_map_shift <<= clz;
+                n -= clz;
+            }while(u4_sig_coeff_map_shift);
+        }
+        /* At this point u4_sig_coeff_map is non-zero, i.e. it has at least one non-zero coeff */
+        last_sig_scan_pos = (31 - CLZ(u4_sig_coeff_map));
+        first_sig_scan_pos = CTZ(u4_sig_coeff_map);
+        sign_hidden = (((last_sig_scan_pos - first_sig_scan_pos) > 3) && !ps_codec->s_parse.s_cu.i4_cu_transquant_bypass);
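+        /* Sign data hiding: when the distance between the first and last     */
+        /* significant scan positions exceeds 3 (and transquant bypass is     */
+        /* off), the sign of the coeff at first_sig_scan_pos is not coded;    */
+        /* it is inferred from the parity of sum_abs_level further below      */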
+
+        u4_coeff_abs_level_greater2_map = 0;
+
+        if(u4_coeff_abs_level_greater1_map)
+        {
+            /* Check if the first level > 1 is greater than 2 */
+            WORD32 ctxt_idx;
+            first_greater1_scan_pos = (31 - CLZ(u4_coeff_abs_level_greater1_map));
+
+
+            ctxt_idx = IHEVC_CAB_COEFABS_GRTR2_FLAG;
+
+            ctxt_idx += (!c_idx) ? ctxt_set : (ctxt_set + 4);
+            TRACE_CABAC_CTXT("coeff_abs_level_greater2_flag", ps_cabac->u4_range, ctxt_idx);
+            IHEVCD_CABAC_DECODE_BIN(coeff_abs_level_greater2_flag, ps_cabac, ps_bitstrm, ctxt_idx);
+            AEV_TRACE("coeff_abs_level_greater2_flag", coeff_abs_level_greater2_flag, ps_cabac->u4_range);
+            u4_coeff_abs_level_greater2_map = coeff_abs_level_greater2_flag << first_greater1_scan_pos;
+        }
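+        /* Only one coeff_abs_level_greater2_flag is coded per subblock       */
+        /* (decoded above): for the coeff at the highest scan position whose  */
+        /* greater1 flag is set, i.e. the first one met in reverse scan order */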
+
+
+        u4_coeff_sign_map = 0;
+
+        /* Parse sign flags */
+        if(!sign_data_hiding_flag || !sign_hidden)
+        {
+            IHEVCD_CABAC_DECODE_BYPASS_BINS(value, ps_cabac, ps_bitstrm, num_coeff);
+            AEV_TRACE("sign_flags", value, ps_cabac->u4_range);
+            u4_coeff_sign_map = value << (32 - num_coeff);
+        }
+        else
+        {
+            IHEVCD_CABAC_DECODE_BYPASS_BINS(value, ps_cabac, ps_bitstrm, (num_coeff - 1));
+            AEV_TRACE("sign_flags", value, ps_cabac->u4_range);
+            u4_coeff_sign_map = value << (32 - (num_coeff - 1));
+        }
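+        /* The bypass-decoded sign bits are packed MSB-first so the level     */
+        /* loop below consumes them with a single shift: bit 31 holds the     */
+        /* sign of the coeff at the highest scan position, and                */
+        /* u4_coeff_sign_map <<= 1 moves on to the next one                   */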
+
+        num_sig_coeff = 0;
+        sum_abs_level = 0;
+        rice_param = 0;
+        {
+            UWORD32 clz;
+            UWORD32 u4_sig_coeff_map_shift;
+            clz = CLZ(u4_sig_coeff_map);
+            n = 31 - clz;
+            u4_sig_coeff_map_shift = u4_sig_coeff_map << clz;
+            /* The for loop over n is changed to a do-while so it can break early once sig_coeff_map_shift becomes zero */
+            do
+            {
+
+                if((u4_sig_coeff_map_shift >> 31) & 1)
+                {
+                    WORD32 base_lvl;
+                    WORD32 coeff_abs_level_remaining;
+                    WORD32 level;
+                    base_lvl = 1;
+
+                    /* Increment base_lvl if the coeff was signalled as greater than 1 */
+                    if((u4_coeff_abs_level_greater1_map >> n) & 1)
+                        base_lvl++;
+
+                    /* Increment base_lvl if the coeff was signalled as greater than 2 */
+                    if((u4_coeff_abs_level_greater2_map >> n) & 1)
+                        base_lvl++;
+
+                    /* If the level can exceed base_lvl (3/2/1 depending on the
+                     * greater1 and greater2 maps), the remaining part
+                     * (level - base_lvl) is signalled as bypass bins and decoded below
+                     */
+                    coeff_abs_level_remaining = 0;
+                    if(base_lvl == ((num_sig_coeff < 8) ? ((n == first_greater1_scan_pos) ? 3 : 2) : 1))
+                    {
+                        UWORD32 u4_prefix;
+                        WORD32 bin;
+
+                        u4_prefix = 0;
+
+                        do
+                        {
+                            IHEVCD_CABAC_DECODE_BYPASS_BIN(bin, ps_cabac, ps_bitstrm);
+                            u4_prefix++;
+
+                            if((WORD32)u4_prefix == 19 - rice_param)
+                            {
+                                bin = 1;
+                                break;
+                            }
+
+                        }while(bin);
+
+                        u4_prefix = u4_prefix - 1;
+                        if(u4_prefix < 3)
+                        {
+                            UWORD32 u4_suffix;
+
+                            coeff_abs_level_remaining = (u4_prefix << rice_param);
+                            if(rice_param)
+                            {
+                                IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_suffix, ps_cabac, ps_bitstrm, rice_param);
+
+                                coeff_abs_level_remaining |= u4_suffix;
+                            }
+                        }
+                        else
+                        {
+                            UWORD32 u4_suffix;
+                            UWORD32 u4_numbins;
+
+                            //u4_prefix = CLIP3(u4_prefix, 0, 19 - rice_param);
+
+                            u4_numbins = (u4_prefix - 3 + rice_param);
+                            coeff_abs_level_remaining = (((1 << (u4_prefix - 3)) + 3 - 1) << rice_param);
+                            if(u4_numbins)
+                            {
+                                IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_suffix, ps_cabac, ps_bitstrm, u4_numbins);
+                                coeff_abs_level_remaining += u4_suffix;
+                            }
+                        }
+
+
+                        AEV_TRACE("coeff_abs_level_remaining", coeff_abs_level_remaining, ps_cabac->u4_range);
+                        base_lvl += coeff_abs_level_remaining;
+
+                    }
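+                    /* Worked example of the prefix/suffix decode above, with */
+                    /* rice_param = 1: prefix bins "110" give u4_prefix = 2,  */
+                    /* so coeff_abs_level_remaining = (2 << 1) | suffix_bit,  */
+                    /* i.e. 4 or 5. A prefix of 4 (>= 3) switches to the      */
+                    /* exp-golomb escape: u4_numbins = 4 - 3 + 1 = 2 and the  */
+                    /* base value is ((1 << 1) + 2) << 1 = 8                  */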
+
+                    /* update the rice param based on coeff level */
+                    if((base_lvl > (3 << rice_param)) && (rice_param < 4))
+                    {
+                        rice_param++;
+                    }
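+                    /* Once a decoded level exceeds 3 << rice_param, longer   */
+                    /* codes become cheaper, so rice_param grows (saturating  */
+                    /* at 4) for the remaining coeffs of this subblock        */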
+
+                    /* Compute absolute level */
+                    level = base_lvl;
+
+                    /* Update level with the sign */
+                    if((u4_coeff_sign_map >> 31) & 1)
+                        level = -level;
+
+                    u4_coeff_sign_map <<= 1;
+                    /* Update sign in case sign is hidden */
+                    if(sign_data_hiding_flag && sign_hidden)
+                    {
+                        sum_abs_level += base_lvl;
+
+                        if(n == first_sig_scan_pos && ((sum_abs_level % 2) == 1))
+                            level = -level;
+                    }
+
+                    /* Store the resulting level in non-zero level array */
+                    ps_tu_sblk_coeff_data->ai2_level[num_sig_coeff++] = level;
+                    //AEV_TRACE("level", level, 0);
+                }
+                u4_sig_coeff_map_shift <<= 1;
+                n--;
+                /* If there are zero coeffs, then shift by as many zero coeffs and decrement n */
+                clz = CLZ(u4_sig_coeff_map_shift);
+                u4_sig_coeff_map_shift <<= clz;
+                n -= clz;
+
+
+            }while(u4_sig_coeff_map_shift);
+        }
+
+        /* Advance pv_tu_coeff_data past the current subblock's data */
+        {
+            UWORD8 *pu1_buf = (UWORD8 *)ps_codec->s_parse.pv_tu_coeff_data;
+            pu1_buf += sizeof(tu_sblk_coeff_data_t) - SUBBLK_COEFF_CNT * sizeof(WORD16);
+            pu1_buf += num_coeff * sizeof(WORD16);
+            ps_codec->s_parse.pv_tu_coeff_data = pu1_buf;
+
+        }
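+        /* tu_sblk_coeff_data_t is stored as a variable-length record: only   */
+        /* num_coeff of the SUBBLK_COEFF_CNT level entries are kept, so the   */
+        /* pointer advances by the header size plus num_coeff WORD16 levels   */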
+
+    }
+    /* Set number of coded sub blocks in the current TU */
+    *pi1_num_coded_subblks = num_subblks;
+
+    return ret;
+}
diff --git a/decoder/ihevcd_parse_residual.h b/decoder/ihevcd_parse_residual.h
new file mode 100644
index 0000000..792a162
--- /dev/null
+++ b/decoder/ihevcd_parse_residual.h
@@ -0,0 +1,45 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_parse_residual.h
+*
+* @brief
+*  Parsing of residual data
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PARSE_RESIDUAL_H_
+#define _IHEVCD_PARSE_RESIDUAL_H_
+WORD32 ihevcd_parse_residual_coding(codec_t *ps_codec,
+                                    WORD32 x0, WORD32 y0,
+                                    WORD32 log2_trafo_size,
+                                    WORD32 c_idx,
+                                    WORD32 intra_pred_mode);
+
+#endif /* _IHEVCD_PARSE_RESIDUAL_H_ */
diff --git a/decoder/ihevcd_parse_slice.c b/decoder/ihevcd_parse_slice.c
new file mode 100644
index 0000000..8f81e64
--- /dev/null
+++ b/decoder/ihevcd_parse_slice.c
@@ -0,0 +1,3525 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_parse_slice.c
+ *
+ * @brief
+ *  Contains functions for parsing slice data
+ *
+ * @author
+ *  Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_common_tables.h"
+#include "ihevc_error.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_parse_slice.h"
+#include "ihevcd_parse_residual.h"
+#include "ihevcd_cabac.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_intra_pred_mode_prediction.h"
+#include "ihevcd_common_tables.h"
+#include "ihevcd_process_slice.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+#include "ihevcd_debug.h"
+#include "ihevcd_get_mv.h"
+#include "ihevcd_boundary_strength.h"
+#include "ihevcd_ilf_padding.h"
+#include "ihevcd_statistics.h"
+/* Bit stream offset threshold */
+#define BITSTRM_OFF_THRS 8
+
+/**
+ * Table used to decode part_mode if AMP is enabled and current CU is not min CU
+ */
+const UWORD8 gau1_part_mode_amp[] = { PART_nLx2N, PART_nRx2N, PART_Nx2N, 0xFF, PART_2NxnU, PART_2NxnD, PART_2NxN, 0xFF };
+
+const UWORD32 gau4_ct_depth_mask[] = { 0x0, 0x55555555, 0xAAAAAAAA, 0xFFFFFFFF };
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Parses Transform tree syntax
+ *
+ * @par Description:
+ *  Parses Transform tree syntax as per Section:7.3.9.8
+ *
+ * @param[in] ps_codec
+ *  Pointer to codec context
+ *
+ * @returns  Status
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+WORD32 ihevcd_parse_transform_tree(codec_t *ps_codec,
+                                   WORD32 x0, WORD32 y0,
+                                   WORD32 cu_x_base, WORD32 cu_y_base,
+                                   WORD32 log2_trafo_size,
+                                   WORD32 trafo_depth,
+                                   WORD32 blk_idx,
+                                   WORD32 intra_pred_mode)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    sps_t *ps_sps;
+    pps_t *ps_pps;
+    WORD32 value;
+    WORD32 x1, y1;
+    WORD32 max_trafo_depth;
+
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    WORD32 intra_split_flag;
+    WORD32 split_transform_flag;
+    WORD32 ctxt_idx;
+    cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+
+    max_trafo_depth = ps_codec->s_parse.s_cu.i4_max_trafo_depth;
+    ps_sps = ps_codec->s_parse.ps_sps;
+    ps_pps = ps_codec->s_parse.ps_pps;
+    intra_split_flag = ps_codec->s_parse.s_cu.i4_intra_split_flag;
+
+    {
+        split_transform_flag = 0;
+        if((log2_trafo_size <= ps_sps->i1_log2_max_transform_block_size) &&
+                        (log2_trafo_size > ps_sps->i1_log2_min_transform_block_size) &&
+                        (trafo_depth < max_trafo_depth) &&
+                        !(intra_split_flag && (trafo_depth == 0)))
+        {
+            /* decode the split transform flag, context derived as per Table 9-37 */
+            ctxt_idx = IHEVC_CAB_SPLIT_TFM + (5 - log2_trafo_size);
+
+            TRACE_CABAC_CTXT("split_transform_flag", ps_cabac->u4_range, ctxt_idx);
+            split_transform_flag = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+            AEV_TRACE("split_transform_flag", split_transform_flag,
+                      ps_cabac->u4_range);
+
+        }
+        else
+        {
+            WORD32 inter_split_flag = 0;
+
+            if((0 == ps_sps->i1_max_transform_hierarchy_depth_inter) &&
+                            (PRED_MODE_INTER == ps_codec->s_parse.s_cu.i4_pred_mode) &&
+                            (PART_2Nx2N != ps_codec->s_parse.s_cu.i4_part_mode) &&
+                            (0 == trafo_depth))
+            {
+                inter_split_flag = 1;
+            }
+
+            if((log2_trafo_size > ps_sps->i1_log2_max_transform_block_size) ||
+                            ((1 == intra_split_flag) && (0 == trafo_depth)) ||
+                            (1 == inter_split_flag))
+            {
+                split_transform_flag = 1;
+            }
+        }
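+        /* Summary of the inference above: when the flag is not coded,        */
+        /* split_transform_flag is 1 if the block exceeds the max transform   */
+        /* size, if an intra NxN CU is at depth 0, or if inter_split_flag is  */
+        /* set; otherwise it is 0                                             */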
+
+        if(0 == trafo_depth)
+        {
+            ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth] = 0;
+            ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth] = 0;
+        }
+        else
+        {
+            ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth] = ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth - 1];
+            ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth] = ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth - 1];
+        }
+        if(trafo_depth == 0 || log2_trafo_size > 2)
+        {
+            ctxt_idx = IHEVC_CAB_CBCR_IDX + trafo_depth;
+            /* CBF for Cb/Cr is sent only if the parent CBF for Cb/Cr is non-zero */
+            if((trafo_depth == 0) || ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth - 1])
+            {
+                TRACE_CABAC_CTXT("cbf_cb", ps_cabac->u4_range, ctxt_idx);
+                value = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+                AEV_TRACE("cbf_cb", value, ps_cabac->u4_range);
+                ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth] = value;
+            }
+
+            if((trafo_depth == 0) || ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth - 1])
+            {
+                TRACE_CABAC_CTXT("cbf_cr", ps_cabac->u4_range, ctxt_idx);
+                value = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+                AEV_TRACE("cbf_cr", value, ps_cabac->u4_range);
+                ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth] = value;
+            }
+        }
+        if(split_transform_flag)
+        {
+            WORD32 intra_pred_mode_tmp;
+            x1 = x0 + ((1 << log2_trafo_size) >> 1);
+            y1 = y0 + ((1 << log2_trafo_size) >> 1);
+
+            /* For a transform depth of zero, the intra pred mode decoded at */
+            /* the CU level is sent to the transform tree nodes */
+            /* When the depth is non-zero, the intra pred mode of the parent */
+            /* node is sent; this passes the correct mode to all child nodes */
+            intra_pred_mode_tmp = trafo_depth ? intra_pred_mode : ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[0];
+            ihevcd_parse_transform_tree(ps_codec, x0, y0, x0, y0, log2_trafo_size - 1, trafo_depth + 1, 0, intra_pred_mode_tmp);
+
+            intra_pred_mode_tmp = trafo_depth ? intra_pred_mode : ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[1];
+            ihevcd_parse_transform_tree(ps_codec, x1, y0, x0, y0, log2_trafo_size - 1, trafo_depth + 1, 1, intra_pred_mode_tmp);
+
+            intra_pred_mode_tmp = trafo_depth ? intra_pred_mode : ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[2];
+            ihevcd_parse_transform_tree(ps_codec, x0, y1, x0, y0, log2_trafo_size - 1, trafo_depth + 1, 2, intra_pred_mode_tmp);
+
+            intra_pred_mode_tmp = trafo_depth ? intra_pred_mode : ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[3];
+            ihevcd_parse_transform_tree(ps_codec, x1, y1, x0, y0, log2_trafo_size - 1, trafo_depth + 1, 3, intra_pred_mode_tmp);
+
+        }
+        else
+        {
+            WORD32 ctb_x_base;
+            WORD32 ctb_y_base;
+            WORD32 cu_qp_delta_abs;
+
+
+
+            tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+            cu_qp_delta_abs = 0;
+            ctb_x_base = ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size;
+            ctb_y_base = ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size;
+
+            if((ps_codec->s_parse.s_cu.i4_pred_mode == PRED_MODE_INTRA) ||
+                            (trafo_depth != 0) ||
+                            (ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth]) ||
+                            (ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth]))
+            {
+                ctxt_idx = IHEVC_CAB_CBF_LUMA_IDX;
+                ctxt_idx += (trafo_depth == 0) ? 1 : 0;
+
+                TRACE_CABAC_CTXT("cbf_luma", ps_cabac->u4_range, ctxt_idx);
+                value = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+                AEV_TRACE("cbf_luma", value, ps_cabac->u4_range);
+
+                ps_codec->s_parse.s_cu.i1_cbf_luma = value;
+            }
+            else
+            {
+                ps_codec->s_parse.s_cu.i1_cbf_luma = 1;
+            }
+
+            /* Initialize ps_tu to default values */
+            /* If required change this to WORD32 packed write */
+            ps_tu->b1_cb_cbf = 0;
+            ps_tu->b1_cr_cbf = 0;
+            ps_tu->b1_y_cbf = 0;
+            ps_tu->b4_pos_x = ((x0 - ctb_x_base) >> 2);
+            ps_tu->b4_pos_y = ((y0 - ctb_y_base) >> 2);
+            ps_tu->b1_transquant_bypass = ps_codec->s_parse.s_cu.i4_cu_transquant_bypass;
+            ps_tu->b3_size = (log2_trafo_size - 2);
+            ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+
+            ps_tu->b6_luma_intra_mode = intra_pred_mode;
+            ps_tu->b3_chroma_intra_mode_idx = ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx;
+
+            /* Section:7.3.12  Transform unit syntax inlined here */
+            if(ps_codec->s_parse.s_cu.i1_cbf_luma ||
+                            ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth] ||
+                            ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth])
+            {
+                WORD32 intra_pred_mode_chroma;
+                if(ps_pps->i1_cu_qp_delta_enabled_flag && !ps_codec->s_parse.i4_is_cu_qp_delta_coded)
+                {
+
+
+                    WORD32 c_max        = TU_MAX_QP_DELTA_ABS;
+                    WORD32 ctxt_inc     = IHEVC_CAB_QP_DELTA_ABS;
+                    WORD32 ctxt_inc_max = CTXT_MAX_QP_DELTA_ABS;
+
+                    TRACE_CABAC_CTXT("cu_qp_delta_abs", ps_cabac->u4_range, ctxt_inc);
+                    /* qp_delta_abs is coded as a combination of truncated unary (tunary) and EG0 codes */
+                    /* See Table 9-32 and Table 9-37 for details on cu_qp_delta_abs */
+                    cu_qp_delta_abs = ihevcd_cabac_decode_bins_tunary(ps_cabac,
+                                                                      ps_bitstrm,
+                                                                      c_max,
+                                                                      ctxt_inc,
+                                                                      0,
+                                                                      ctxt_inc_max);
+                    if(cu_qp_delta_abs >= c_max)
+                    {
+                        value = ihevcd_cabac_decode_bypass_bins_egk(ps_cabac, ps_bitstrm, 0);
+                        cu_qp_delta_abs += value;
+                    }
+                    AEV_TRACE("cu_qp_delta_abs", cu_qp_delta_abs, ps_cabac->u4_range);
+
+
+                    ps_codec->s_parse.i4_is_cu_qp_delta_coded = 1;
+
+
+                    if(cu_qp_delta_abs)
+                    {
+                        value = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+                        AEV_TRACE("cu_qp_delta_sign", value, ps_cabac->u4_range);
+
+                        if(value)
+                            cu_qp_delta_abs = -cu_qp_delta_abs;
+
+                    }
+                    ps_codec->s_parse.s_cu.i4_cu_qp_delta = cu_qp_delta_abs;
+
+                }
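+                /* cu_qp_delta is decoded at most once until                  */
+                /* i4_is_cu_qp_delta_coded is cleared again (once per         */
+                /* quantization group as per the standard)                    */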
+
+                if(ps_codec->s_parse.s_cu.i1_cbf_luma)
+                {
+                    ps_tu->b1_y_cbf = 1;
+                    ihevcd_parse_residual_coding(ps_codec, x0, y0, log2_trafo_size, 0, intra_pred_mode);
+                }
+
+                if(4 == ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx)
+                    intra_pred_mode_chroma = ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[0];
+                else
+                {
+                    intra_pred_mode_chroma = gau1_intra_pred_chroma_modes[ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx];
+
+                    if(intra_pred_mode_chroma ==
+                                    ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[0])
+                    {
+                        intra_pred_mode_chroma = INTRA_ANGULAR(34);
+                    }
+
+                }
+                if(log2_trafo_size > 2)
+                {
+                    if(ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth])
+                    {
+                        ps_tu->b1_cb_cbf = 1;
+                        ihevcd_parse_residual_coding(ps_codec, x0, y0, log2_trafo_size - 1, 1, intra_pred_mode_chroma);
+                    }
+
+                    if(ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth])
+                    {
+                        ps_tu->b1_cr_cbf = 1;
+                        ihevcd_parse_residual_coding(ps_codec, x0, y0, log2_trafo_size - 1, 2, intra_pred_mode_chroma);
+                    }
+                }
+                else if(blk_idx == 3)
+                {
+                    if(ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth])
+                    {
+                        ps_tu->b1_cb_cbf = 1;
+                        ihevcd_parse_residual_coding(ps_codec, cu_x_base, cu_y_base, log2_trafo_size, 1, intra_pred_mode_chroma);
+                    }
+
+                    if(ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth])
+                    {
+                        ps_tu->b1_cr_cbf = 1;
+                        ihevcd_parse_residual_coding(ps_codec, cu_x_base, cu_y_base, log2_trafo_size, 2, intra_pred_mode_chroma);
+                    }
+                }
+                else
+                {
+                    //ps_tu->b1_chroma_present = 0;
+                    ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+                }
+            }
+            else
+            {
+                if((3 != blk_idx) && (2 == log2_trafo_size))
+                {
+                    ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+                }
+            }
+
+            /* Set the first TU in CU flag */
+            {
+                if((ps_codec->s_parse.s_cu.i4_pos_x << 3) == (ps_tu->b4_pos_x << 2) &&
+                                (ps_codec->s_parse.s_cu.i4_pos_y << 3) == (ps_tu->b4_pos_y << 2))
+                {
+                    ps_tu->b1_first_tu_in_cu = 1;
+                }
+                else
+                {
+                    ps_tu->b1_first_tu_in_cu = 0;
+                }
+            }
+            ps_codec->s_parse.ps_tu++;
+            ps_codec->s_parse.s_cu.i4_tu_cnt++;
+            ps_codec->s_parse.i4_pic_tu_idx++;
+        }
+    }
+    return ret;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Parses Motion vector difference
+ *
+ * @par Description:
+ *  Parses Motion vector difference as per Section:7.3.9.9
+ *
+ * @param[in] ps_codec
+ *  Pointer to codec context
+ *
+ * @returns  Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+IHEVCD_ERROR_T ihevcd_parse_mvd(codec_t *ps_codec, mv_t *ps_mv)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 value;
+    WORD32 abs_mvd;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    WORD32 abs_mvd_greater0_flag[2];
+    WORD32 abs_mvd_greater1_flag[2];
+    WORD32 ctxt_idx;
+    cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+
+
+    ctxt_idx  = IHEVC_CAB_MVD_GRT0;
+    /* decode abs_mvd_x > 0 */
+    TRACE_CABAC_CTXT("abs_mvd_greater0_flag[0]", ps_cabac->u4_range, ctxt_idx);
+    abs_mvd_greater0_flag[0] = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+    AEV_TRACE("abs_mvd_greater0_flag[0]", abs_mvd_greater0_flag[0], ps_cabac->u4_range);
+
+    /* decode abs_mvd_y > 0 */
+    TRACE_CABAC_CTXT("abs_mvd_greater0_flag[1]", ps_cabac->u4_range, ctxt_idx);
+    abs_mvd_greater0_flag[1] = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+    AEV_TRACE("abs_mvd_greater0_flag[1]", abs_mvd_greater0_flag[1], ps_cabac->u4_range);
+
+    ctxt_idx  = IHEVC_CAB_MVD_GRT1;
+    abs_mvd_greater1_flag[0] = 0;
+    abs_mvd_greater1_flag[1] = 0;
+
+    if(abs_mvd_greater0_flag[0])
+    {
+        TRACE_CABAC_CTXT("abs_mvd_greater1_flag[0]", ps_cabac->u4_range, ctxt_idx);
+        abs_mvd_greater1_flag[0] = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+        AEV_TRACE("abs_mvd_greater1_flag[0]", abs_mvd_greater1_flag[0], ps_cabac->u4_range);
+    }
+    if(abs_mvd_greater0_flag[1])
+    {
+        TRACE_CABAC_CTXT("abs_mvd_greater1_flag[1]", ps_cabac->u4_range, ctxt_idx);
+        abs_mvd_greater1_flag[1] = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+        AEV_TRACE("abs_mvd_greater1_flag[1]", abs_mvd_greater1_flag[1], ps_cabac->u4_range);
+    }
+    abs_mvd = 0;
+    if(abs_mvd_greater0_flag[0])
+    {
+        abs_mvd = 1;
+        if(abs_mvd_greater1_flag[0])
+        {
+            value = ihevcd_cabac_decode_bypass_bins_egk(ps_cabac, ps_bitstrm, 1);
+            AEV_TRACE("abs_mvd_minus2[0]", value, ps_cabac->u4_range);
+            abs_mvd = value + 2;
+        }
+        value = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+        AEV_TRACE("mvd_sign_flag[0]", value, ps_cabac->u4_range);
+        if(value)
+        {
+            abs_mvd = -abs_mvd;
+        }
+
+    }
+    ps_mv->i2_mvx = abs_mvd;
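+    /* Example of the decode above: greater0 = 1, greater1 = 1,               */
+    /* abs_mvd_minus2 (EG1) = 3 and sign = 1 give a component of              */
+    /* -(3 + 2) = -5; greater0 = 0 gives 0                                    */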
+    abs_mvd = 0;
+    if(abs_mvd_greater0_flag[1])
+    {
+        abs_mvd = 1;
+        if(abs_mvd_greater1_flag[1])
+        {
+            value = ihevcd_cabac_decode_bypass_bins_egk(ps_cabac, ps_bitstrm, 1);
+            AEV_TRACE("abs_mvd_minus2[1]", value, ps_cabac->u4_range);
+            abs_mvd = value + 2;
+
+        }
+        value = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+        AEV_TRACE("mvd_sign_flag[1]", value, ps_cabac->u4_range);
+
+        if(value)
+        {
+            abs_mvd = -abs_mvd;
+        }
+    }
+    ps_mv->i2_mvy = abs_mvd;
+
+    return ret;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Parses PCM sample
+ *
+ *
+ * @par Description:
+ *  Parses PCM sample as per Section:7.3.9.7 Pcm sample syntax
+ *
+ * @param[in] ps_codec
+ *  Pointer to codec context
+ *
+ * @returns  Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+IHEVCD_ERROR_T  ihevcd_parse_pcm_sample(codec_t *ps_codec,
+                                        WORD32 x0,
+                                        WORD32 y0,
+                                        WORD32 log2_cb_size)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+    sps_t *ps_sps;
+
+    WORD32 value;
+    WORD32 i;
+
+    WORD32 num_bits;
+    UWORD32 u4_sig_coeff_map;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+    tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
+    UWORD8 *pu1_coeff_data;
+    ps_sps = ps_codec->s_parse.ps_sps;
+
+    UNUSED(value);
+    UNUSED(ps_tu);
+    UNUSED(ps_cabac);
+    UNUSED(x0);
+    UNUSED(y0);
+
+    {
+        WORD8 *pi1_scan_idx;
+        WORD8 *pi1_buf = (WORD8 *)ps_codec->s_parse.pv_tu_coeff_data;
+        WORD8 *pi1_num_coded_subblks;
+
+        /* First WORD8 gives number of coded subblocks */
+        pi1_num_coded_subblks = pi1_buf++;
+
+        /* Set number of coded subblocks in the current TU to one */
+        /* For PCM there is only one subblock, which is the same size as the CU */
+        *pi1_num_coded_subblks = 1;
+
+        /* Second WORD8 gives (scan idx << 1) | trans_skip */
+        pi1_scan_idx = pi1_buf++;
+        *pi1_scan_idx = (0 << 1) | 1;
+
+        /* Store the incremented pointer in pv_tu_coeff_data */
+        ps_codec->s_parse.pv_tu_coeff_data = pi1_buf;
+
+    }
+
+    u4_sig_coeff_map = 0xFFFFFFFF;
+    ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)ps_codec->s_parse.pv_tu_coeff_data;
+    ps_tu_sblk_coeff_data->u2_sig_coeff_map = u4_sig_coeff_map;
+    ps_tu_sblk_coeff_data->u2_subblk_pos = 0;
+
+    pu1_coeff_data = (UWORD8 *)&ps_tu_sblk_coeff_data->ai2_level[0];
+
+    num_bits = ps_sps->i1_pcm_sample_bit_depth_luma;
+
+    for(i = 0; i < 1 << (log2_cb_size << 1); i++)
+    {
+        TRACE_CABAC_CTXT("pcm_sample_luma", ps_cabac->u4_range, 0);
+        BITS_PARSE("pcm_sample_luma", value, ps_bitstrm, num_bits);
+
+        //ps_pcmsample_t->i1_pcm_sample_luma[i] = value;
+        *pu1_coeff_data++ = value << (BIT_DEPTH_LUMA - num_bits);
+    }
+
+    num_bits = ps_sps->i1_pcm_sample_bit_depth_chroma;
+
+    for(i = 0; i < (1 << (log2_cb_size << 1)) >> 1; i++)
+    {
+        TRACE_CABAC_CTXT("pcm_sample_chroma", ps_cabac->u4_range, 0);
+        BITS_PARSE("pcm_sample_chroma", value, ps_bitstrm, num_bits);
+
+        // ps_pcmsample_t->i1_pcm_sample_chroma[i] = value;
+        *pu1_coeff_data++ = value << (BIT_DEPTH_CHROMA - num_bits);
+    }
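+    /* The parsed PCM samples are left-shifted so they are stored at the full */
+    /* internal bit depth, e.g. 6-bit samples with BIT_DEPTH_LUMA = 8 are     */
+    /* shifted left by 2                                                      */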
+
+    ps_codec->s_parse.pv_tu_coeff_data = pu1_coeff_data;
+
+    return ret;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Parses Prediction unit
+ *
+ * @par Description:
+ *  Parses Prediction unit as per Section:7.3.9.6
+ *
+ * @param[in] ps_codec
+ *  Pointer to codec context
+ *
+ * @returns  Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+IHEVCD_ERROR_T  ihevcd_parse_pu_mvp(codec_t *ps_codec, pu_t *ps_pu)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 value;
+    slice_header_t *ps_slice_hdr;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+    WORD32 inter_pred_idc;
+
+    ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+
+    if(ps_slice_hdr->i1_slice_type == BSLICE)
+    {
+        WORD32 pu_w_plus_pu_h;
+        WORD32 ctxt_idx;
+        /* required to check for the w + h == 12 case */
+        pu_w_plus_pu_h = ((ps_pu->b4_wd + 1) << 2) + ((ps_pu->b4_ht + 1) << 2);
+        if(12 == pu_w_plus_pu_h)
+        {
+            ctxt_idx = IHEVC_CAB_INTER_PRED_IDC + 4;
+            TRACE_CABAC_CTXT("inter_pred_idc", ps_cabac->u4_range, ctxt_idx);
+            inter_pred_idc = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm,
+                                                     ctxt_idx);
+        }
+        else
+        {
+            /* larger PUs can be encoded as bi_pred/l0/l1 inter_pred_idc */
+            WORD32 is_bipred;
+
+            ctxt_idx = IHEVC_CAB_INTER_PRED_IDC + ps_codec->s_parse.i4_ct_depth;
+            TRACE_CABAC_CTXT("inter_pred_idc", ps_cabac->u4_range, ctxt_idx);
+            is_bipred = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+            inter_pred_idc = PRED_BI;
+            if(!is_bipred)
+            {
+                ctxt_idx = IHEVC_CAB_INTER_PRED_IDC + 4;
+                inter_pred_idc = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm,
+                                                         ctxt_idx);
+            }
+        }
+
+        AEV_TRACE("inter_pred_idc", inter_pred_idc, ps_cabac->u4_range);
+    }
+    else
+        inter_pred_idc = PRED_L0;
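+    /* For w + h == 12 (8x4/4x8) PUs bi-prediction is disallowed, so a single */
+    /* bin above selects between the two uni-prediction lists (PRED_L0 /      */
+    /* PRED_L1); larger PUs first decode a bi/uni bin and, for uni, a second  */
+    /* bin selecting the list                                                 */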
+    ps_pu->mv.i1_l0_ref_idx = 0;
+    ps_pu->mv.i1_l1_ref_idx = 0;
+    /* Decode MVD for L0 for PRED_L0 or PRED_BI */
+    if(inter_pred_idc != PRED_L1)
+    {
+        WORD32 active_refs = ps_slice_hdr->i1_num_ref_idx_l0_active;
+        WORD32 ref_idx = 0;
+        WORD32 ctxt_idx;
+
+        if(active_refs > 1)
+        {
+            ctxt_idx = IHEVC_CAB_INTER_REF_IDX;
+            /* decode the context modelled first bin */
+            TRACE_CABAC_CTXT("ref_idx", ps_cabac->u4_range, ctxt_idx);
+            ref_idx = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+
+            if((active_refs > 2) && ref_idx)
+            {
+                WORD32 value;
+                /* decode the context modelled second bin */
+                ctxt_idx++;
+                value = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+                ref_idx += value;
+                if((active_refs > 3) && value)
+                {
+                    /* decode remaining bypass bins */
+                    ref_idx = ihevcd_cabac_decode_bypass_bins_tunary(ps_cabac,
+                                                                     ps_bitstrm,
+                                                                     (active_refs - 3)
+                    );
+                    ref_idx += 2;
+                }
+            }
+            AEV_TRACE("ref_idx", ref_idx, ps_cabac->u4_range);
+        }
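+        /* Example of the truncated unary decode above with active_refs = 4:  */
+        /* bin "0" gives ref_idx 0, "10" gives 1, and "11" followed by up to  */
+        /* (active_refs - 3) = 1 bypass bin gives 2 or 3                      */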
+
+        ref_idx = CLIP3(ref_idx, 0, MAX_DPB_SIZE - 1);
+        ps_pu->mv.i1_l0_ref_idx = ref_idx;
+
+        ihevcd_parse_mvd(ps_codec, &ps_pu->mv.s_l0_mv);
+
+        ctxt_idx = IHEVC_CAB_MVP_L0L1;
+        value = ihevcd_cabac_decode_bin(ps_cabac,
+                                        ps_bitstrm,
+                                        ctxt_idx);
+
+        AEV_TRACE("mvp_l0/l1_flag", value, ps_cabac->u4_range);
+
+        ps_pu->b1_l0_mvp_idx = value;
+
+    }
+    /* Decode MVD for L1 for PRED_L1 or PRED_BI */
+    if(inter_pred_idc != PRED_L0)
+    {
+        WORD32 active_refs = ps_slice_hdr->i1_num_ref_idx_l1_active;
+        WORD32 ref_idx = 0;
+        WORD32 ctxt_idx;
+
+        if(active_refs > 1)
+        {
+
+            ctxt_idx = IHEVC_CAB_INTER_REF_IDX;
+            TRACE_CABAC_CTXT("ref_idx", ps_cabac->u4_range, ctxt_idx);
+            /* decode the context modelled first bin */
+            ref_idx = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+
+            if((active_refs > 2) && ref_idx)
+            {
+                WORD32 value;
+                /* decode the context modelled second bin */
+                ctxt_idx++;
+                value = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+                ref_idx += value;
+                if((active_refs > 3) && value)
+                {
+                    /* decode remaining bypass bins */
+                    ref_idx = ihevcd_cabac_decode_bypass_bins_tunary(ps_cabac,
+                                                                     ps_bitstrm,
+                                                                     (active_refs - 3)
+                    );
+                    ref_idx += 2;
+                }
+            }
+
+            AEV_TRACE("ref_idx", ref_idx, ps_cabac->u4_range);
+        }
+
+        ref_idx = CLIP3(ref_idx, 0, MAX_DPB_SIZE - 1);
+        ps_pu->mv.i1_l1_ref_idx = ref_idx;
+
+        if(ps_slice_hdr->i1_mvd_l1_zero_flag && inter_pred_idc == PRED_BI)
+        {
+            ps_pu->mv.s_l1_mv.i2_mvx = 0;
+            ps_pu->mv.s_l1_mv.i2_mvy = 0;
+        }
+        else
+        {
+            ihevcd_parse_mvd(ps_codec, &ps_pu->mv.s_l1_mv);
+        }
+
+        ctxt_idx = IHEVC_CAB_MVP_L0L1;
+        value = ihevcd_cabac_decode_bin(ps_cabac,
+                                        ps_bitstrm,
+                                        ctxt_idx);
+
+        AEV_TRACE("mvp_l0/l1_flag", value, ps_cabac->u4_range);
+        ps_pu->b1_l1_mvp_idx = value;
+
+    }
+
+    ps_pu->b2_pred_mode = inter_pred_idc;
+    return ret;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Parses Prediction unit
+ *
+ * @par Description:
+ *  Parses Prediction unit as per Section:7.3.9.6
+ *
+ * @param[in] ps_codec
+ *  Pointer to codec context
+ *
+ * @returns  Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+IHEVCD_ERROR_T  ihevcd_parse_prediction_unit(codec_t *ps_codec,
+                                             WORD32 x0,
+                                             WORD32 y0,
+                                             WORD32 wd,
+                                             WORD32 ht)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    slice_header_t *ps_slice_hdr;
+    sps_t *ps_sps;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    WORD32 ctb_x_base;
+    WORD32 ctb_y_base;
+
+    pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+    cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+
+    ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+
+    /* Set PU structure to default values */
+    memset(ps_pu, 0, sizeof(pu_t));
+
+    ps_sps = ps_codec->s_parse.ps_sps;
+    ctb_x_base = ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size;
+    ctb_y_base = ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size;
+
+    ps_pu->b4_pos_x = (x0 - ctb_x_base) >> 2;
+    ps_pu->b4_pos_y = (y0 - ctb_y_base) >> 2;
+    ps_pu->b4_wd = (wd >> 2) - 1;
+    ps_pu->b4_ht = (ht >> 2) - 1;
+
+    ps_pu->b1_intra_flag = 0;
+    ps_pu->b3_part_mode = ps_codec->s_parse.s_cu.i4_part_mode;
+
+    if(PRED_MODE_SKIP == ps_codec->s_parse.s_cu.i4_pred_mode)
+    {
+        WORD32 merge_idx = 0;
+        if(ps_slice_hdr->i1_max_num_merge_cand > 1)
+        {
+            WORD32 ctxt_idx = IHEVC_CAB_MERGE_IDX_EXT;
+            WORD32 bin;
+
+            TRACE_CABAC_CTXT("merge_idx", ps_cabac->u4_range, ctxt_idx);
+            bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+            if(bin)
+            {
+                if(ps_slice_hdr->i1_max_num_merge_cand > 2)
+                {
+                    merge_idx = ihevcd_cabac_decode_bypass_bins_tunary(
+                                    ps_cabac, ps_bitstrm,
+                                    (ps_slice_hdr->i1_max_num_merge_cand - 2));
+                }
+                merge_idx++;
+            }
+            AEV_TRACE("merge_idx", merge_idx, ps_cabac->u4_range);
+        }
+        ps_pu->b1_merge_flag = 1;
+        ps_pu->b3_merge_idx = merge_idx;
+
+    }
+    else
+    {
+        /* MODE_INTER */
+        WORD32 merge_flag;
+        WORD32 ctxt_idx = IHEVC_CAB_MERGE_FLAG_EXT;
+        TRACE_CABAC_CTXT("merge_flag", ps_cabac->u4_range, ctxt_idx);
+        merge_flag = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+        AEV_TRACE("merge_flag", merge_flag, ps_cabac->u4_range);
+
+        ps_pu->b1_merge_flag = merge_flag;
+
+        if(merge_flag)
+        {
+            WORD32 merge_idx = 0;
+            if(ps_slice_hdr->i1_max_num_merge_cand > 1)
+            {
+                WORD32 ctxt_idx = IHEVC_CAB_MERGE_IDX_EXT;
+                WORD32 bin;
+                TRACE_CABAC_CTXT("merge_idx", ps_cabac->u4_range, ctxt_idx);
+                bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+                if(bin)
+                {
+                    if(ps_slice_hdr->i1_max_num_merge_cand > 2)
+                    {
+                        merge_idx = ihevcd_cabac_decode_bypass_bins_tunary(
+                                        ps_cabac, ps_bitstrm,
+                                        (ps_slice_hdr->i1_max_num_merge_cand - 2));
+                    }
+                    merge_idx++;
+                }
+                AEV_TRACE("merge_idx", merge_idx, ps_cabac->u4_range);
+            }
+
+            ps_pu->b3_merge_idx = merge_idx;
+        }
+        else
+        {
+            ihevcd_parse_pu_mvp(ps_codec, ps_pu);
+        }
+
+    }
+    STATS_UPDATE_PU_SIZE(ps_pu);
+    /* Increment PU pointer */
+    ps_codec->s_parse.ps_pu++;
+    ps_codec->s_parse.i4_pic_pu_idx++;
+    return ret;
+}
+
+
+WORD32 ihevcd_parse_part_mode_amp(cab_ctxt_t *ps_cabac, bitstrm_t *ps_bitstrm)
+{
+    WORD32 ctxt_idx = IHEVC_CAB_PART_MODE;
+    WORD32 part_mode_idx;
+    WORD32 part_mode;
+    WORD32 bin;
+
+    part_mode = 0;
+    TRACE_CABAC_CTXT("part_mode", ps_cabac->u4_range, ctxt_idx);
+    bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx++);
+
+    if(!bin)
+    {
+        bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx++);
+        part_mode_idx = bin;
+        part_mode_idx <<= 1;
+
+        /* The following takes care of the context increment for the 3rd bin */
+        /* in part_mode: when AMP is enabled and the current CB is not the   */
+        /* min CB, the context for the 3rd bin is 3 and not 2                */
+        ctxt_idx += 1;
+
+        bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+        part_mode_idx |= bin;
+
+        part_mode_idx <<= 1;
+        if(!bin)
+        {
+
+            bin = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+            part_mode_idx |= bin;
+        }
+        part_mode = gau1_part_mode_amp[part_mode_idx];
+    }
+    return part_mode;
+}
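+/* Mapping of the decoded bins to part_mode above: after the first bin is 0,  */
+/* the table index is (b << 2) | (c << 1) | d, where b and c are the next two */
+/* context-coded bins and d is a bypass bin decoded only when c is 0 (it      */
+/* stays 0 otherwise). Indices 3 and 7 are thus unreachable, which is why     */
+/* gau1_part_mode_amp holds 0xFF there                                        */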
+IHEVCD_ERROR_T ihevcd_parse_coding_unit_intra(codec_t *ps_codec,
+                                              WORD32 x0,
+                                              WORD32 y0,
+                                              WORD32 log2_cb_size)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    sps_t *ps_sps;
+    cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    WORD32 pcm_flag;
+    WORD32 value;
+    WORD32 cb_size = 1 << log2_cb_size;
+    WORD32 part_mode =  ps_codec->s_parse.s_cu.i4_part_mode;
+    tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+    pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+    WORD32 ctb_x_base;
+    WORD32 ctb_y_base;
+    ps_sps = ps_codec->s_parse.ps_sps;
+    ctb_x_base = ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size;
+    ctb_y_base = ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size;
+
+    memset(ps_pu, 0, sizeof(pu_t));
+    ps_pu->b1_intra_flag = 1;
+    ps_pu->b4_wd = (cb_size >> 2) - 1;
+    ps_pu->b4_ht = (cb_size >> 2) - 1;
+    ps_pu->b4_pos_x = (x0 - ctb_x_base) >> 2;
+    ps_pu->b4_pos_y = (y0 - ctb_y_base) >> 2;
+
+    pcm_flag = 0;
+    if((PART_2Nx2N == part_mode) && (ps_sps->i1_pcm_enabled_flag)
+                    && (log2_cb_size
+                                    >= ps_sps->i1_log2_min_pcm_coding_block_size)
+                    && (log2_cb_size
+                                    <= (ps_sps->i1_log2_min_pcm_coding_block_size + ps_sps->i1_log2_diff_max_min_pcm_coding_block_size)))
+    {
+        TRACE_CABAC_CTXT("pcm_flag", ps_cabac->u4_range, 0);
+        pcm_flag = ihevcd_cabac_decode_terminate(ps_cabac, ps_bitstrm);
+        AEV_TRACE("pcm_flag", pcm_flag, ps_cabac->u4_range);
+    }
+
+    ps_codec->s_parse.i4_cu_pcm_flag = pcm_flag;
+    if(pcm_flag)
+    {
+        UWORD8 *pu1_luma_intra_pred_mode_top, *pu1_luma_intra_pred_mode_left;
+        WORD32 i,  num_pred_blocks;
+
+        if(ps_codec->s_parse.s_bitstrm.u4_bit_ofst % 8)
+        {
+            TRACE_CABAC_CTXT("pcm_alignment_zero_bit", ps_cabac->u4_range, 0);
+            ihevcd_bits_flush_to_byte_boundary(&ps_codec->s_parse.s_bitstrm);
+            AEV_TRACE("pcm_alignment_zero_bit", 0, ps_cabac->u4_range);
+        }
+
+        ihevcd_parse_pcm_sample(ps_codec, x0, y0, log2_cb_size);
+
+        ihevcd_cabac_reset(&ps_codec->s_parse.s_cabac,
+                           &ps_codec->s_parse.s_bitstrm);
+
+        ps_tu = ps_codec->s_parse.ps_tu;
+        ps_tu->b1_cb_cbf = 1;
+        ps_tu->b1_cr_cbf = 1;
+        ps_tu->b1_y_cbf = 1;
+        ps_tu->b4_pos_x = ((x0 - ctb_x_base) >> 2);
+        ps_tu->b4_pos_y = ((y0 - ctb_y_base) >> 2);
+        ps_tu->b1_transquant_bypass = 1;
+        ps_tu->b3_size = (log2_cb_size - 2);
+        ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+        ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+        ps_tu->b6_luma_intra_mode   = INTRA_PRED_NONE;
+
+        /* Set the first TU in CU flag */
+        {
+            if((ps_codec->s_parse.s_cu.i4_pos_x << 3) == (ps_tu->b4_pos_x << 2) &&
+                            (ps_codec->s_parse.s_cu.i4_pos_y << 3) == (ps_tu->b4_pos_y << 2))
+            {
+                ps_tu->b1_first_tu_in_cu = 1;
+            }
+            else
+            {
+                ps_tu->b1_first_tu_in_cu = 0;
+            }
+        }
+
+        /* Update the intra pred mode for PCM to INTRA_DC(default mode) */
+        pu1_luma_intra_pred_mode_top = ps_codec->s_parse.pu1_luma_intra_pred_mode_top
+                        + (ps_codec->s_parse.s_cu.i4_pos_x * 2);
+
+        pu1_luma_intra_pred_mode_left = ps_codec->s_parse.pu1_luma_intra_pred_mode_left
+                        + (ps_codec->s_parse.s_cu.i4_pos_y * 2);
+
+        num_pred_blocks = 1; /* Because PCM part mode will be 2Nx2N */
+
+        ps_codec->s_func_selector.ihevc_memset_fptr(pu1_luma_intra_pred_mode_left, INTRA_DC, (cb_size / num_pred_blocks) / MIN_PU_SIZE);
+        ps_codec->s_func_selector.ihevc_memset_fptr(pu1_luma_intra_pred_mode_top, INTRA_DC, (cb_size / num_pred_blocks) / MIN_PU_SIZE);
+
+
+        /* Set no_loop_filter appropriately */
+        if(1 == ps_sps->i1_pcm_loop_filter_disable_flag)
+        {
+            UWORD8 *pu1_pic_no_loop_filter_flag;
+            WORD32 numbytes_row;
+            UWORD32 u4_mask;
+
+            pu1_pic_no_loop_filter_flag = ps_codec->s_parse.pu1_pic_no_loop_filter_flag;
+            numbytes_row =  (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+            pu1_pic_no_loop_filter_flag += (y0 / 8) * numbytes_row;
+            pu1_pic_no_loop_filter_flag += (x0 / 64);
+            /* Generate (cb_size / 8) number of 1s */
+            /* i.e. (1 << (log2_cb_size - 3)) number of 1s */
+            u4_mask = LSB_ONES((cb_size >> 3));
+            for(i = 0; i < (cb_size / 8); i++)
+            {
+                *pu1_pic_no_loop_filter_flag |= (u4_mask << (((x0) / 8) % 8));
+                pu1_pic_no_loop_filter_flag += numbytes_row;
+            }
+        }
+        /* Increment ps_tu and tu_idx */
+        ps_codec->s_parse.ps_tu++;
+        ps_codec->s_parse.s_cu.i4_tu_cnt++;
+        ps_codec->s_parse.i4_pic_tu_idx++;
+
+    }
+    else
+    {
+        WORD32 cnt = 0;
+        WORD32 i;
+        WORD32 part_cnt;
+
+        part_cnt = (part_mode == PART_NxN) ? 4 : 1;
+
+        for(i = 0; i < part_cnt; i++)
+        {
+            TRACE_CABAC_CTXT("prev_intra_pred_luma_flag", ps_cabac->u4_range, IHEVC_CAB_INTRA_LUMA_PRED_FLAG);
+            value = ihevcd_cabac_decode_bin(ps_cabac,
+                                            ps_bitstrm,
+                                            IHEVC_CAB_INTRA_LUMA_PRED_FLAG);
+
+            ps_codec->s_parse.s_cu.ai4_prev_intra_luma_pred_flag[i] =
+                            value;
+            AEV_TRACE("prev_intra_pred_luma_flag", value, ps_cabac->u4_range);
+        }
+
+        for(i = 0; i < part_cnt; i++)
+        {
+            if(ps_codec->s_parse.s_cu.ai4_prev_intra_luma_pred_flag[cnt])
+            {
+                value = ihevcd_cabac_decode_bypass_bins_tunary(ps_cabac, ps_bitstrm, 2);
+                AEV_TRACE("mpm_idx", value, ps_cabac->u4_range);
+                ps_codec->s_parse.s_cu.ai4_mpm_idx[cnt] = value;
+            }
+            else
+            {
+                value = ihevcd_cabac_decode_bypass_bins(ps_cabac, ps_bitstrm, 5);
+                AEV_TRACE("rem_intra_luma_pred_mode", value,
+                          ps_cabac->u4_range);
+                ps_codec->s_parse.s_cu.ai4_rem_intra_luma_pred_mode[cnt] =
+                                value;
+            }
+            cnt++;
+        }
+        TRACE_CABAC_CTXT("intra_chroma_pred_mode", ps_cabac->u4_range, IHEVC_CAB_CHROMA_PRED_MODE);
+        value = ihevcd_cabac_decode_bin(ps_cabac,
+                                        ps_bitstrm,
+                                        IHEVC_CAB_CHROMA_PRED_MODE);
+        ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx = 4;
+        if(value)
+        {
+            ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx =
+                            ihevcd_cabac_decode_bypass_bins(ps_cabac,
+                                                            ps_bitstrm, 2);
+        }
+        AEV_TRACE("intra_chroma_pred_mode",
+                  ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx,
+                  ps_cabac->u4_range);
+
+
+        ihevcd_intra_pred_mode_prediction(ps_codec, log2_cb_size, x0, y0);
+    }
+    STATS_UPDATE_PU_SIZE(ps_pu);
+    /* Increment PU pointer */
+    ps_codec->s_parse.ps_pu++;
+    ps_codec->s_parse.i4_pic_pu_idx++;
+
+    return ret;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Parses coding unit
+ *
+ * @par Description:
+ *  Parses coding unit as per Section:7.3.9.5
+ *
+ * @param[in] ps_codec
+ *  Pointer to codec context
+ *
+ * @returns  Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+IHEVCD_ERROR_T  ihevcd_parse_coding_unit(codec_t *ps_codec,
+                                         WORD32 x0,
+                                         WORD32 y0,
+                                         WORD32 log2_cb_size)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    sps_t *ps_sps;
+    pps_t *ps_pps;
+    WORD32 cb_size;
+    slice_header_t *ps_slice_hdr;
+    WORD32 skip_flag;
+    WORD32 pcm_flag;
+    UWORD32 *pu4_skip_top = ps_codec->s_parse.pu4_skip_cu_top;
+    UWORD32 u4_skip_left = ps_codec->s_parse.u4_skip_cu_left;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+
+    WORD32 cu_pos_x;
+    WORD32 cu_pos_y;
+    cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+
+    ASSERT(0 == (x0 % 8));
+    ASSERT(0 == (y0 % 8));
+
+    ps_codec->s_parse.s_cu.i4_tu_cnt = 0;
+    ps_sps = ps_codec->s_parse.ps_sps;
+    ps_pps = ps_codec->s_parse.ps_pps;
+
+    cu_pos_x = ps_codec->s_parse.s_cu.i4_pos_x;
+    cu_pos_y = ps_codec->s_parse.s_cu.i4_pos_y;
+
+
+
+    ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+
+
+    cb_size = 1 << log2_cb_size;
+
+    ps_codec->s_parse.s_cu.i4_cu_transquant_bypass = 0;
+
+    if(ps_pps->i1_transquant_bypass_enable_flag)
+    {
+        TRACE_CABAC_CTXT("cu_transquant_bypass_flag", ps_cabac->u4_range, IHEVC_CAB_CU_TQ_BYPASS_FLAG);
+        ps_codec->s_parse.s_cu.i4_cu_transquant_bypass =
+                        ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm,
+                                                IHEVC_CAB_CU_TQ_BYPASS_FLAG);
+        /* Update transquant_bypass in ps_tu */
+
+        AEV_TRACE("cu_transquant_bypass_flag", ps_codec->s_parse.s_cu.i4_cu_transquant_bypass,
+                  ps_cabac->u4_range);
+
+        if(ps_codec->s_parse.s_cu.i4_cu_transquant_bypass)
+        {
+            UWORD8 *pu1_pic_no_loop_filter_flag = ps_codec->s_parse.pu1_pic_no_loop_filter_flag;
+            UWORD32 u4_mask;
+            WORD32 i;
+            WORD32 numbytes_row;
+            numbytes_row =  (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+            pu1_pic_no_loop_filter_flag += (y0 / 8) * numbytes_row;
+            pu1_pic_no_loop_filter_flag += (x0 / 64);
+
+            /* Generate (cb_size / 8) ones */
+            /* i.e. (1 << (log2_cb_size - 3)) ones */
+            u4_mask = LSB_ONES((cb_size >> 3));
+            for(i = 0; i < (cb_size / 8); i++)
+            {
+                *pu1_pic_no_loop_filter_flag |= (u4_mask << (((x0) / 8) % 8));
+                pu1_pic_no_loop_filter_flag += numbytes_row;
+            }
+        }
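+        /* Worked example (illustrative, assuming LSB_ONES(n) yields n set LSBs):
+         * for a 16x16 CU at x0 = 32 in a 128-pixel-wide picture, numbytes_row = 2,
+         * u4_mask = LSB_ONES(2) = 0x3 and the shift is (32 / 8) % 8 = 4, so bits
+         * 4 and 5 are set in each of the two bitmap rows the CU covers (one byte
+         * covers eight 8x8 min CUs, i.e. 64 luma samples). */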
+    }
+
+    {
+        UWORD32 u4_skip_top = 0;
+        UWORD32 u4_mask;
+        UWORD32 u4_top_mask, u4_left_mask;
+        UWORD32 u4_min_cu_x = x0 / 8;
+        UWORD32 u4_min_cu_y = y0 / 8;
+
+        pu4_skip_top += (u4_min_cu_x / 32);
+
+
+        if(ps_slice_hdr->i1_slice_type != ISLICE)
+        {
+            WORD32 ctx_idx_inc;
+            ctx_idx_inc = 0;
+
+            if((0 != cu_pos_y) ||
+                            ((0 != ps_codec->s_parse.i4_ctb_slice_y) &&
+                                            (0 != ps_codec->s_parse.i4_ctb_tile_y)))
+            {
+                u4_skip_top = *pu4_skip_top;
+                u4_skip_top >>= (u4_min_cu_x % 32);
+                if(u4_skip_top & 1)
+                    ctx_idx_inc++;
+            }
+
+            /*****************************************************************/
+            /* If cu_pos_x is non-zero then left is available                */
+            /* If cu_pos_x is zero then ensure both the following are true   */
+            /*    Current CTB is not the first CTB in a tile row             */
+            /*    Current CTB is not the first CTB in a slice                */
+            /*****************************************************************/
+            if((0 != cu_pos_x) ||
+                            (((0 != ps_codec->s_parse.i4_ctb_slice_x) || (0 != ps_codec->s_parse.i4_ctb_slice_y)) &&
+                                            (0 != ps_codec->s_parse.i4_ctb_tile_x)))
+            {
+                u4_skip_left >>= (u4_min_cu_y % 32);
+                if(u4_skip_left & 1)
+                    ctx_idx_inc++;
+            }
+            TRACE_CABAC_CTXT("cu_skip_flag", ps_cabac->u4_range, (IHEVC_CAB_SKIP_FLAG + ctx_idx_inc));
+            skip_flag = ihevcd_cabac_decode_bin(ps_cabac,
+                                                ps_bitstrm,
+                                                (IHEVC_CAB_SKIP_FLAG + ctx_idx_inc));
+
+            AEV_TRACE("cu_skip_flag", skip_flag, ps_cabac->u4_range);
+        }
+        else
+            skip_flag = 0;
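+        /* Per the context derivation above, ctx_idx_inc counts how many of the
+         * available left and top neighbouring min CUs were coded as skip, giving
+         * one of three contexts (0..2) for cu_skip_flag. */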
+
+        /* Update top skip_flag */
+        u4_skip_top = *pu4_skip_top;
+        /* Since the max cb_size is 64, at most 8 bits will be set or reset */
+        /* Also, since the coding block lies within a 64x64 grid, only 8 bits within a WORD32
+         * need to be updated, and these 8 bits will not cross an 8-bit boundary
+         */
+        u4_mask = LSB_ONES(cb_size / 8);
+        u4_top_mask = u4_mask << (u4_min_cu_x % 32);
+
+
+        if(skip_flag)
+        {
+            u4_skip_top |= u4_top_mask;
+        }
+        else
+        {
+            u4_skip_top &= ~u4_top_mask;
+        }
+        *pu4_skip_top = u4_skip_top;
+
+        /* Update left skip_flag */
+        u4_skip_left = ps_codec->s_parse.u4_skip_cu_left;
+        u4_mask = LSB_ONES(cb_size / 8);
+        u4_left_mask = u4_mask << (u4_min_cu_y % 32);
+
+        if(skip_flag)
+        {
+            u4_skip_left |= u4_left_mask;
+        }
+        else
+        {
+            u4_skip_left &= ~u4_left_mask;
+        }
+        ps_codec->s_parse.u4_skip_cu_left = u4_skip_left;
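+        /* Illustration (one bit per 8x8 min CU): a 32x32 CU yields
+         * u4_mask = LSB_ONES(4) = 0xF, which is shifted to the CU's min-CU
+         * offset and then ORed in or cleared depending on skip_flag. */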
+    }
+    ps_codec->s_parse.i4_cu_pcm_flag = 0;
+
+    if(skip_flag)
+    {
+        WORD32 ctb_x_base;
+        WORD32 ctb_y_base;
+
+        ctb_x_base = ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size;
+        ctb_y_base = ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size;
+
+        ps_tu->b1_cb_cbf = 0;
+        ps_tu->b1_cr_cbf = 0;
+        ps_tu->b1_y_cbf = 0;
+        ps_tu->b4_pos_x = ((x0 - ctb_x_base) >> 2);
+        ps_tu->b4_pos_y = ((y0 - ctb_y_base) >> 2);
+        ps_tu->b1_transquant_bypass = 0;
+        ps_tu->b3_size = (log2_cb_size - 2);
+        ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+        ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+        ps_tu->b6_luma_intra_mode   = INTRA_PRED_NONE;
+
+        /* Set the first TU in CU flag */
+        {
+            if((ps_codec->s_parse.s_cu.i4_pos_x << 3) == (ps_tu->b4_pos_x << 2) &&
+                            (ps_codec->s_parse.s_cu.i4_pos_y << 3) == (ps_tu->b4_pos_y << 2))
+            {
+                ps_tu->b1_first_tu_in_cu = 1;
+            }
+            else
+            {
+                ps_tu->b1_first_tu_in_cu = 0;
+            }
+        }
+
+        ps_codec->s_parse.ps_tu++;
+        ps_codec->s_parse.s_cu.i4_tu_cnt++;
+        ps_codec->s_parse.i4_pic_tu_idx++;
+
+        ps_codec->s_parse.s_cu.i4_pred_mode = PRED_MODE_SKIP;
+        ps_codec->s_parse.s_cu.i4_part_mode = PART_2Nx2N;
+        {
+            pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+            ps_pu->b2_part_idx = 0;
+            ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size, cb_size);
+            STATS_UPDATE_PU_SKIP_SIZE(ps_pu);
+        }
+    }
+    else
+    {
+        WORD32 pred_mode;
+        WORD32 part_mode;
+        WORD32 intra_split_flag;
+        WORD32 is_mincb;
+        cb_size = (1 << log2_cb_size);
+        is_mincb = (cb_size == (1 << ps_sps->i1_log2_min_coding_block_size));
+        pcm_flag = 0;
+        if(ps_slice_hdr->i1_slice_type != ISLICE)
+        {
+            TRACE_CABAC_CTXT("pred_mode_flag", ps_cabac->u4_range, IHEVC_CAB_PRED_MODE);
+            pred_mode = ihevcd_cabac_decode_bin(ps_cabac,
+                                                ps_bitstrm,
+                                                IHEVC_CAB_PRED_MODE);
+
+            AEV_TRACE("pred_mode_flag", pred_mode, ps_cabac->u4_range);
+        }
+        else
+        {
+            pred_mode = PRED_MODE_INTRA;
+        }
+
+        /* If the current CU is intra, set the corresponding bits in the picture-level intra map */
+        if(PRED_MODE_INTRA == pred_mode)
+        {
+            UWORD8 *pu1_pic_intra_flag = ps_codec->s_parse.pu1_pic_intra_flag;
+            UWORD32 u4_mask;
+            WORD32 i;
+            WORD32 numbytes_row;
+            numbytes_row =  (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+            pu1_pic_intra_flag += (y0 / 8) * numbytes_row;
+            pu1_pic_intra_flag += (x0 / 64);
+
+            /* Generate (cb_size / 8) ones */
+            /* i.e. (1 << (log2_cb_size - 3)) ones */
+            u4_mask = LSB_ONES((cb_size >> 3));
+            for(i = 0; i < (cb_size / 8); i++)
+            {
+                *pu1_pic_intra_flag |= (u4_mask << (((x0) / 8) % 8));
+                pu1_pic_intra_flag += numbytes_row;
+            }
+        }
+
+        ps_codec->s_parse.s_cu.i4_pred_mode = pred_mode;
+        intra_split_flag = 0;
+        if((PRED_MODE_INTRA != pred_mode) ||
+                        is_mincb)
+        {
+            UWORD32 bin;
+            if(PRED_MODE_INTRA == pred_mode)
+            {
+                TRACE_CABAC_CTXT("part_mode", ps_cabac->u4_range, IHEVC_CAB_PART_MODE);
+                bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, IHEVC_CAB_PART_MODE);
+                part_mode = (bin) ? PART_2Nx2N : PART_NxN;
+            }
+            else
+            {
+                WORD32 amp_enabled = ps_sps->i1_amp_enabled_flag;
+
+                UWORD32 u4_max_bin_cnt = 0;
+
+
+
+                if(amp_enabled && !is_mincb)
+                {
+                    part_mode = ihevcd_parse_part_mode_amp(ps_cabac, ps_bitstrm);
+                }
+                else
+                {
+                    WORD32 ctxt_inc = IHEVC_CAB_PART_MODE;
+
+                    u4_max_bin_cnt = 2;
+                    if((is_mincb) && (cb_size > 8))
+                    {
+                        u4_max_bin_cnt++;
+                    }
+
+                    part_mode = -1;
+                    TRACE_CABAC_CTXT("part_mode", ps_cabac->u4_range, IHEVC_CAB_PART_MODE);
+                    do
+                    {
+                        bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm,
+                                                      ctxt_inc++);
+                        part_mode++;
+                    }while(--u4_max_bin_cnt && !bin);
+
+                    /* If the last bin was zero, then increment part mode by 1 */
+                    if(!bin)
+                        part_mode++;
+                }
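+                /* Resulting mapping of the truncated-unary code above
+                 * (illustrative): bins "1" -> PART_2Nx2N, "01" -> PART_2NxN,
+                 * "00" (two-bin case) -> PART_Nx2N; with a third bin allowed,
+                 * "001" -> PART_Nx2N and "000" -> PART_NxN. */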
+
+
+            }
+
+            AEV_TRACE("part_mode", part_mode, ps_cabac->u4_range);
+
+        }
+        else
+        {
+            part_mode = 0;
+            intra_split_flag = 0;
+        }
+        ps_codec->s_parse.s_cu.i4_part_mode = part_mode;
+
+        if((PRED_MODE_INTRA == ps_codec->s_parse.s_cu.i4_pred_mode) &&
+                        (PART_NxN == ps_codec->s_parse.s_cu.i4_part_mode))
+        {
+            intra_split_flag = 1;
+        }
+        ps_codec->s_parse.s_cu.i4_intra_split_flag = intra_split_flag;
+        if(pred_mode == PRED_MODE_INTRA)
+        {
+            ps_codec->s_parse.i4_cu_pcm_flag = 0;
+            ihevcd_parse_coding_unit_intra(ps_codec, x0, y0, log2_cb_size);
+            pcm_flag = ps_codec->s_parse.i4_cu_pcm_flag;
+
+        }
+        else
+        {
+            if(part_mode == PART_2Nx2N)
+            {
+                pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size, cb_size);
+                ps_pu->b2_part_idx = 0;
+            }
+            else if(part_mode == PART_2NxN)
+            {
+                pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size, cb_size / 2);
+                ps_pu->b2_part_idx = 0;
+
+                ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0 + (cb_size / 2), cb_size, cb_size / 2);
+
+                ps_pu->b2_part_idx = 1;
+            }
+            else if(part_mode == PART_Nx2N)
+            {
+                pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size / 2, cb_size);
+                ps_pu->b2_part_idx = 0;
+                ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0 + (cb_size / 2), y0, cb_size / 2, cb_size);
+
+                ps_pu->b2_part_idx = 1;
+            }
+            else if(part_mode == PART_2NxnU)
+            {
+                pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size, cb_size / 4);
+                ps_pu->b2_part_idx = 0;
+                ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0 + (cb_size / 4), cb_size, cb_size * 3 / 4);
+
+                ps_pu->b2_part_idx = 1;
+            }
+            else if(part_mode == PART_2NxnD)
+            {
+                pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size, cb_size * 3 / 4);
+                ps_pu->b2_part_idx = 0;
+                ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0 + (cb_size * 3 / 4), cb_size, cb_size / 4);
+
+                ps_pu->b2_part_idx = 1;
+            }
+            else if(part_mode == PART_nLx2N)
+            {
+                pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size / 4, cb_size);
+                ps_pu->b2_part_idx = 0;
+                ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0 + (cb_size / 4), y0, cb_size * 3 / 4, cb_size);
+
+                ps_pu->b2_part_idx = 1;
+            }
+            else if(part_mode == PART_nRx2N)
+            {
+                pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size * 3 / 4, cb_size);
+                ps_pu->b2_part_idx = 0;
+                ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0 + (cb_size * 3 / 4), y0, cb_size / 4, cb_size);
+                ps_pu->b2_part_idx = 1;
+            }
+            else
+            { /* PART_NxN */
+                pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size / 2, cb_size / 2);
+                ps_pu->b2_part_idx = 0;
+                ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0 + (cb_size / 2), y0, cb_size / 2, cb_size / 2);
+
+                ps_pu->b2_part_idx = 1;
+                ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0, y0 + (cb_size / 2), cb_size / 2, cb_size / 2);
+
+                ps_pu->b2_part_idx = 2;
+                ps_pu = ps_codec->s_parse.ps_pu;
+                ihevcd_parse_prediction_unit(ps_codec, x0 + (cb_size / 2), y0 + (cb_size / 2), cb_size / 2, cb_size / 2);
+
+                ps_pu->b2_part_idx = 3;
+            }
+        }
+
+        if(!pcm_flag)
+        {
+            WORD32 no_residual_syntax_flag = 0;
+            pu_t *ps_pu;
+            /* Since ps_pu is incremented for each PU parsed, decrement by 1 to
+             *  access last decoded PU
+             */
+            ps_pu = ps_codec->s_parse.ps_pu - 1;
+            if((PRED_MODE_INTRA != pred_mode) &&
+                            (!((part_mode == PART_2Nx2N) && ps_pu->b1_merge_flag)))
+            {
+
+                TRACE_CABAC_CTXT("rqt_root_cbf", ps_cabac->u4_range, IHEVC_CAB_NORES_IDX);
+                no_residual_syntax_flag = ihevcd_cabac_decode_bin(ps_cabac,
+                                                                  ps_bitstrm,
+                                                                  IHEVC_CAB_NORES_IDX);
+
+                AEV_TRACE("rqt_root_cbf", no_residual_syntax_flag,
+                          ps_cabac->u4_range);
+                /* TODO: HACK FOR COMPLIANCE WITH HM REFERENCE DECODER */
+                /*********************************************************/
+                /* currently the HM decoder expects qtroot cbf instead of */
+                /* no_residue_flag which has opposite meaning             */
+                /* This will be fixed once the software / spec is fixed   */
+                /*********************************************************/
+                no_residual_syntax_flag = 1 - no_residual_syntax_flag;
+            }
+
+            if(!no_residual_syntax_flag)
+            {
+
+                ps_codec->s_parse.s_cu.i4_max_trafo_depth = (pred_mode == PRED_MODE_INTRA) ?
+                                (ps_sps->i1_max_transform_hierarchy_depth_intra + intra_split_flag) :
+                                (ps_sps->i1_max_transform_hierarchy_depth_inter);
+                ihevcd_parse_transform_tree(ps_codec, x0, y0, x0, y0,
+                                            log2_cb_size, 0, 0,
+                                            ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[0]);
+            }
+            else
+            {
+                WORD32 ctb_x_base;
+                WORD32 ctb_y_base;
+
+                ctb_x_base = ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size;
+                ctb_y_base = ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size;
+
+                ps_tu = ps_codec->s_parse.ps_tu;
+                ps_tu->b1_cb_cbf = 0;
+                ps_tu->b1_cr_cbf = 0;
+                ps_tu->b1_y_cbf = 0;
+                ps_tu->b4_pos_x = ((x0 - ctb_x_base) >> 2);
+                ps_tu->b4_pos_y = ((y0 - ctb_y_base) >> 2);
+                ps_tu->b1_transquant_bypass = 0;
+                ps_tu->b3_size = (log2_cb_size - 2);
+                ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+                ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+                ps_tu->b6_luma_intra_mode   = ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[0];
+
+                /* Set the first TU in CU flag */
+                {
+                    if((ps_codec->s_parse.s_cu.i4_pos_x << 3) == (ps_tu->b4_pos_x << 2) &&
+                                    (ps_codec->s_parse.s_cu.i4_pos_y << 3) == (ps_tu->b4_pos_y << 2))
+                    {
+                        ps_tu->b1_first_tu_in_cu = 1;
+                    }
+                    else
+                    {
+                        ps_tu->b1_first_tu_in_cu = 0;
+                    }
+                }
+                ps_codec->s_parse.ps_tu++;
+                ps_codec->s_parse.s_cu.i4_tu_cnt++;
+                ps_codec->s_parse.i4_pic_tu_idx++;
+
+            }
+        }
+
+    }
+
+
+
+
+    return ret;
+}
+
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Parses Coding Quad Tree
+ *
+ * @par Description:
+ *  Parses Coding Quad Tree as per Section:7.3.9.4
+ *
+ * @param[in] ps_codec
+ *  Pointer to codec context
+ *
+ * @returns  Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+IHEVCD_ERROR_T ihevcd_parse_coding_quadtree(codec_t *ps_codec,
+                                            WORD32 x0,
+                                            WORD32 y0,
+                                            WORD32 log2_cb_size,
+                                            WORD32 ct_depth)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    sps_t *ps_sps;
+    pps_t *ps_pps;
+    WORD32 split_cu_flag;
+    WORD32 x1, y1;
+    WORD32 cu_pos_x;
+    WORD32 cu_pos_y;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+    WORD32 cb_size = 1 << log2_cb_size;
+    ps_sps = ps_codec->s_parse.ps_sps;
+    ps_pps = ps_codec->s_parse.ps_pps;
+
+    /* Compute CU position with respect to current CTB in (8x8) units */
+    cu_pos_x = (x0 - (ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size)) >> 3;
+    cu_pos_y = (y0 - (ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size)) >> 3;
+
+    ps_codec->s_parse.s_cu.i4_pos_x = cu_pos_x;
+    ps_codec->s_parse.s_cu.i4_pos_y = cu_pos_y;
+
+    ps_codec->s_parse.s_cu.i4_log2_cb_size = log2_cb_size;
+
+    ps_codec->s_parse.i4_ct_depth = ct_depth;
+    {
+        UWORD32 *pu4_ct_depth_top = ps_codec->s_parse.pu4_ct_depth_top;
+        UWORD32 u4_ct_depth_left = ps_codec->s_parse.u4_ct_depth_left;
+        UWORD32 u4_ct_depth_top = 0;
+        UWORD32 u4_mask;
+        UWORD32 u4_top_mask, u4_left_mask;
+        WORD32  ctxt_idx;
+        UWORD32 u4_min_cu_x = x0 / 8;
+        UWORD32 u4_min_cu_y = y0 / 8;
+
+        pu4_ct_depth_top += (u4_min_cu_x / 16);
+
+
+
+
+        if(((x0 + (1 << log2_cb_size)) <= ps_sps->i2_pic_width_in_luma_samples) &&
+                        ((y0 + (1 << log2_cb_size)) <= ps_sps->i2_pic_height_in_luma_samples) &&
+                        (log2_cb_size > ps_sps->i1_log2_min_coding_block_size))
+        {
+
+            ctxt_idx = IHEVC_CAB_SPLIT_CU_FLAG;
+            /* The split_cu_flag context increment is decided based on the left and top
+             * coding tree depths, which are stored at frame level
+             */
+            /* Check if the CTB is in first row in the current slice or tile */
+            if((0 != cu_pos_y) ||
+                            ((0 != ps_codec->s_parse.i4_ctb_slice_y) &&
+                                            (0 != ps_codec->s_parse.i4_ctb_tile_y)))
+            {
+                u4_ct_depth_top = *pu4_ct_depth_top;
+                u4_ct_depth_top >>= ((u4_min_cu_x % 16) * 2);
+                u4_ct_depth_top &= 3;
+
+                if((WORD32)u4_ct_depth_top > ct_depth)
+                    ctxt_idx++;
+            }
+
+            /* Check if the CTB is in first column in the current slice or tile */
+            /*****************************************************************/
+            /* If cu_pos_x is non-zero then left is available                */
+            /* If cu_pos_x is zero then ensure both the following are true   */
+            /*    Current CTB is not the first CTB in a tile row             */
+            /*    Current CTB is not the first CTB in a slice                */
+            /*****************************************************************/
+            if((0 != cu_pos_x) ||
+                            (((0 != ps_codec->s_parse.i4_ctb_slice_x) || (0 != ps_codec->s_parse.i4_ctb_slice_y)) &&
+                                            (0 != ps_codec->s_parse.i4_ctb_tile_x)))
+            {
+                u4_ct_depth_left >>= ((u4_min_cu_y % 16) * 2);
+                u4_ct_depth_left &= 3;
+                if((WORD32)u4_ct_depth_left > ct_depth)
+                    ctxt_idx++;
+            }
+            TRACE_CABAC_CTXT("split_cu_flag", ps_cabac->u4_range, ctxt_idx);
+            split_cu_flag = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+            AEV_TRACE("split_cu_flag", split_cu_flag, ps_cabac->u4_range);
+        }
+        else
+        {
+            if(log2_cb_size > ps_sps->i1_log2_min_coding_block_size)
+                split_cu_flag = 1;
+            else
+                split_cu_flag = 0;
+        }
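+        /* Summary of the above (illustrative): split_cu_flag is decoded only
+         * when the CU lies fully inside the picture and is larger than the
+         * minimum CB; otherwise a split is inferred for oversized boundary CUs
+         * and no split for minimum-size CUs. The context index adds 1 for each
+         * available neighbour (left/top) whose coding tree depth exceeds the
+         * current depth. */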
+
+        if(0 == split_cu_flag)
+        {
+            /* Update top ct_depth */
+            u4_ct_depth_top = *pu4_ct_depth_top;
+            /* Since the max cb_size is 64, at most 16 bits (2 bits per min CU) will be set or reset */
+            /* Also, since the coding block lies within a 64x64 grid, only 16 bits within a WORD32
+             * need to be updated, and these 16 bits will not cross a 16-bit boundary
+             */
+            u4_mask = DUP_LSB_11(cb_size / 8);
+
+            u4_top_mask = u4_mask << ((u4_min_cu_x % 16) * 2);
+            u4_ct_depth_top &= ~u4_top_mask;
+
+            if(ct_depth)
+            {
+                u4_top_mask = gau4_ct_depth_mask[ct_depth] & u4_mask;
+
+                u4_top_mask = u4_top_mask << ((u4_min_cu_x % 16) * 2);
+                u4_ct_depth_top |= u4_top_mask;
+            }
+
+            *pu4_ct_depth_top = u4_ct_depth_top;
+
+            /* Update left ct_depth */
+            u4_ct_depth_left = ps_codec->s_parse.u4_ct_depth_left;
+
+            u4_left_mask = u4_mask << ((u4_min_cu_y % 16) * 2);
+
+            u4_ct_depth_left &= ~u4_left_mask;
+            if(ct_depth)
+            {
+                u4_left_mask = gau4_ct_depth_mask[ct_depth] & u4_mask;
+
+                u4_left_mask = u4_left_mask << ((u4_min_cu_y % 16) * 2);
+                u4_ct_depth_left |= u4_left_mask;
+            }
+
+            ps_codec->s_parse.u4_ct_depth_left = u4_ct_depth_left;
+        }
+    }
+    if((ps_pps->i1_cu_qp_delta_enabled_flag) &&
+                    (log2_cb_size >= ps_pps->i1_log2_min_cu_qp_delta_size))
+    {
+        ps_codec->s_parse.i4_is_cu_qp_delta_coded = 0;
+        ps_codec->s_parse.i4_cu_qp_delta = 0;
+    }
+    if(split_cu_flag)
+    {
+        x1 = x0 + ((1 << log2_cb_size) >> 1);
+        y1 = y0 + ((1 << log2_cb_size) >> 1);
+
+        ihevcd_parse_coding_quadtree(ps_codec, x0, y0, log2_cb_size - 1, ct_depth + 1);
+
+        /* At frame boundaries coding quadtree nodes are sent only if they fall within the frame */
+        if(x1 < ps_sps->i2_pic_width_in_luma_samples)
+            ihevcd_parse_coding_quadtree(ps_codec, x1, y0, log2_cb_size - 1, ct_depth + 1);
+
+        if(y1 < ps_sps->i2_pic_height_in_luma_samples)
+            ihevcd_parse_coding_quadtree(ps_codec, x0, y1, log2_cb_size - 1, ct_depth + 1);
+
+        if((x1 < ps_sps->i2_pic_width_in_luma_samples) &&
+                        (y1 < ps_sps->i2_pic_height_in_luma_samples))
+            ihevcd_parse_coding_quadtree(ps_codec, x1, y1, log2_cb_size - 1, ct_depth + 1);
+    }
+    else
+    {
+        /* Set current group QP if current CU is aligned with the group */
+        {
+            WORD32 cu_pos_x = ps_codec->s_parse.s_cu.i4_pos_x << 3;
+            WORD32 cu_pos_y = ps_codec->s_parse.s_cu.i4_pos_y << 3;
+
+            WORD32 qpg_x = (cu_pos_x - (cu_pos_x & ((1 << ps_pps->i1_log2_min_cu_qp_delta_size) - 1)));
+            WORD32 qpg_y = (cu_pos_y - (cu_pos_y & ((1 << ps_pps->i1_log2_min_cu_qp_delta_size) - 1)));
+
+            if((cu_pos_x == qpg_x) &&
+                            (cu_pos_y == qpg_y))
+            {
+                ps_codec->s_parse.u4_qpg = ps_codec->s_parse.u4_qp;
+
+                ps_codec->s_parse.s_cu.i4_cu_qp_delta = 0;
+
+            }
+        }
+
+        ihevcd_parse_coding_unit(ps_codec, x0, y0, log2_cb_size);
+
+        if(ps_pps->i1_cu_qp_delta_enabled_flag)
+        {
+            WORD32 qp_pred, qp_left, qp_top;
+            WORD32 cu_pos_x;
+            WORD32 cu_pos_y;
+            WORD32 qpg_x;
+            WORD32 qpg_y;
+            WORD32 i, j;
+            WORD32 qp;
+            WORD32 cur_cu_offset;
+            tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+            WORD32 cb_size = 1 << ps_codec->s_parse.s_cu.i4_log2_cb_size;
+
+            cu_pos_x = ps_codec->s_parse.s_cu.i4_pos_x << 3;
+            cu_pos_y = ps_codec->s_parse.s_cu.i4_pos_y << 3;
+
+            qpg_x = (cu_pos_x - (cu_pos_x & ((1 << ps_pps->i1_log2_min_cu_qp_delta_size) - 1))) >> 3;
+            qpg_y = (cu_pos_y - (cu_pos_y & ((1 << ps_pps->i1_log2_min_cu_qp_delta_size) - 1))) >> 3;
+
+            /* Previously coded QP */
+            qp_left = ps_codec->s_parse.u4_qpg;
+            qp_top = ps_codec->s_parse.u4_qpg;
+
+            if(qpg_x > 0)
+            {
+                qp_left = ps_codec->s_parse.ai1_8x8_cu_qp[qpg_x - 1 + (qpg_y * 8)];
+            }
+            if(qpg_y > 0)
+            {
+                qp_top = ps_codec->s_parse.ai1_8x8_cu_qp[qpg_x + ((qpg_y - 1) * 8)];
+            }
+
+            qp_pred = (qp_left + qp_top + 1) >> 1;
+            /* Since qp_pred + ps_codec->s_parse.s_cu.i4_cu_qp_delta can be negative,
+            52 is added before taking modulo 52 */
+            qp = (qp_pred + ps_codec->s_parse.s_cu.i4_cu_qp_delta + 52) % 52;
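+            /* Worked example: qp_left = 30 and qp_top = 34 give qp_pred = 32;
+             * with cu_qp_delta = -4, qp = (32 - 4 + 52) % 52 = 28. The +52 keeps
+             * the argument non-negative, e.g. qp_pred = 2, delta = -5 gives
+             * (2 - 5 + 52) % 52 = 49. */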
+
+            cur_cu_offset = (cu_pos_x >> 3) + cu_pos_y;
+            for(i = 0; i < (cb_size >> 3); i++)
+            {
+                for(j = 0; j < (cb_size >> 3); j++)
+                {
+                    ps_codec->s_parse.ai1_8x8_cu_qp[cur_cu_offset + (i * 8) + j] = qp;
+                }
+            }
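+            /* Note: cu_pos_y is already in luma-sample units (min-CU index * 8),
+             * so adding it directly to (cu_pos_x >> 3) indexes the 8-entry-per-row
+             * ai1_8x8_cu_qp grid as x + 8 * y. */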
+
+            ps_codec->s_parse.u4_qp = qp;
+            ps_codec->s_parse.s_cu.i4_qp = qp;
+
+
+            /* When change in QP is signaled, update the QP in TUs that are already parsed in the CU */
+            {
+                tu_t *ps_tu_tmp;
+                ps_tu_tmp = ps_tu - ps_codec->s_parse.s_cu.i4_tu_cnt;
+                ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+                while(ps_tu_tmp != ps_tu)
+                {
+                    ps_tu_tmp->b7_qp = ps_codec->s_parse.u4_qp;
+
+                    ps_tu_tmp++;
+                }
+            }
+            if(ps_codec->s_parse.s_cu.i4_cu_qp_delta)
+            {
+                WORD32 ctb_indx;
+                ctb_indx = ps_codec->s_parse.i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_codec->s_parse.i4_ctb_y;
+                ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb[ctb_indx >> 3] &= (~(1 << (ctb_indx & 7)));
+            }
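+            /* One bit per CTB: clearing it records that QP is not constant
+             * within this CTB, which the boundary-strength/deblocking stage
+             * presumably uses to pick per-block rather than per-CTB QP. */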
+
+        }
+
+    }
+
+
+
+
+    return ret;
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Parses SAO (Sample adaptive offset syntax)
+ *
+ * @par Description:
+ *  Parses SAO (Sample adaptive offset syntax) as per  Section:7.3.9.3
+ *
+ * @param[in] ps_codec
+ *  Pointer to codec context
+ *
+ * @returns  Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+IHEVCD_ERROR_T  ihevcd_parse_sao(codec_t *ps_codec)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    sps_t *ps_sps;
+    sao_t *ps_sao;
+    WORD32 rx;
+    WORD32 ry;
+    WORD32 value;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    WORD32 sao_merge_left_flag;
+    WORD32 sao_merge_up_flag;
+    slice_header_t *ps_slice_hdr;
+    cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+    WORD32 ctxt_idx;
+
+    ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base;
+    ps_slice_hdr += (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+
+    ps_sps = (ps_codec->s_parse.ps_sps);
+    rx = ps_codec->s_parse.i4_ctb_x;
+    ry = ps_codec->s_parse.i4_ctb_y;
+
+    ps_sao = ps_codec->s_parse.ps_pic_sao + rx + ry * ps_sps->i2_pic_wd_in_ctb;
+
+    /* Default values */
+    ps_sao->b3_y_type_idx = 0;
+    ps_sao->b3_cb_type_idx = 0;
+    ps_sao->b3_cr_type_idx = 0;
+
+    UNUSED(value);
+    ctxt_idx = IHEVC_CAB_SAO_MERGE;
+    sao_merge_left_flag = 0;
+    sao_merge_up_flag = 0;
+    if(rx > 0)
+    {
+        /* TODO: Implemented only for slices; the condition for tiles is not tested */
+        if(((0 != ps_codec->s_parse.i4_ctb_slice_x) || (0 != ps_codec->s_parse.i4_ctb_slice_y)) &&
+                        (0 != ps_codec->s_parse.i4_ctb_tile_x))
+        {
+
+            TRACE_CABAC_CTXT("sao_merge_flag", ps_cabac->u4_range, ctxt_idx);
+            sao_merge_left_flag = ihevcd_cabac_decode_bin(ps_cabac,
+                                                          ps_bitstrm,
+                                                          ctxt_idx);
+            AEV_TRACE("sao_merge_flag", sao_merge_left_flag, ps_cabac->u4_range);
+        }
+
+    }
+    if(ry > 0 && !sao_merge_left_flag)
+    {
+        if((ps_codec->s_parse.i4_ctb_slice_y > 0) && (ps_codec->s_parse.i4_ctb_tile_y > 0))
+        {
+            TRACE_CABAC_CTXT("sao_merge_flag", ps_cabac->u4_range, ctxt_idx);
+            sao_merge_up_flag = ihevcd_cabac_decode_bin(ps_cabac,
+                                                        ps_bitstrm,
+                                                        ctxt_idx);
+            AEV_TRACE("sao_merge_flag", sao_merge_up_flag, ps_cabac->u4_range);
+        }
+    }
+    ctxt_idx = IHEVC_CAB_SAO_TYPE;
+
+    if(sao_merge_left_flag)
+    {
+        *ps_sao = *(ps_sao - 1);
+    }
+    else if(sao_merge_up_flag)
+    {
+        *ps_sao = *(ps_sao - ps_sps->i2_pic_wd_in_ctb);
+    }
+    else // if(!sao_merge_up_flag && !sao_merge_left_flag)
+    {
+        WORD32 c_idx;
+        WORD32 sao_type_idx = 0;
+        for(c_idx = 0; c_idx < 3; c_idx++)
+        {
+            if((ps_slice_hdr->i1_slice_sao_luma_flag && c_idx == 0) || (ps_slice_hdr->i1_slice_sao_chroma_flag && c_idx > 0))
+            {
+
+
+                /* sao_type_idx will be the same for c_idx == 1 and c_idx == 2, hence it is not re-initialized to zero for c_idx == 2 */
+
+                if(c_idx == 0)
+                {
+                    sao_type_idx = 0;
+                    TRACE_CABAC_CTXT("sao_type_idx", ps_cabac->u4_range, ctxt_idx);
+                    sao_type_idx = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+
+                    if(sao_type_idx)
+                    {
+                        sao_type_idx += ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+                    }
+                    AEV_TRACE("sao_type_idx", sao_type_idx,  ps_cabac->u4_range);
+
+                    ps_sao->b3_y_type_idx = sao_type_idx;
+                }
+                if(c_idx == 1)
+                {
+                    sao_type_idx = 0;
+                    TRACE_CABAC_CTXT("sao_type_idx", ps_cabac->u4_range, ctxt_idx);
+                    sao_type_idx = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+                    if(sao_type_idx)
+                    {
+                        sao_type_idx += ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+                    }
+
+                    AEV_TRACE("sao_type_idx", sao_type_idx,  ps_cabac->u4_range);
+
+                    ps_sao->b3_cb_type_idx = sao_type_idx;
+                    ps_sao->b3_cr_type_idx = sao_type_idx;
+                }
+
+                if(sao_type_idx != 0)
+                {
+                    WORD32 i;
+                    WORD32 sao_offset[4];
+                    WORD32 sao_band_position = 0;
+                    WORD32 c_max =  (1 << (MIN(BIT_DEPTH, 10) - 5)) - 1;
+                    for(i = 0; i < 4; i++)
+                    {
+                        sao_offset[i] = ihevcd_cabac_decode_bypass_bins_tunary(ps_cabac, ps_bitstrm, c_max);
+                        AEV_TRACE("sao_offset_abs", sao_offset[i], ps_cabac->u4_range);
+
+                        if((2 == sao_type_idx) && (i > 1))
+                        {
+                            sao_offset[i] = -sao_offset[i];
+                        }
+                    }
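+                    /* Example: for BIT_DEPTH = 8, c_max = (1 << 3) - 1 = 7, so
+                     * each sao_offset_abs is a truncated-unary bypass code in
+                     * 0..7. For edge offsets (sao_type_idx == 2) the last two
+                     * offsets are negated by convention and no sign bits are sent. */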
+
+                    if(sao_type_idx == 1)
+                    {
+                        for(i = 0; i < 4; i++)
+                        {
+                            if(sao_offset[i] != 0)
+                            {
+                                value = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+                                AEV_TRACE("sao_offset_sign", value, ps_cabac->u4_range);
+
+                                if(value)
+                                {
+                                    sao_offset[i] = -sao_offset[i];
+                                }
+                            }
+                        }
+                        value = ihevcd_cabac_decode_bypass_bins(ps_cabac, ps_bitstrm, 5);
+                        AEV_TRACE("sao_band_position", value, ps_cabac->u4_range);
+
+                        sao_band_position = value;
+                    }
+                    else
+                    {
+                        if(c_idx == 0)
+                        {
+                            value = ihevcd_cabac_decode_bypass_bins(ps_cabac, ps_bitstrm, 2);
+                            AEV_TRACE("sao_eo_class", value, ps_cabac->u4_range);
+
+                            ps_sao->b3_y_type_idx += value;
+                        }
+
+                        if(c_idx == 1)
+                        {
+                            value = ihevcd_cabac_decode_bypass_bins(ps_cabac, ps_bitstrm, 2);
+                            AEV_TRACE("sao_eo_class", value, ps_cabac->u4_range);
+
+                            ps_sao->b3_cb_type_idx += value;
+                            ps_sao->b3_cr_type_idx += value;
+                        }
+                    }
+
+                    if(0 == c_idx)
+                    {
+                        ps_sao->b4_y_offset_1 = sao_offset[0];
+                        ps_sao->b4_y_offset_2 = sao_offset[1];
+                        ps_sao->b4_y_offset_3 = sao_offset[2];
+                        ps_sao->b4_y_offset_4 = sao_offset[3];
+
+                        ps_sao->b5_y_band_pos = sao_band_position;
+                    }
+                    else if(1 == c_idx)
+                    {
+                        ps_sao->b4_cb_offset_1 = sao_offset[0];
+                        ps_sao->b4_cb_offset_2 = sao_offset[1];
+                        ps_sao->b4_cb_offset_3 = sao_offset[2];
+                        ps_sao->b4_cb_offset_4 = sao_offset[3];
+
+                        ps_sao->b5_cb_band_pos = sao_band_position;
+                    }
+                    else // 2 == c_idx
+                    {
+                        ps_sao->b4_cr_offset_1 = sao_offset[0];
+                        ps_sao->b4_cr_offset_2 = sao_offset[1];
+                        ps_sao->b4_cr_offset_3 = sao_offset[2];
+                        ps_sao->b4_cr_offset_4 = sao_offset[3];
+
+                        ps_sao->b5_cr_band_pos = sao_band_position;
+                    }
+                }
+            }
+        }
+    }
+
+    return ret;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Parses Slice data syntax
+ *
+ * @par Description:
+ *  Parses Slice data syntax as per Section:7.3.9.1
+ *
+ * @param[in] ps_codec
+ *  Pointer to codec context
+ *
+ * @returns  Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+IHEVCD_ERROR_T ihevcd_parse_slice_data(codec_t *ps_codec)
+{
+
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 end_of_slice_flag;
+    sps_t *ps_sps;
+    pps_t *ps_pps;
+    slice_header_t *ps_slice_hdr;
+    WORD32 end_of_pic;
+    tile_t *ps_tile, *ps_tile_prev;
+    WORD32 i;
+    WORD32 ctb_addr;
+    WORD32 tile_idx;
+    WORD32 cabac_init_idc;
+    WORD32 ctb_size;
+    WORD32 num_ctb_in_row;
+    WORD32 num_min4x4_in_ctb;
+    WORD32 slice_qp;
+    WORD32 slice_start_ctb_idx;
+    WORD32 tile_start_ctb_idx;
+
+#ifdef GPU_BUILD
+    WORD32 total_ctb_cnt = 0;
+    proc_job_t s_job;
+    gpu_ctxt_t *ps_gpu = &ps_codec->s_gpu_ctxt;
+#endif
+
+    ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base;
+    ps_pps = ps_codec->s_parse.ps_pps_base;
+    ps_sps = ps_codec->s_parse.ps_sps_base;
+
+    /* Get current slice header, pps and sps */
+    ps_slice_hdr += (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+    ps_pps  += ps_slice_hdr->i1_pps_id;
+    ps_sps  += ps_pps->i1_sps_id;
+
+    if(0 != ps_codec->s_parse.i4_cur_slice_idx)
+    {
+        if(!ps_slice_hdr->i1_dependent_slice_flag)
+        {
+            ps_codec->s_parse.i4_cur_independent_slice_idx++;
+            if(MAX_SLICE_HDR_CNT == ps_codec->s_parse.i4_cur_independent_slice_idx)
+                ps_codec->s_parse.i4_cur_independent_slice_idx = 0;
+        }
+    }
+
+
+    ctb_size = 1 << ps_sps->i1_log2_ctb_size;
+    num_min4x4_in_ctb = (ctb_size / 4) * (ctb_size / 4);
+    num_ctb_in_row = ps_sps->i2_pic_wd_in_ctb;
+
+    /* Update the parse context */
+    if(0 == ps_codec->i4_slice_error)
+    {
+        ps_codec->s_parse.i4_ctb_x = ps_slice_hdr->i2_ctb_x;
+        ps_codec->s_parse.i4_ctb_y = ps_slice_hdr->i2_ctb_y;
+    }
+    ps_codec->s_parse.ps_pps = ps_pps;
+    ps_codec->s_parse.ps_sps = ps_sps;
+    ps_codec->s_parse.ps_slice_hdr = ps_slice_hdr;
+
+    /* Derive Tile positions for the current CTB */
+    /* Change this to lookup if required */
+    ihevcd_get_tile_pos(ps_pps, ps_sps, ps_codec->s_parse.i4_ctb_x,
+                        ps_codec->s_parse.i4_ctb_y,
+                        &ps_codec->s_parse.i4_ctb_tile_x,
+                        &ps_codec->s_parse.i4_ctb_tile_y,
+                        &tile_idx);
+    ps_codec->s_parse.ps_tile = ps_pps->ps_tile + tile_idx;
+    ps_codec->s_parse.i4_cur_tile_idx = tile_idx;
+    ps_tile = ps_codec->s_parse.ps_tile;
+    if(tile_idx)
+        ps_tile_prev = ps_tile - 1;
+    else
+        ps_tile_prev = ps_tile;
+
+    /* If the present slice is dependent, retain the previous
+     * independent slice's CTB x and y values for the decoding process */
+    if(0 == ps_codec->i4_slice_error)
+    {
+        if(1 == ps_slice_hdr->i1_dependent_slice_flag)
+        {
+            /* If the slice starts at the beginning of a new tile */
+            if((0 == ps_codec->s_parse.i4_ctb_tile_x) && (0 == ps_codec->s_parse.i4_ctb_tile_y))
+            {
+                ps_codec->s_parse.i4_ctb_slice_x = 0;
+                ps_codec->s_parse.i4_ctb_slice_y = 0;
+            }
+        }
+
+        if(!ps_slice_hdr->i1_dependent_slice_flag)
+        {
+            ps_codec->s_parse.i4_ctb_slice_x = 0;
+            ps_codec->s_parse.i4_ctb_slice_y = 0;
+        }
+    }
+
+    /* Frame level initializations */
+    if((0 == ps_codec->s_parse.i4_ctb_y) &&
+                    (0 == ps_codec->s_parse.i4_ctb_x))
+    {
+        ret = ihevcd_parse_pic_init(ps_codec);
+        RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+        ps_codec->s_parse.pu4_pic_tu_idx[0] = 0;
+        ps_codec->s_parse.pu4_pic_pu_idx[0] = 0;
+        ps_codec->s_parse.i4_cur_independent_slice_idx = 0;
+        ps_codec->s_parse.i4_ctb_tile_x = 0;
+        ps_codec->s_parse.i4_ctb_tile_y = 0;
+    }
+
+    {
+        /* Updating the poc list of current slice to ps_mv_buf */
+        mv_buf_t *ps_mv_buf = ps_codec->s_parse.ps_cur_mv_buf;
+
+        if(ps_slice_hdr->i1_num_ref_idx_l1_active != 0)
+        {
+            for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+            {
+                ps_mv_buf->l1_collocated_poc[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list1[i].pv_pic_buf)->i4_abs_poc;
+                ps_mv_buf->u1_l1_collocated_poc_lt[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list1[i].pv_pic_buf)->u1_used_as_ref;
+            }
+        }
+
+        if(ps_slice_hdr->i1_num_ref_idx_l0_active != 0)
+        {
+            for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+            {
+                ps_mv_buf->l0_collocated_poc[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list0[i].pv_pic_buf)->i4_abs_poc;
+                ps_mv_buf->u1_l0_collocated_poc_lt[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list0[i].pv_pic_buf)->u1_used_as_ref;
+            }
+        }
+    }
+
+    /*Initialize the low delay flag at the beginning of every slice*/
+    if((0 == ps_codec->s_parse.i4_ctb_slice_x) || (0 == ps_codec->s_parse.i4_ctb_slice_y))
+    {
+        /* Low delay flag */
+        WORD32 cur_poc, ref_list_poc, flag = 1;
+        cur_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+        for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+        {
+            ref_list_poc = ((mv_buf_t *)ps_slice_hdr->as_ref_pic_list0[i].pv_mv_buf)->i4_abs_poc;
+            if(ref_list_poc > cur_poc)
+            {
+                flag = 0;
+                break;
+            }
+        }
+        if(flag && (ps_slice_hdr->i1_slice_type == BSLICE))
+        {
+            for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+            {
+                ref_list_poc = ((mv_buf_t *)ps_slice_hdr->as_ref_pic_list1[i].pv_mv_buf)->i4_abs_poc;
+                if(ref_list_poc > cur_poc)
+                {
+                    flag = 0;
+                    break;
+                }
+            }
+        }
+        ps_slice_hdr->i1_low_delay_flag = flag;
+    }
+
+    /* initialize the cabac init idc based on slice type */
+    if(ps_slice_hdr->i1_slice_type == ISLICE)
+    {
+        cabac_init_idc = 0;
+    }
+    else if(ps_slice_hdr->i1_slice_type == PSLICE)
+    {
+        cabac_init_idc = ps_slice_hdr->i1_cabac_init_flag ? 2 : 1;
+    }
+    else
+    {
+        cabac_init_idc = ps_slice_hdr->i1_cabac_init_flag ? 1 : 2;
+    }
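+    /* This matches the spec's initType derivation (for illustration): I slices
+     * use table 0; P slices use 1, or 2 when cabac_init_flag is set; B slices
+     * use 2, or 1 when the flag is set. */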
+
+    slice_qp = ps_slice_hdr->i1_slice_qp_delta + ps_pps->i1_pic_init_qp;
+    slice_qp = CLIP3(slice_qp, 0, 51);
+
+    /* Update the QP value for every independent slice and for every dependent slice that begins at the start of a new tile */
+    if((0 == ps_slice_hdr->i1_dependent_slice_flag) ||
+                    ((1 == ps_slice_hdr->i1_dependent_slice_flag) && ((0 == ps_codec->s_parse.i4_ctb_tile_x) && (0 == ps_codec->s_parse.i4_ctb_tile_y))))
+    {
+        ps_codec->s_parse.u4_qp = slice_qp;
+    }
+
+    /* Cabac init at the beginning of a slice */
+    //If the slice is a dependent slice that does not start at the beginning of a tile
+    if((1 == ps_slice_hdr->i1_dependent_slice_flag) && (!((ps_codec->s_parse.i4_ctb_tile_x == 0) && (ps_codec->s_parse.i4_ctb_tile_y == 0))))
+    {
+        if((0 == ps_pps->i1_entropy_coding_sync_enabled_flag) || (ps_pps->i1_entropy_coding_sync_enabled_flag && (0 != ps_codec->s_parse.i4_ctb_x)))
+        {
+            ihevcd_cabac_reset(&ps_codec->s_parse.s_cabac,
+                               &ps_codec->s_parse.s_bitstrm);
+        }
+    }
+    else if((0 == ps_pps->i1_entropy_coding_sync_enabled_flag) || (ps_pps->i1_entropy_coding_sync_enabled_flag && (0 != ps_codec->s_parse.i4_ctb_x)))
+    {
+        ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
+                          &ps_codec->s_parse.s_bitstrm,
+                          slice_qp,
+                          cabac_init_idc,
+                          &gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp][0]);
+    }
+
+
+    do
+    {
+
+        {
+            WORD32 cur_ctb_idx = ps_codec->s_parse.i4_ctb_x
+                            + ps_codec->s_parse.i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+            if(1 == ps_codec->i4_num_cores && 0 == cur_ctb_idx % RESET_TU_BUF_NCTB)
+            {
+                ps_codec->s_parse.ps_tu = ps_codec->s_parse.ps_pic_tu;
+                ps_codec->s_parse.i4_pic_tu_idx = 0;
+            }
+        }
+
+        end_of_pic = 0;
+        /* Section:7.3.7 Coding tree unit syntax */
+        /* coding_tree_unit() inlined here */
+        /* If number of cores is greater than 1, then add job to the queue */
+        //TODO: Dual core implementation might need a different algo for better load balancing
+        /* At the start of ctb row parsing in a tile, queue a job for processing the current tile row */
+        ps_codec->s_parse.i4_ctb_num_pcm_blks = 0;
+
+
+        /* At the beginning of each tile that is not also the beginning of a slice, the cabac
+         * context must be initialized. Hence, check for the tile start here */
+        if(((0 == ps_codec->s_parse.i4_ctb_tile_x) && (0 == ps_codec->s_parse.i4_ctb_tile_y))
+                        && (!((ps_tile->u1_pos_x == 0) && (ps_tile->u1_pos_y == 0)))
+                        && (!((0 == ps_codec->s_parse.i4_ctb_slice_x) && (0 == ps_codec->s_parse.i4_ctb_slice_y))))
+        {
+            slice_qp = ps_slice_hdr->i1_slice_qp_delta + ps_pps->i1_pic_init_qp;
+            slice_qp = CLIP3(slice_qp, 0, 51);
+            ps_codec->s_parse.u4_qp = slice_qp;
+
+            ihevcd_get_tile_pos(ps_pps, ps_sps, ps_codec->s_parse.i4_ctb_x,
+                                ps_codec->s_parse.i4_ctb_y,
+                                &ps_codec->s_parse.i4_ctb_tile_x,
+                                &ps_codec->s_parse.i4_ctb_tile_y,
+                                &tile_idx);
+
+            ps_codec->s_parse.ps_tile = ps_pps->ps_tile + tile_idx;
+            ps_codec->s_parse.i4_cur_tile_idx = tile_idx;
+            ps_tile_prev = ps_tile - 1;
+
+            tile_start_ctb_idx = ps_tile->u1_pos_x
+                            + ps_tile->u1_pos_y * (ps_sps->i2_pic_wd_in_ctb);
+
+            slice_start_ctb_idx =  ps_slice_hdr->i2_ctb_x
+                            + ps_slice_hdr->i2_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+            /* For slices that span multiple tiles */
+            if(slice_start_ctb_idx < tile_start_ctb_idx)
+            {
+                /* Two cases:
+                 * 1 - The slice spans the frame width but does not start from the first column
+                 * 2 - The slice spans multiple tiles anywhere in the frame
+                 */
+                ps_codec->s_parse.i4_ctb_slice_y = ps_tile->u1_pos_y - ps_slice_hdr->i2_ctb_y;
+                if(!(((ps_slice_hdr->i2_ctb_x + ps_tile_prev->u2_wd) % ps_sps->i2_pic_wd_in_ctb) == ps_tile->u1_pos_x)) //Case 2
+                {
+                    if(ps_slice_hdr->i2_ctb_y <= ps_tile->u1_pos_y)
+                    {
+                        //Check if ctb x is before or after
+                        if(ps_slice_hdr->i2_ctb_x > ps_tile->u1_pos_x)
+                        {
+                            ps_codec->s_parse.i4_ctb_slice_y -= 1;
+                        }
+                    }
+                }
+            }
+
+            if(!ps_slice_hdr->i1_dependent_slice_flag)
+            {
+                ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
+                                  &ps_codec->s_parse.s_bitstrm,
+                                  slice_qp,
+                                  cabac_init_idc,
+                                  &gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp][0]);
+
+            }
+        }
+        /* If number of cores is greater than 1, then add job to the queue */
+        //TODO: Dual core implementation might need a different algo for better load balancing
+        /* At the start of ctb row parsing in a tile, queue a job for processing the current tile row */
+
+        if(0 == ps_codec->s_parse.i4_ctb_tile_x)
+        {
+
+#ifndef GPU_BUILD
+            if(1 < ps_codec->i4_num_cores)
+            {
+                proc_job_t s_job;
+                IHEVCD_ERROR_T ret;
+                s_job.i4_cmd    = CMD_PROCESS;
+                s_job.i2_ctb_cnt = (WORD16)ps_tile->u2_wd;
+                s_job.i2_ctb_x = (WORD16)ps_codec->s_parse.i4_ctb_x;
+                s_job.i2_ctb_y = (WORD16)ps_codec->s_parse.i4_ctb_y;
+                s_job.i2_slice_idx = (WORD16)ps_codec->s_parse.i4_cur_slice_idx;
+                s_job.i4_tu_coeff_data_ofst = (UWORD8 *)ps_codec->s_parse.pv_tu_coeff_data -
+                                (UWORD8 *)ps_codec->s_parse.pv_pic_tu_coeff_data;
+                ret = ihevcd_jobq_queue((jobq_t *)ps_codec->s_parse.pv_proc_jobq, &s_job, sizeof(proc_job_t), 1);
+
+                if(ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+                    return ret;
+            }
+            else
+#endif
+            {
+#ifdef GPU_BUILD
+                process_ctxt_t *ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+#else
+                process_ctxt_t *ps_proc = &ps_codec->as_process[0];
+#endif
+                WORD32 tu_coeff_data_ofst = (UWORD8 *)ps_codec->s_parse.pv_tu_coeff_data -
+                                (UWORD8 *)ps_codec->s_parse.pv_pic_tu_coeff_data;
+
+                /* If the codec is running in single core mode,
+                 * initialize zeroth process context
+                 * TODO: Dual core mode might need a different implementation instead of jobq
+                 */
+
+                ps_proc->i4_ctb_cnt = ps_tile->u2_wd;
+                ps_proc->i4_ctb_x   = ps_codec->s_parse.i4_ctb_x;
+                ps_proc->i4_ctb_y   = ps_codec->s_parse.i4_ctb_y;
+                ps_proc->i4_cur_slice_idx = ps_codec->s_parse.i4_cur_slice_idx;
+
+#ifdef GPU_BUILD
+                ps_proc->ps_slice_hdr = ps_slice_hdr;
+                ps_gpu->ai4_tu_coeff_data_ofst[ps_codec->s_parse.i4_ctb_tile_y] = tu_coeff_data_ofst;
+
+                ps_gpu->ai4_cur_slice_idx[ps_codec->s_parse.i4_ctb_tile_y] = ps_codec->s_parse.i4_cur_slice_idx;
+#endif
+                ihevcd_init_proc_ctxt(ps_proc, tu_coeff_data_ofst);
+            }
+        }
+
+
+        /* Restore cabac context model from top right CTB if entropy sync is enabled */
+        if(ps_pps->i1_entropy_coding_sync_enabled_flag)
+        {
+            /*TODO Handle single CTB and top-right belonging to a different slice */
+            if(0 == ps_codec->s_parse.i4_ctb_x)
+            {
+                //WORD32 size = sizeof(ps_codec->s_parse.s_cabac.au1_ctxt_models);
+                WORD32 default_ctxt = 0;
+
+                if((0 == ps_codec->s_parse.i4_ctb_slice_y) && (!ps_slice_hdr->i1_dependent_slice_flag))
+                    default_ctxt = 1;
+                if(1 == ps_sps->i2_pic_wd_in_ctb)
+                    default_ctxt = 1;
+
+                ps_codec->s_parse.u4_qp = slice_qp;
+                if(default_ctxt)
+                {
+                    //memcpy(&ps_codec->s_parse.s_cabac.au1_ctxt_models, &gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp][0], size);
+                    ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
+                                      &ps_codec->s_parse.s_bitstrm,
+                                      slice_qp,
+                                      cabac_init_idc,
+                                      &gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp][0]);
+
+                }
+                else
+                {
+                    //memcpy(&ps_codec->s_parse.s_cabac.au1_ctxt_models, &ps_codec->s_parse.s_cabac.au1_ctxt_models_sync, size);
+                    ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
+                                      &ps_codec->s_parse.s_bitstrm,
+                                      slice_qp,
+                                      cabac_init_idc,
+                                      (const UWORD8 *)&ps_codec->s_parse.s_cabac.au1_ctxt_models_sync);
+
+                }
+            }
+        }
+
+
+
+        if(0 == ps_codec->i4_slice_error)
+        {
+            if(ps_slice_hdr->i1_slice_sao_luma_flag || ps_slice_hdr->i1_slice_sao_chroma_flag)
+                ihevcd_parse_sao(ps_codec);
+        }
+        else
+        {
+            sao_t *ps_sao = ps_codec->s_parse.ps_pic_sao +
+                            ps_codec->s_parse.i4_ctb_x +
+                            ps_codec->s_parse.i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+
+            /* Default values */
+            ps_sao->b3_y_type_idx = 0;
+            ps_sao->b3_cb_type_idx = 0;
+            ps_sao->b3_cr_type_idx = 0;
+        }
+
+        //AEV_TRACE("CTB x", ps_codec->s_parse.i4_ctb_x, 0);
+        //AEV_TRACE("CTB y", ps_codec->s_parse.i4_ctb_y, 0);
+
+        {
+            WORD32 ctb_indx;
+            ctb_indx = ps_codec->s_parse.i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_codec->s_parse.i4_ctb_y;
+            ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb[ctb_indx >> 3] |= (1 << (ctb_indx & 7));
+            {
+                UWORD16 *pu1_slice_idx = ps_codec->s_parse.pu1_slice_idx;
+                pu1_slice_idx[ctb_indx] = ps_codec->s_parse.i4_cur_independent_slice_idx;
+            }
+        }
+
+        if(0 == ps_codec->i4_slice_error)
+        {
+            ihevcd_parse_coding_quadtree(ps_codec,
+                                         (ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size),
+                                         (ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size),
+                                         ps_sps->i1_log2_ctb_size,
+                                         0);
+        }
+        else
+        {
+            tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+            pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+
+            ps_tu->b1_cb_cbf = 0;
+            ps_tu->b1_cr_cbf = 0;
+            ps_tu->b1_y_cbf = 0;
+            ps_tu->b4_pos_x = 0;
+            ps_tu->b4_pos_y = 0;
+            ps_tu->b1_transquant_bypass = 0;
+            ps_tu->b3_size = (ps_sps->i1_log2_ctb_size - 2);
+            ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+            ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+            ps_tu->b6_luma_intra_mode   = INTRA_PRED_NONE;
+            ps_tu->b1_first_tu_in_cu = 1;
+
+            ps_codec->s_parse.ps_tu++;
+            ps_codec->s_parse.s_cu.i4_tu_cnt++;
+            ps_codec->s_parse.i4_pic_tu_idx++;
+
+            ps_codec->s_parse.s_cu.i4_pred_mode = PRED_MODE_SKIP;
+            ps_codec->s_parse.s_cu.i4_part_mode = PART_2Nx2N;
+
+            ps_pu->b2_part_idx = 0;
+            ps_pu->b4_pos_x = 0;
+            ps_pu->b4_pos_y = 0;
+            ps_pu->b4_wd = (ctb_size >> 2) - 1;
+            ps_pu->b4_ht = (ctb_size >> 2) - 1;
+            ps_pu->b1_intra_flag = 0;
+            ps_pu->b3_part_mode = ps_codec->s_parse.s_cu.i4_part_mode;
+            ps_pu->b1_merge_flag = 1;
+            ps_pu->b3_merge_idx = 0;
+
+            ps_codec->s_parse.ps_pu++;
+            ps_codec->s_parse.i4_pic_pu_idx++;
+
+        }
+
+        if(0 == ps_codec->i4_slice_error)
+            end_of_slice_flag = ihevcd_cabac_decode_terminate(&ps_codec->s_parse.s_cabac, &ps_codec->s_parse.s_bitstrm);
+        else
+            end_of_slice_flag = 0;
+
+        AEV_TRACE("end_of_slice_flag", end_of_slice_flag, ps_codec->s_parse.s_cabac.u4_range);
+
+
+        /* In case of tiles or entropy sync, terminate cabac and copy cabac context backed up at the end of top-right CTB */
+        if(ps_pps->i1_tiles_enabled_flag || ps_pps->i1_entropy_coding_sync_enabled_flag)
+        {
+            WORD32 end_of_tile = 0;
+            WORD32 end_of_tile_row = 0;
+
+            /* Back up the cabac context models at the end of the second CTB in a row if entropy sync or tiles are enabled */
+            if(ps_pps->i1_entropy_coding_sync_enabled_flag || ps_pps->i1_tiles_enabled_flag)
+            {
+                if(1 == ps_codec->s_parse.i4_ctb_x)
+                {
+                    WORD32 size = sizeof(ps_codec->s_parse.s_cabac.au1_ctxt_models);
+                    memcpy(&ps_codec->s_parse.s_cabac.au1_ctxt_models_sync, &ps_codec->s_parse.s_cabac.au1_ctxt_models, size);
+                }
+            }
+
+            /* Since tiles and entropy sync cannot both be enabled, the end-of-tile and end-of-row checks below do not conflict */
+            if((ps_codec->s_parse.i4_ctb_tile_x + 1) == (ps_tile->u2_wd))
+            {
+                end_of_tile_row = 1;
+                if((ps_codec->s_parse.i4_ctb_tile_y + 1) == ps_tile->u2_ht)
+                    end_of_tile = 1;
+            }
+            if((0 == end_of_slice_flag) &&
+                            ((ps_pps->i1_tiles_enabled_flag && end_of_tile) ||
+                                            (ps_pps->i1_entropy_coding_sync_enabled_flag && end_of_tile_row)))
+            {
+                WORD32 end_of_sub_stream_one_bit;
+                end_of_sub_stream_one_bit = ihevcd_cabac_decode_terminate(&ps_codec->s_parse.s_cabac, &ps_codec->s_parse.s_bitstrm);
+                AEV_TRACE("end_of_sub_stream_one_bit", end_of_sub_stream_one_bit, ps_codec->s_parse.s_cabac.u4_range);
+
+                /* TODO: Remove the check for offset when HM is updated to include a byte unconditionally even for aligned location */
+                /* This check is not needed for Ittiam encoder streams, but is required for HM9.1 streams */
+                if(ps_codec->s_parse.s_bitstrm.u4_bit_ofst % 8)
+                    ihevcd_bits_flush_to_byte_boundary(&ps_codec->s_parse.s_bitstrm);
+
+                UNUSED(end_of_sub_stream_one_bit);
+            }
+        }
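+        /* Store the cumulative PU/TU counts against the next CTB in parse
+         * order so that a later stage can derive a CTB's PU/TU count as
+         * pu4_pic_pu_idx[next] - pu4_pic_pu_idx[cur]. This is tile-aware:
+         * at the end of a tile row the next CTB is the first CTB of the
+         * next row in the same tile, or of the next tile. */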
+        {
+            WORD32 ctb_indx;
+
+            ctb_addr = ps_codec->s_parse.i4_ctb_y * num_ctb_in_row + ps_codec->s_parse.i4_ctb_x;
+
+            ctb_indx = ++ctb_addr;
+
+            /* Store pu_idx for next CTB in frame level pu_idx array */
+
+            //In case of multiple tiles, when the end of a tile row is reached
+            if((ps_tile->u2_wd == (ps_codec->s_parse.i4_ctb_tile_x + 1)) && (ps_tile->u2_wd != ps_sps->i2_pic_wd_in_ctb))
+            {
+                ctb_indx = (ps_sps->i2_pic_wd_in_ctb * (ps_codec->s_parse.i4_ctb_tile_y + 1 + ps_tile->u1_pos_y)) + ps_tile->u1_pos_x; //idx is the beginning of next row in current tile.
+                if(ps_tile->u2_ht == (ps_codec->s_parse.i4_ctb_tile_y + 1))
+                {
+                    //If the current ctb is the last tile's last ctb
+                    if((ps_tile->u2_wd + ps_tile->u1_pos_x == ps_sps->i2_pic_wd_in_ctb) && ((ps_tile->u2_ht + ps_tile->u1_pos_y == ps_sps->i2_pic_ht_in_ctb)))
+                    {
+                        ctb_indx = ctb_addr; //Next continuous ctb address
+                    }
+                    else //Not the last tile's end, but a tile end
+                    {
+                        tile_t *ps_next_tile = ps_codec->s_parse.ps_tile + 1;
+                        ctb_indx = ps_next_tile->u1_pos_x + (ps_next_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb); //idx is the beginning of first row in next tile.
+                    }
+                }
+            }
+
+            ps_codec->s_parse.pu4_pic_pu_idx[ctb_indx] = ps_codec->s_parse.i4_pic_pu_idx;
+            ps_codec->s_parse.i4_next_pu_ctb_cnt = ctb_indx;
+
+            ps_codec->s_parse.pu1_pu_map += num_min4x4_in_ctb;
+
+            /* Store tu_idx for next CTB in frame level tu_idx array */
+            if(1 == ps_codec->i4_num_cores)
+            {
+                ctb_indx = (0 == ctb_addr % RESET_TU_BUF_NCTB) ?
+                                RESET_TU_BUF_NCTB : ctb_addr % RESET_TU_BUF_NCTB;
+
+                //In case of multiple tiles, when the end of a tile row is reached
+                if((ps_tile->u2_wd == (ps_codec->s_parse.i4_ctb_tile_x + 1)) && (ps_tile->u2_wd != ps_sps->i2_pic_wd_in_ctb))
+                {
+                    ctb_indx = (ps_sps->i2_pic_wd_in_ctb * (ps_codec->s_parse.i4_ctb_tile_y + 1 + ps_tile->u1_pos_y)) + ps_tile->u1_pos_x; //idx is the beginning of next row in current tile.
+                    if(ps_tile->u2_ht == (ps_codec->s_parse.i4_ctb_tile_y + 1))
+                    {
+                        //If the current ctb is the last tile's last ctb
+                        if((ps_tile->u2_wd + ps_tile->u1_pos_x == ps_sps->i2_pic_wd_in_ctb) && ((ps_tile->u2_ht + ps_tile->u1_pos_y == ps_sps->i2_pic_ht_in_ctb)))
+                        {
+                            ctb_indx = (0 == ctb_addr % RESET_TU_BUF_NCTB) ?
+                                            RESET_TU_BUF_NCTB : ctb_addr % RESET_TU_BUF_NCTB;
+                        }
+                        else  //Not the last tile's end, but a tile end
+                        {
+                            tile_t *ps_next_tile = ps_codec->s_parse.ps_tile + 1;
+                            ctb_indx =  ps_next_tile->u1_pos_x + (ps_next_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb); //idx is the beginning of first row in next tile.
+                        }
+                    }
+                }
+                ps_codec->s_parse.i4_next_tu_ctb_cnt = ctb_indx;
+                ps_codec->s_parse.pu4_pic_tu_idx[ctb_indx] = ps_codec->s_parse.i4_pic_tu_idx;
+            }
+            else
+            {
+                ctb_indx = ctb_addr;
+                if((ps_tile->u2_wd == (ps_codec->s_parse.i4_ctb_tile_x + 1)) && (ps_tile->u2_wd != ps_sps->i2_pic_wd_in_ctb))
+                {
+                    ctb_indx = (ps_sps->i2_pic_wd_in_ctb * (ps_codec->s_parse.i4_ctb_tile_y + 1 + ps_tile->u1_pos_y)) + ps_tile->u1_pos_x; //idx is the beginning of next row in current tile.
+                    if(ps_tile->u2_ht == (ps_codec->s_parse.i4_ctb_tile_y + 1))
+                    {
+                        //If the current ctb is the last tile's last ctb
+                        if((ps_tile->u2_wd + ps_tile->u1_pos_x == ps_sps->i2_pic_wd_in_ctb) && ((ps_tile->u2_ht + ps_tile->u1_pos_y == ps_sps->i2_pic_ht_in_ctb)))
+                        {
+                            ctb_indx = ctb_addr;
+                        }
+                        else  //Not the last tile's end, but a tile end
+                        {
+                            tile_t *ps_next_tile = ps_codec->s_parse.ps_tile + 1;
+                            ctb_indx =  ps_next_tile->u1_pos_x + (ps_next_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb); //idx is the beginning of first row in next tile.
+                        }
+                    }
+                }
+                ps_codec->s_parse.i4_next_tu_ctb_cnt = ctb_indx;
+                ps_codec->s_parse.pu4_pic_tu_idx[ctb_indx] = ps_codec->s_parse.i4_pic_tu_idx;
+            }
+            ps_codec->s_parse.pu1_tu_map += num_min4x4_in_ctb;
+        }
+
+
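+        /* When the core count is at or below MV_PRED_NUM_CORES_THRESHOLD,
+         * MV prediction and boundary strength are computed inline in the
+         * parse loop; with more cores this work is presumably left to the
+         * processing threads. */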
+        if(ps_codec->i4_num_cores <= MV_PRED_NUM_CORES_THRESHOLD)
+        {
+            /*************************************************/
+            /****************   MV pred **********************/
+            /*************************************************/
+            WORD8 u1_top_ctb_avail = 1;
+            WORD8 u1_left_ctb_avail = 1;
+            WORD8 u1_top_lt_ctb_avail = 1;
+            WORD8 u1_top_rt_ctb_avail = 1;
+            WORD16 i2_wd_in_ctb;
+
+            tile_start_ctb_idx = ps_tile->u1_pos_x
+                            + ps_tile->u1_pos_y * (ps_sps->i2_pic_wd_in_ctb);
+
+            slice_start_ctb_idx =  ps_slice_hdr->i2_ctb_x
+                            + ps_slice_hdr->i2_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+            if((slice_start_ctb_idx < tile_start_ctb_idx))
+            {
+                //Slices span across multiple tiles.
+                i2_wd_in_ctb = ps_sps->i2_pic_wd_in_ctb;
+            }
+            else
+            {
+                i2_wd_in_ctb = ps_tile->u2_wd;
+            }
+            /* slice and tile boundaries */
+            if((0 == ps_codec->s_parse.i4_ctb_y) || (0 == ps_codec->s_parse.i4_ctb_tile_y))
+            {
+                u1_top_ctb_avail = 0;
+                u1_top_lt_ctb_avail = 0;
+                u1_top_rt_ctb_avail = 0;
+            }
+
+            if((0 == ps_codec->s_parse.i4_ctb_x) || (0 == ps_codec->s_parse.i4_ctb_tile_x))
+            {
+                u1_left_ctb_avail = 0;
+                u1_top_lt_ctb_avail = 0;
+                if((0 == ps_codec->s_parse.i4_ctb_slice_y) || (0 == ps_codec->s_parse.i4_ctb_tile_y))
+                {
+                    u1_top_ctb_avail = 0;
+                    if((i2_wd_in_ctb - 1) != ps_codec->s_parse.i4_ctb_slice_x) //TODO: For tile, not implemented
+                    {
+                        u1_top_rt_ctb_avail = 0;
+                    }
+                }
+            }
+            /* For slices not beginning at the start of a CTB row */
+            else if(ps_codec->s_parse.i4_ctb_x > 0)
+            {
+                if((0 == ps_codec->s_parse.i4_ctb_slice_y) || (0 == ps_codec->s_parse.i4_ctb_tile_y))
+                {
+                    u1_top_ctb_avail = 0;
+                    u1_top_lt_ctb_avail = 0;
+                    if(0 == ps_codec->s_parse.i4_ctb_slice_x)
+                    {
+                        u1_left_ctb_avail = 0;
+                    }
+                    if((i2_wd_in_ctb - 1) != ps_codec->s_parse.i4_ctb_slice_x)
+                    {
+                        u1_top_rt_ctb_avail = 0;
+                    }
+                }
+                else if((1 == ps_codec->s_parse.i4_ctb_slice_y) && (0 == ps_codec->s_parse.i4_ctb_slice_x))
+                {
+                    u1_top_lt_ctb_avail = 0;
+                }
+            }
+
+            if(((ps_sps->i2_pic_wd_in_ctb - 1) == ps_codec->s_parse.i4_ctb_x) || ((ps_tile->u2_wd - 1) == ps_codec->s_parse.i4_ctb_tile_x))
+            {
+                u1_top_rt_ctb_avail = 0;
+            }
+
+            if(PSLICE == ps_slice_hdr->i1_slice_type
+                            || BSLICE == ps_slice_hdr->i1_slice_type)
+            {
+                mv_ctxt_t s_mv_ctxt;
+                process_ctxt_t *ps_proc;
+                UWORD32 *pu4_ctb_top_pu_idx;
+                UWORD32 *pu4_ctb_left_pu_idx;
+                UWORD32 *pu4_ctb_top_left_pu_idx;
+                WORD32 i4_ctb_pu_cnt;
+                WORD32 cur_ctb_idx;
+                WORD32 next_ctb_idx;
+                WORD32 cur_pu_idx;
+                ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+                cur_ctb_idx = ps_codec->s_parse.i4_ctb_x
+                                + ps_codec->s_parse.i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+                next_ctb_idx = ps_codec->s_parse.i4_next_pu_ctb_cnt;
+                i4_ctb_pu_cnt = ps_codec->s_parse.pu4_pic_pu_idx[next_ctb_idx]
+                                - ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+
+                cur_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+
+                pu4_ctb_top_pu_idx = ps_proc->pu4_pic_pu_idx_top
+                                + (ps_codec->s_parse.i4_ctb_x * ctb_size / MIN_PU_SIZE);
+                pu4_ctb_left_pu_idx = ps_proc->pu4_pic_pu_idx_left;
+                pu4_ctb_top_left_pu_idx = &ps_proc->u4_ctb_top_left_pu_idx;
+
+                /* Initializing s_mv_ctxt */
+                {
+                    s_mv_ctxt.ps_pps = ps_pps;
+                    s_mv_ctxt.ps_sps = ps_sps;
+                    s_mv_ctxt.ps_slice_hdr = ps_slice_hdr;
+                    s_mv_ctxt.i4_ctb_x = ps_codec->s_parse.i4_ctb_x;
+                    s_mv_ctxt.i4_ctb_y = ps_codec->s_parse.i4_ctb_y;
+                    s_mv_ctxt.ps_pu = &ps_codec->s_parse.ps_pic_pu[cur_pu_idx];
+                    s_mv_ctxt.ps_pic_pu = ps_codec->s_parse.ps_pic_pu;
+                    s_mv_ctxt.ps_tile = ps_tile;
+                    s_mv_ctxt.pu4_pic_pu_idx_map = ps_proc->pu4_pic_pu_idx_map;
+                    s_mv_ctxt.pu4_pic_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx;
+                    s_mv_ctxt.pu1_pic_pu_map = ps_codec->s_parse.pu1_pic_pu_map;
+                    s_mv_ctxt.i4_ctb_pu_cnt = i4_ctb_pu_cnt;
+                    s_mv_ctxt.i4_ctb_start_pu_idx = cur_pu_idx;
+                    s_mv_ctxt.u1_top_ctb_avail = u1_top_ctb_avail;
+                    s_mv_ctxt.u1_top_rt_ctb_avail = u1_top_rt_ctb_avail;
+                    s_mv_ctxt.u1_top_lt_ctb_avail = u1_top_lt_ctb_avail;
+                    s_mv_ctxt.u1_left_ctb_avail = u1_left_ctb_avail;
+                }
+
+                ihevcd_get_mv_ctb(&s_mv_ctxt, pu4_ctb_top_pu_idx,
+                                  pu4_ctb_left_pu_idx, pu4_ctb_top_left_pu_idx);
+
+            }
+            else
+            {
+                WORD32 num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+                UWORD8 *pu1_pic_pu_map_ctb = ps_codec->s_parse.pu1_pic_pu_map +
+                                (ps_codec->s_parse.i4_ctb_x + ps_codec->s_parse.i4_ctb_y * ps_sps->i2_pic_wd_in_ctb) * num_minpu_in_ctb;
+                process_ctxt_t *ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+                WORD32 row, col;
+                WORD32 pu_cnt;
+                WORD32 num_pu_per_ctb;
+                WORD32 cur_ctb_idx;
+                WORD32 next_ctb_idx;
+                WORD32 ctb_start_pu_idx;
+                UWORD32 *pu4_nbr_pu_idx = ps_proc->pu4_pic_pu_idx_map;
+                WORD32 nbr_pu_idx_strd = MAX_CTB_SIZE / MIN_PU_SIZE + 2;
+                pu_t *ps_pu;
+
+                for(row = 0; row < ctb_size / MIN_PU_SIZE; row++)
+                {
+                    for(col = 0; col < ctb_size / MIN_PU_SIZE; col++)
+                    {
+                        pu1_pic_pu_map_ctb[row * ctb_size / MIN_PU_SIZE + col] = 0;
+                    }
+                }
+
+
+                /* Neighbor PU idx update inside CTB */
+                /* 1byte per 4x4. Indicates the PU idx that 4x4 block belongs to */
+
+                cur_ctb_idx = ps_codec->s_parse.i4_ctb_x
+                                + ps_codec->s_parse.i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+                next_ctb_idx = ps_codec->s_parse.i4_next_pu_ctb_cnt;
+                num_pu_per_ctb = ps_codec->s_parse.pu4_pic_pu_idx[next_ctb_idx]
+                                - ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+                ctb_start_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+                ps_pu = &ps_codec->s_parse.ps_pic_pu[ctb_start_pu_idx];
+
+                for(pu_cnt = 0; pu_cnt < num_pu_per_ctb; pu_cnt++, ps_pu++)
+                {
+                    UWORD32 cur_pu_idx;
+                    WORD32 pu_ht = (ps_pu->b4_ht + 1) << 2;
+                    WORD32 pu_wd = (ps_pu->b4_wd + 1) << 2;
+
+                    cur_pu_idx = ctb_start_pu_idx + pu_cnt;
+
+                    for(row = 0; row < pu_ht / MIN_PU_SIZE; row++)
+                        for(col = 0; col < pu_wd / MIN_PU_SIZE; col++)
+                            pu4_nbr_pu_idx[(1 + ps_pu->b4_pos_x + col)
+                                            + (1 + ps_pu->b4_pos_y + row)
+                                            * nbr_pu_idx_strd] =
+                                            cur_pu_idx;
+                }
+
+                /* Updating Top and Left pointers */
+                {
+                    WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples
+                                    - (ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size);
+                    WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
+
+                    /* Top Left */
+                    /* saving top left before updating top ptr, as updating top ptr will overwrite the top left for the next ctb */
+                    ps_proc->u4_ctb_top_left_pu_idx = ps_proc->pu4_pic_pu_idx_top[(ps_codec->s_parse.i4_ctb_x * ctb_size / MIN_PU_SIZE) + ctb_size / MIN_PU_SIZE - 1];
+                    for(i = 0; i < ctb_size / MIN_PU_SIZE; i++)
+                    {
+                        /* Left */
+                        /* Last column of au4_nbr_pu_idx */
+                        ps_proc->pu4_pic_pu_idx_left[i] = pu4_nbr_pu_idx[(ctb_size / MIN_PU_SIZE)
+                                        + (i + 1) * nbr_pu_idx_strd];
+                        /* Top */
+                        /* Last row of au4_nbr_pu_idx */
+                        ps_proc->pu4_pic_pu_idx_top[(ps_codec->s_parse.i4_ctb_x * ctb_size / MIN_PU_SIZE) + i] =
+                                        pu4_nbr_pu_idx[(ctb_size_left / MIN_PU_SIZE) * nbr_pu_idx_strd + i + 1];
+
+                    }
+                }
+            }
+
+            /*************************************************/
+            /******************  BS, QP  *********************/
+            /*************************************************/
+            /* Check if deblocking is disabled for the current slice, or
+             * disabled for the current picture via the disable-deblock API
+             */
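+            /* When deblocking is skipped for this CTB (slice-level disable
+             * or an erroneous slice), the else-branch below zeroes the
+             * boundary strength words instead, so the deblock stage leaves
+             * these edges untouched. */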
+            if(0 == ps_codec->i4_disable_deblk_pic)
+            {
+                if((0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag) &&
+                                (0 == ps_codec->i4_slice_error))
+                {
+                    WORD32 i4_ctb_tu_cnt;
+                    WORD32 cur_ctb_idx, next_ctb_idx;
+                    WORD32 cur_pu_idx;
+                    WORD32 cur_tu_idx;
+                    process_ctxt_t *ps_proc;
+
+                    ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+                    cur_ctb_idx = ps_codec->s_parse.i4_ctb_x
+                                    + ps_codec->s_parse.i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+                    cur_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+                    next_ctb_idx = ps_codec->s_parse.i4_next_tu_ctb_cnt;
+                    if(1 == ps_codec->i4_num_cores)
+                    {
+                        i4_ctb_tu_cnt = ps_codec->s_parse.pu4_pic_tu_idx[next_ctb_idx] -
+                                        ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx % RESET_TU_BUF_NCTB];
+
+                        cur_tu_idx = ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx % RESET_TU_BUF_NCTB];
+                    }
+                    else
+                    {
+                        i4_ctb_tu_cnt = ps_codec->s_parse.pu4_pic_tu_idx[next_ctb_idx] -
+                                        ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx];
+
+                        cur_tu_idx = ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx];
+                    }
+
+                    ps_codec->s_parse.s_bs_ctxt.ps_pps = ps_codec->s_parse.ps_pps;
+                    ps_codec->s_parse.s_bs_ctxt.ps_sps = ps_codec->s_parse.ps_sps;
+                    ps_codec->s_parse.s_bs_ctxt.ps_codec = ps_codec;
+                    ps_codec->s_parse.s_bs_ctxt.i4_ctb_tu_cnt = i4_ctb_tu_cnt;
+                    ps_codec->s_parse.s_bs_ctxt.i4_ctb_x = ps_codec->s_parse.i4_ctb_x;
+                    ps_codec->s_parse.s_bs_ctxt.i4_ctb_y = ps_codec->s_parse.i4_ctb_y;
+                    ps_codec->s_parse.s_bs_ctxt.i4_ctb_tile_x = ps_codec->s_parse.i4_ctb_tile_x;
+                    ps_codec->s_parse.s_bs_ctxt.i4_ctb_tile_y = ps_codec->s_parse.i4_ctb_tile_y;
+                    ps_codec->s_parse.s_bs_ctxt.i4_ctb_slice_x = ps_codec->s_parse.i4_ctb_slice_x;
+                    ps_codec->s_parse.s_bs_ctxt.i4_ctb_slice_y = ps_codec->s_parse.i4_ctb_slice_y;
+                    ps_codec->s_parse.s_bs_ctxt.ps_tu = &ps_codec->s_parse.ps_pic_tu[cur_tu_idx];
+                    ps_codec->s_parse.s_bs_ctxt.ps_pu = &ps_codec->s_parse.ps_pic_pu[cur_pu_idx];
+                    ps_codec->s_parse.s_bs_ctxt.pu4_pic_pu_idx_map = ps_proc->pu4_pic_pu_idx_map;
+                    ps_codec->s_parse.s_bs_ctxt.i4_next_pu_ctb_cnt = ps_codec->s_parse.i4_next_pu_ctb_cnt;
+                    ps_codec->s_parse.s_bs_ctxt.i4_next_tu_ctb_cnt = ps_codec->s_parse.i4_next_tu_ctb_cnt;
+                    ps_codec->s_parse.s_bs_ctxt.pu1_slice_idx = ps_codec->s_parse.pu1_slice_idx;
+                    ps_codec->s_parse.s_bs_ctxt.ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+                    ps_codec->s_parse.s_bs_ctxt.ps_tile = ps_codec->s_parse.ps_tile;
+
+                    if(ISLICE == ps_slice_hdr->i1_slice_type)
+                    {
+                        ihevcd_ctb_boundary_strength_islice(&ps_codec->s_parse.s_bs_ctxt);
+                    }
+                    else
+                    {
+                        ihevcd_ctb_boundary_strength_pbslice(&ps_codec->s_parse.s_bs_ctxt);
+                    }
+                }
+                else
+                {
+                    WORD32 vert_bs_strd = ps_sps->i2_pic_wd_in_ctb * (ctb_size * ctb_size / 8 / 16);
+                    WORD32 horz_bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) * (ctb_size * ctb_size / 8 / 16);
+                    UWORD32 *pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs +
+                                    ps_codec->s_parse.i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
+                                    ps_codec->s_parse.i4_ctb_y * vert_bs_strd);
+                    UWORD32 *pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs +
+                                    ps_codec->s_parse.i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
+                                    ps_codec->s_parse.i4_ctb_y * horz_bs_strd);
+
+                    memset(pu4_vert_bs, 0, (ctb_size / 8 + 1) * (ctb_size / 4) / 8 * 2);
+                    memset(pu4_horz_bs, 0, (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
+
+                }
+            }
+
+        }
+
+
+        /* Update the parse status map */
+        {
+            sps_t *ps_sps = ps_codec->s_parse.ps_sps;
+            UWORD8 *pu1_buf;
+            WORD32 idx;
+            idx = (ps_codec->s_parse.i4_ctb_x);
+            idx += ((ps_codec->s_parse.i4_ctb_y) * ps_sps->i2_pic_wd_in_ctb);
+            pu1_buf = (ps_codec->pu1_parse_map + idx);
+            *pu1_buf = 1;
+        }
+
+        /* Increment CTB x and y positions */
+        ps_codec->s_parse.i4_ctb_tile_x++;
+        ps_codec->s_parse.i4_ctb_x++;
+        ps_codec->s_parse.i4_ctb_slice_x++;
+
+        /* If tiles are enabled, handle the slice counters differently */
+        if(ps_pps->i1_tiles_enabled_flag)
+        {
+            //Indicates multiple tiles in a slice case
+            tile_start_ctb_idx = ps_tile->u1_pos_x
+                            + ps_tile->u1_pos_y * (ps_sps->i2_pic_wd_in_ctb);
+
+            slice_start_ctb_idx =  ps_slice_hdr->i2_ctb_x
+                            + ps_slice_hdr->i2_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+            if((slice_start_ctb_idx < tile_start_ctb_idx))
+            {
+                if(ps_codec->s_parse.i4_ctb_slice_x == (ps_tile->u1_pos_x + ps_tile->u2_wd))
+                {
+                    /* Reached end of slice row within a tile/frame */
+                    ps_codec->s_parse.i4_ctb_slice_y++;
+                    ps_codec->s_parse.i4_ctb_slice_x = ps_tile->u1_pos_x; //TODO: Check
+                }
+            }
+            //Indicates multiple slices in a tile case - hence, reset slice_x
+            else if(ps_codec->s_parse.i4_ctb_slice_x == (ps_tile->u2_wd))
+            {
+                ps_codec->s_parse.i4_ctb_slice_y++;
+                ps_codec->s_parse.i4_ctb_slice_x = 0;
+            }
+        }
+        else
+        {
+            if(ps_codec->s_parse.i4_ctb_slice_x == ps_tile->u2_wd)
+            {
+                /* Reached end of slice row within a tile/frame */
+                ps_codec->s_parse.i4_ctb_slice_y++;
+                ps_codec->s_parse.i4_ctb_slice_x = 0;
+            }
+        }
+
+
+        if(ps_codec->s_parse.i4_ctb_tile_x == (ps_tile->u2_wd))
+        {
+            /* Reached end of tile row */
+            ps_codec->s_parse.i4_ctb_tile_x = 0;
+            ps_codec->s_parse.i4_ctb_x = ps_tile->u1_pos_x;
+
+            ps_codec->s_parse.i4_ctb_tile_y++;
+            ps_codec->s_parse.i4_ctb_y++;
+
+            if(ps_codec->s_parse.i4_ctb_tile_y == (ps_tile->u2_ht))
+            {
+                /* Reached End of Tile */
+                ps_codec->s_parse.i4_ctb_tile_y = 0;
+                ps_codec->s_parse.i4_ctb_tile_x = 0;
+                ps_codec->s_parse.ps_tile++;
+
+                if((ps_tile->u2_ht + ps_tile->u1_pos_y  ==  ps_sps->i2_pic_ht_in_ctb) && (ps_tile->u2_wd + ps_tile->u1_pos_x  ==  ps_sps->i2_pic_wd_in_ctb))
+                {
+                    /* Reached end of frame */
+                    end_of_pic = 1;
+                    ps_codec->s_parse.i4_ctb_x = 0;
+                    ps_codec->s_parse.i4_ctb_y = ps_sps->i2_pic_ht_in_ctb;
+                }
+                else
+                {
+                    /* Initialize ctb_x and ctb_y to start of next tile */
+                    ps_tile = ps_codec->s_parse.ps_tile;
+                    ps_codec->s_parse.i4_ctb_x = ps_tile->u1_pos_x;
+                    ps_codec->s_parse.i4_ctb_y = ps_tile->u1_pos_y;
+                    ps_codec->s_parse.i4_ctb_tile_y = 0;
+                    ps_codec->s_parse.i4_ctb_tile_x = 0;
+                    ps_codec->s_parse.i4_ctb_slice_x = ps_tile->u1_pos_x;
+                    ps_codec->s_parse.i4_ctb_slice_y = ps_tile->u1_pos_y;
+
+                }
+            }
+
+        }
+
+        ps_codec->s_parse.i4_next_ctb_indx = ps_codec->s_parse.i4_ctb_x +
+                        ps_codec->s_parse.i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+
+        /* If the current slice is in error, check if the next slice's address
+         * is reached and mark the end_of_slice flag */
+        if(ps_codec->i4_slice_error)
+        {
+            slice_header_t *ps_slice_hdr_next = ps_slice_hdr + 1;
+            WORD32 next_slice_addr = ps_slice_hdr_next->i2_ctb_x +
+                            ps_slice_hdr_next->i2_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+
+            if(ps_codec->s_parse.i4_next_ctb_indx == next_slice_addr)
+                end_of_slice_flag = 1;
+        }
+
+#ifndef GPU_BUILD
+        /* If the codec is running in single core mode
+         * then call process function for current CTB
+         */
+        if((1 == ps_codec->i4_num_cores) && (ps_codec->s_parse.i4_ctb_tile_x == 0))
+        {
+            process_ctxt_t *ps_proc = &ps_codec->as_process[0];
+//          ps_proc->i4_ctb_cnt = ihevcd_nctb_cnt(ps_codec, ps_sps);
+            ps_proc->i4_ctb_cnt = ps_proc->ps_tile->u2_wd;
+            ihevcd_process(ps_proc);
+        }
+#else
+        /* Now call the function that will populate MC data for the
+         * current CTB.
+         */
+        if(ps_codec->u4_gpu_enabled) // == ps_codec->i4_num_cores)
+        {
+            process_ctxt_t *ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+            WORD32 nctb_mc = 1;
+            WORD32 cur_ctb_idx;
+            WORD32 cur_pu_idx;
+            //ps_proc->i4_ctb_cnt = ihevcd_nctb_cnt(ps_codec, ps_sps);
+            //ihevcd_process(ps_proc);
+            cur_ctb_idx = ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+            cur_pu_idx = ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+            ps_proc->ps_pu = &ps_proc->ps_pic_pu[cur_pu_idx];
+            ps_proc->ps_slice_hdr = ps_slice_hdr;
+
+            if(ISLICE != ps_slice_hdr->i1_slice_type)
+                ihevcd_gpu_mc_populate_data_nctb(ps_proc, nctb_mc);
+
+            ps_proc->i4_ctb_x      += nctb_mc;
+            ps_proc->i4_ctb_cnt    -= nctb_mc;
+            ps_proc->i4_ctb_tile_x += nctb_mc;
+        }
+
+        total_ctb_cnt++;
+        ps_gpu->i4_curr_grain_ctb_cnt++;
+        if(1)
+        {
+
+            if(ps_gpu->i4_curr_grain_ctb_cnt == ps_gpu->ai4_ctbs_in_grain[ps_gpu->i4_curr_grain_idx])
+            {
+                process_ctxt_t *ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+
+                if(ps_codec->u4_gpu_enabled)
+                    ihevcd_gpu_mc_execute(ps_proc);
+
+
+#if 1
+                if(1 < ps_codec->i4_num_cores)
+                {
+                    IHEVCD_ERROR_T ret;
+                    WORD32 i, cnt = ps_gpu->ai4_grain_ht_in_ctb[ps_gpu->i4_curr_grain_idx];
+                    WORD32 ctb_y_idx = 0;
+
+                    for(i = 0; i < ps_gpu->i4_curr_grain_idx; i++)
+                        ctb_y_idx += ps_gpu->ai4_grain_ht_in_ctb[i];
+
+                    if(ps_gpu->i4_curr_grain_idx == 0)
+                        cnt--;
+                    else if(ps_gpu->i4_curr_grain_idx == (GRANULARITY - 1))
+                        cnt++;
+
+                    if(ps_gpu->i4_curr_grain_idx != 0)
+                        ctb_y_idx--;
+
+                    for(i = 0; i < cnt; i++)
+                    {
+                        s_job.i4_cmd    = CMD_PROCESS;
+                        s_job.i2_ctb_cnt = (WORD16)ps_sps->i2_pic_wd_in_ctb;
+                        s_job.i2_ctb_x = 0; //(WORD16)ps_codec->s_parse.i4_ctb_tile_x;
+                        s_job.i2_ctb_y = (WORD16)ctb_y_idx;
+                        s_job.i2_slice_idx = (WORD16)ps_codec->s_parse.i4_cur_slice_idx;
+                        s_job.i4_tu_coeff_data_ofst = ps_gpu->ai4_tu_coeff_data_ofst[ctb_y_idx];
+                        s_job.i2_granularity_idx = ps_gpu->i4_curr_grain_idx;
+                        s_job.i2_slice_idx = (WORD16)ps_gpu->ai4_cur_slice_idx[ctb_y_idx];
+
+                        printf("Queued ctb y row %d\n", ctb_y_idx);
+
+                        if((i == 0) && (ps_codec->u4_gpu_enabled))
+                            s_job.i2_wait = 1;
+                        else
+                            s_job.i2_wait = 0;
+
+                        ret = ihevcd_jobq_queue(ps_codec->s_parse.pv_proc_jobq, &s_job, sizeof(proc_job_t), 1);
+                        ASSERT(ret == IHEVC_SUCCESS);
+                        ctb_y_idx++;
+
+                    }
+                }
+#endif
+                ps_gpu->i4_curr_grain_ctb_cnt = 0;
+                ps_gpu->i4_curr_grain_idx++;
+
+            }
+        }
+#endif
+
+        /* If the bytes for the current slice are exhausted,
+         * set end_of_slice flag to 1;
+         * this slice will be treated as incomplete */
+        if((UWORD8 *)ps_codec->s_parse.s_bitstrm.pu1_buf_max + BITSTRM_OFF_THRS <
+                                        ((UWORD8 *)ps_codec->s_parse.s_bitstrm.pu4_buf + (ps_codec->s_parse.s_bitstrm.u4_bit_ofst / 8)))
+        {
+            // end_of_slice_flag = ps_codec->i4_slice_error ? 0 : 1;
+
+            if(0 == ps_codec->i4_slice_error)
+                end_of_slice_flag = 1;
+        }
+
+
+        if(end_of_pic)
+            break;
+    } while(!end_of_slice_flag);
+
+    /* Increment the slice index for parsing next slice */
+    if(0 == end_of_pic)
+    {
+#ifdef GPU_BUILD
+        // TODO GPU : The following logic needs different implementation.
+#endif
+        while(1)
+        {
+
+            WORD32 parse_slice_idx;
+#ifdef GPU_BUILD
+            WORD32 min_proc_slice_idx;
+            WORD32 proc_idx = (ps_codec->u4_parsing_view * 2) + (ps_codec->i4_num_cores - 1);
+            /* Identify the min slice index currently in use by processing threads */
+            min_proc_slice_idx = ps_codec->as_process[proc_idx].i4_cur_slice_idx;
+#endif
+            parse_slice_idx = ps_codec->s_parse.i4_cur_slice_idx;
+            parse_slice_idx++;
+
+#if 0
+            for(i = 1; i < (ps_codec->i4_num_cores - 1); i++)
+            {
+                if(ps_codec->as_process[i].i4_cur_slice_idx
+                                < min_proc_slice_idx)
+                    min_proc_slice_idx =
+                                    ps_codec->as_process[i].i4_cur_slice_idx;
+
+
+            }
+
+
+            /* If MAX slice header count is reached, then reset the parsing slice idx to zero */
+            if(parse_slice_idx == MAX_SLICE_HDR_CNT)
+            {
+                parse_slice_idx = 0;
+            }
+
+            /* If parse_slice_idx and min_proc_slice_idx are different then break */
+            if(parse_slice_idx != min_proc_slice_idx)
+            {
+                ps_codec->s_parse.i4_cur_slice_idx = parse_slice_idx;
+                break;
+            }
+            else
+            {
+                /* If Processing threads are still using the slice where parsing thread
+                 * has to write next slice data, wait for processing threads to consume that slice
+                 */
+                ithread_yield();
+            }
+#else
+            {
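+                /* Note: the masking below assumes MAX_SLICE_HDR_CNT is a
+                 * power of two, so (idx & (MAX_SLICE_HDR_CNT - 1)) wraps the
+                 * slice index within the slice header ring buffer. */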
+                /* If the next slice header is not initialized, update cur_slice_idx and break */
+                if((1 == ps_codec->i4_num_cores) || (0 != (parse_slice_idx & (MAX_SLICE_HDR_CNT - 1))))
+                {
+                    ps_codec->s_parse.i4_cur_slice_idx = parse_slice_idx;
+                    break;
+                }
+
+                /* If the next slice header is initialized, wait for the parsed slices to be processed */
+                else
+                {
+#ifndef GPU_BUILD
+                    WORD32 ctb_indx = 0;
+
+                    while(ctb_indx != ps_sps->i4_pic_size_in_ctb)
+                    {
+                        WORD32 parse_status = *(ps_codec->pu1_parse_map + ctb_indx);
+                        WORD32 proc_status = *(ps_codec->pu1_proc_map + ctb_indx) & 1;
+
+                        if(parse_status == proc_status)
+                            ctb_indx++;
+                    }
+                    ps_codec->s_parse.i4_cur_slice_idx = parse_slice_idx;
+                    break;
+#else
+                    printf("\nFix this code for multiCore multi-Slice\n");
+                    exit(-1);
+#endif
+                }
+
+            }
+#endif
+        }
+
+    }
+    else
+    {
+#ifdef GPU_BUILD
+        if(1 == ps_codec->i4_num_cores)
+        {
+
+            if(!ps_pps->i1_tiles_enabled_flag)
+            {
+                process_ctxt_t *ps_proc = &ps_codec->as_process[ps_codec->i4_num_cores - 1];
+                WORD32 i;
+                WORD32 tu_coeff_data_ofst = 0;
+                ps_proc->i4_ctb_cnt = total_ctb_cnt;
+                ps_proc->i4_ctb_x   = 0; //ps_codec->s_parse.i4_ctb_tile_x;
+                ps_proc->i4_ctb_y   = 0; //ps_codec->s_parse.i4_ctb_tile_y;
+                ps_proc->i4_cur_slice_idx = ps_gpu->ai4_cur_slice_idx[0]; //ps_codec->s_parse.i4_cur_slice_idx;
+
+                for(i = 0; i < GRANULARITY; i++)
+                {
+                    ps_proc->i4_ctb_cnt = ps_gpu->ai4_ctbs_in_grain[i];
+                    total_ctb_cnt -= ps_gpu->ai4_ctbs_in_grain[i];
+
+//                  if(i == 0)
+//                  {
+//                      ps_proc->i4_ctb_cnt -= ps_sps->i2_pic_wd_in_ctb;
+//                      total_ctb_cnt += ps_sps->i2_pic_wd_in_ctb;
+//                  }
+//                  else if(i == (GRANULARITY - 1))
+//                  {
+//                      ps_proc->i4_ctb_cnt += ps_sps->i2_pic_wd_in_ctb;
+//                      //total_ctb_cnt -= ps_sps->i2_pic_wd_in_ctb;
+//                  }
+
+                    // TODO GPU : Buggy; don't wait for I-slice.
+                    if(ps_codec->u4_gpu_enabled)
+                    {
+                        ihevcd_gpu_mc_wait(ps_proc, i);
+                    }
+
+                    //printf("Calling ihevcd_init_proc_ctxt ps_proc->i4_ctb_cnt = %d\n", ps_proc->i4_ctb_cnt);
+
+                    ihevcd_init_proc_ctxt(ps_proc, tu_coeff_data_ofst);
+                    //printf("ihevcd_process\n");
+                    ihevcd_process(ps_proc);
+                    tu_coeff_data_ofst  = (UWORD8 *)ps_proc->pv_tu_coeff_data - (UWORD8 *)ps_proc->pv_pic_tu_coeff_data;
+                }
+
+
+            }
+            else
+            {
+                process_ctxt_t *ps_proc = &ps_codec->as_process[ps_codec->i4_num_cores - 1];
+                WORD32 i, j, k, l;
+                WORD32 tu_coeff_data_ofst = 0;
+                tile_t *ps_tile = ps_pps->ps_tile;
+
+
+                // ps_proc->i4_cur_slice_idx = ps_gpu->ai4_cur_slice_idx[0];//ps_codec->s_parse.i4_cur_slice_idx;
+                ps_proc->i4_cur_slice_idx = 0;
+
+                i = 0;
+                printf("Processing tile\n");
+                for(j = 0; j < ps_pps->i1_num_tile_rows; j++)
+                {
+                    if(ps_gpu->ai4_grain_pos_y[i] == ps_tile->u1_pos_y)
+                    {
+                        // TODO GPU : Buggy; don't wait for I-slice.
+                        if(ps_codec->u4_gpu_enabled)
+                        {
+                            ihevcd_gpu_mc_wait(ps_proc, i);
+                        }
+                        i++;
+
+                    }
+                    for(k = 0; k < ps_pps->i1_num_tile_columns; k++)
+                    {
+                        ps_proc->i4_ctb_x   = ps_tile->u1_pos_x;
+                        ps_proc->i4_ctb_y   = ps_tile->u1_pos_y;
+
+//                      ps_proc->i4_cur_slice_idx = *(ps_codec->s_parse.pu1_slice_idx + ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb );
+                        for(l = 0; l < ps_tile->u2_ht; l++)
+                        {
+                            ps_proc->i4_ctb_cnt = ps_tile->u2_wd; //* ps_tile->u2_ht;
+
+                            ihevcd_init_proc_ctxt(ps_proc, tu_coeff_data_ofst);
+                            //printf("ihevcd_process\n");
+                            ihevcd_process(ps_proc);
+                            tu_coeff_data_ofst  = (UWORD8 *)ps_proc->pv_tu_coeff_data - (UWORD8 *)ps_proc->pv_pic_tu_coeff_data;
+                        }
+                        ps_tile++;
+                    }
+                }
+
+
+
+            }
+        }
+#endif
+#if FRAME_ILF_PAD
+        if(FRAME_ILF_PAD && 1 == ps_codec->i4_num_cores)
+        {
+            if(ps_slice_hdr->i4_abs_pic_order_cnt == 0)
+            {
+                DUMP_PRE_ILF(ps_codec->as_process[0].pu1_cur_pic_luma,
+                             ps_codec->as_process[0].pu1_cur_pic_chroma,
+                             ps_sps->i2_pic_width_in_luma_samples,
+                             ps_sps->i2_pic_height_in_luma_samples,
+                             ps_codec->i4_strd);
+
+                DUMP_BS(ps_codec->as_process[0].s_bs_ctxt.pu4_pic_vert_bs,
+                        ps_codec->as_process[0].s_bs_ctxt.pu4_pic_horz_bs,
+                        ps_sps->i2_pic_wd_in_ctb * (ctb_size * ctb_size / 8 / 16) * ps_sps->i2_pic_ht_in_ctb,
+                        (ps_sps->i2_pic_wd_in_ctb + 1) * (ctb_size * ctb_size / 8 / 16) * ps_sps->i2_pic_ht_in_ctb);
+
+                DUMP_QP(ps_codec->as_process[0].s_bs_ctxt.pu1_pic_qp,
+                        (ps_sps->i2_pic_height_in_luma_samples * ps_sps->i2_pic_width_in_luma_samples) / (MIN_CU_SIZE * MIN_CU_SIZE));
+
+                DUMP_QP_CONST_IN_CTB(ps_codec->as_process[0].s_bs_ctxt.pu1_pic_qp_const_in_ctb,
+                                     (ps_sps->i2_pic_height_in_luma_samples * ps_sps->i2_pic_width_in_luma_samples) / (MIN_CTB_SIZE * MIN_CTB_SIZE) / 8);
+
+                DUMP_NO_LOOP_FILTER(ps_codec->as_process[0].pu1_pic_no_loop_filter_flag,
+                                    (ps_sps->i2_pic_width_in_luma_samples / MIN_CU_SIZE) * (ps_sps->i2_pic_height_in_luma_samples / MIN_CU_SIZE) / 8);
+
+                DUMP_OFFSETS(ps_slice_hdr->i1_beta_offset_div2,
+                             ps_slice_hdr->i1_tc_offset_div2,
+                             ps_pps->i1_pic_cb_qp_offset,
+                             ps_pps->i1_pic_cr_qp_offset);
+            }
+            ps_codec->s_parse.s_deblk_ctxt.ps_pps = ps_codec->s_parse.ps_pps;
+            ps_codec->s_parse.s_deblk_ctxt.ps_sps = ps_codec->s_parse.ps_sps;
+            ps_codec->s_parse.s_deblk_ctxt.ps_codec = ps_codec;
+            ps_codec->s_parse.s_deblk_ctxt.ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+            ps_codec->s_parse.s_deblk_ctxt.is_chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
+
+            ps_codec->s_parse.s_sao_ctxt.ps_pps = ps_codec->s_parse.ps_pps;
+            ps_codec->s_parse.s_sao_ctxt.ps_sps = ps_codec->s_parse.ps_sps;
+            ps_codec->s_parse.s_sao_ctxt.ps_codec = ps_codec;
+            ps_codec->s_parse.s_sao_ctxt.ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+
+            ihevcd_ilf_pad_frame(&ps_codec->s_parse.s_deblk_ctxt, &ps_codec->s_parse.s_sao_ctxt);
+
+        }
+#endif
+        ps_codec->s_parse.i4_end_of_frame = 1;
+    }
+    return ret;
+}
+
+
+
+
+
+
+
+
diff --git a/decoder/ihevcd_parse_slice.h b/decoder/ihevcd_parse_slice.h
new file mode 100644
index 0000000..ca518f6
--- /dev/null
+++ b/decoder/ihevcd_parse_slice.h
@@ -0,0 +1,43 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_parse_slice.h
+*
+* @brief
+*  Parsing of slice level data
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PARSE_SLICE_H_
+#define _IHEVCD_PARSE_SLICE_H_
+
+
+IHEVCD_ERROR_T ihevcd_parse_mvd(codec_t *ps_codec, mv_t *ps_mv);
+IHEVCD_ERROR_T ihevcd_parse_slice_data(codec_t *ps_codec);
+#endif /* _IHEVCD_PARSE_SLICE_H_ */
diff --git a/decoder/ihevcd_parse_slice_header.c b/decoder/ihevcd_parse_slice_header.c
new file mode 100644
index 0000000..7bb6084
--- /dev/null
+++ b/decoder/ihevcd_parse_slice_header.c
@@ -0,0 +1,1090 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_parse_slice_header.c
+*
+* @brief
+*  Contains functions for parsing slice headers
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_quant_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_parse_headers.h"
+#include "ihevcd_parse_slice_header.h"
+#include "ihevcd_ref_list.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+
+mv_buf_t* ihevcd_mv_mgr_get_poc(buf_mgr_t *ps_mv_buf_mgr, UWORD32 abs_poc);
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses VPS operation point
+*
+* @par   Description
+* Parses VPS operation point as per section 7.3.5
+*
+* @param[out] ps_vps
+*  Pointer to VPS structure
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[in] ops_idx
+*  Operating point index
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_operation_point_set(vps_t *ps_vps, bitstrm_t *ps_bitstrm, WORD32 ops_idx)
+{
+    WORD32 i;
+    WORD32 value;
+    UNUSED(ops_idx);
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    for(i = 0; i <= ps_vps->i1_vps_max_nuh_reserved_zero_layer_id; i++)
+    {
+        BITS_PARSE("layer_id_included_flag[ opsIdx ][ i ]", value, ps_bitstrm, 1);
+        //ps_vps->ai1_layer_id_included_flag[ops_idx][i] = value;
+
+    }
+    UNUSED(value);
+
+    return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses reference picture list modification syntax as per
+*  Section 7.3.8.3
+*
+* @par Description:
+*  Parse the reference picture list modification syntax and update the rplm_t struct
+*
+* @param[in] ps_bitstrm
+*  Pointer to bitstream structure
+*
+* @param[in,out] ps_slice_hdr
+*  Pointer to slice header structure
+*
+* @param[in] num_poc_total_curr
+*  NumPocTotalCurr value derived from the reference picture sets
+*
+* @returns  Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_ref_pic_list_modification(bitstrm_t *ps_bitstrm,
+                                        slice_header_t *ps_slice_hdr,
+                                        WORD32 num_poc_total_curr)
+{
+    WORD32 ret = IHEVCD_SUCCESS;
+    WORD32 value;
+    WORD32 i;
+    rplm_t *ps_rplm;
+    WORD32 num_bits_list_entry;
+
+    ps_rplm = &(ps_slice_hdr->s_rplm);
+
+    /* Calculate Ceil(Log2(num_poc_total_curr)) */
+    {
+        num_bits_list_entry = 32 - CLZ(num_poc_total_curr);
+        /* Check if num_poc_total_curr is power of 2 */
+        if(0 == (num_poc_total_curr & (num_poc_total_curr - 1)))
+        {
+            num_bits_list_entry--;
+        }
+    }
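+    /* Worked example: num_poc_total_curr = 5 gives 32 - CLZ(5) = 3 =
+     * Ceil(Log2(5)); for a power of two such as 8, 32 - CLZ(8) = 4 and the
+     * check above decrements it to 3 = Log2(8). */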
+
+    if(ps_slice_hdr->i1_slice_type  == PSLICE || ps_slice_hdr->i1_slice_type  == BSLICE)
+    {
+        BITS_PARSE("ref_pic_list_modification_flag_l0", value, ps_bitstrm, 1);
+        ps_rplm->i1_ref_pic_list_modification_flag_l0 = value;
+
+        if(ps_rplm->i1_ref_pic_list_modification_flag_l0)
+            for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+            {
+                BITS_PARSE("list_entry_l0", value, ps_bitstrm, num_bits_list_entry);
+                ps_rplm->i1_list_entry_l0[i] = value;
+
+                ps_rplm->i1_list_entry_l0[i] = CLIP3(ps_rplm->i1_list_entry_l0[i], 0, num_poc_total_curr - 1);
+            }
+    }
+
+    if(ps_slice_hdr->i1_slice_type  == BSLICE)
+    {
+        BITS_PARSE("ref_pic_list_modification_flag_l1", value, ps_bitstrm, 1);
+        ps_rplm->i1_ref_pic_list_modification_flag_l1 = value;
+
+        if(ps_rplm->i1_ref_pic_list_modification_flag_l1)
+            for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+            {
+                BITS_PARSE("list_entry_l1", value, ps_bitstrm, num_bits_list_entry);
+                ps_rplm->i1_list_entry_l1[i] = value;
+
+                ps_rplm->i1_list_entry_l1[i] = CLIP3(ps_rplm->i1_list_entry_l1[i], 0, num_poc_total_curr - 1);
+            }
+
+    }
+
+    return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses slice header (slice_header_syntax())
+*
+* @par Description:
+*  Parse slice header as per Section 7.3.8
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @param[in] ps_nal
+*  Pointer to NAL header structure
+*
+* @returns  Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+IHEVCD_ERROR_T ihevcd_parse_slice_header(codec_t *ps_codec,
+                                         nal_header_t *ps_nal)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 value;
+    WORD32 i;
+    WORD32 sps_id;
+
+    pps_t *ps_pps;
+    sps_t *ps_sps;
+    slice_header_t *ps_slice_hdr;
+    WORD32 disable_deblocking_filter_flag;
+    bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+    WORD32 idr_pic_flag;
+    WORD32 pps_id;
+    WORD32 first_slice_in_pic_flag;
+    WORD32 no_output_of_prior_pics_flag = 0;
+    WORD8 i1_nal_unit_type = ps_nal->i1_nal_unit_type;
+    WORD32 num_poc_total_curr = 0;
+    WORD32 slice_address;
+
+    if(ps_codec->i4_slice_error == 1)
+        return ret;
+
+#ifdef GPU_BUILD
+    //TODO GPU : Later define it for ARM only version as well
+    ps_codec->s_parse.ps_slice_hdr_base = ps_codec->aps_slice_hdr_base[ps_codec->u4_parsing_view];
+#endif
+    idr_pic_flag = (NAL_IDR_W_LP == i1_nal_unit_type) ||
+                    (NAL_IDR_N_LP == i1_nal_unit_type);
+
+
+    BITS_PARSE("first_slice_in_pic_flag", first_slice_in_pic_flag, ps_bitstrm, 1);
+    if((NAL_BLA_W_LP <= i1_nal_unit_type) &&
+       (NAL_RSV_RAP_VCL23          >= i1_nal_unit_type))
+    {
+        BITS_PARSE("no_output_of_prior_pics_flag", no_output_of_prior_pics_flag, ps_bitstrm, 1);
+    }
+    UEV_PARSE("pic_parameter_set_id", pps_id, ps_bitstrm);
+    pps_id = CLIP3(pps_id, 0, MAX_PPS_CNT - 2);
+
+    /* Get the current PPS structure */
+    ps_pps = ps_codec->s_parse.ps_pps_base + pps_id;
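+    /* Error concealment: if the PPS referred to by this slice has not been
+     * received, copy the first valid PPS into this id so that parsing can
+     * continue. */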
+    if(0 == ps_pps->i1_pps_valid)
+    {
+        pps_t *ps_pps_ref = ps_codec->ps_pps_base;
+        while(0 == ps_pps_ref->i1_pps_valid)
+            ps_pps_ref++;
+
+        if((ps_pps_ref - ps_codec->ps_pps_base >= MAX_PPS_CNT - 1))
+            return IHEVCD_INVALID_HEADER;
+
+        ihevcd_copy_pps(ps_codec, pps_id, ps_pps_ref->i1_pps_id);
+    }
+
+    /* Get SPS id for the current PPS */
+    sps_id = ps_pps->i1_sps_id;
+
+    /* Get the current SPS structure */
+    ps_sps = ps_codec->s_parse.ps_sps_base + sps_id;
+
+    /* When the current slice is the first in a pic,
+     *  check whether the previous frame is complete
+     *  If the previous frame is incomplete -
+     *  treat the remaining CTBs as skip */
+    if((0 != ps_codec->u4_pic_cnt || ps_codec->i4_pic_present) &&
+                    first_slice_in_pic_flag)
+    {
+        if(ps_codec->i4_pic_present)
+        {
+            slice_header_t *ps_slice_hdr_next;
+            ps_codec->i4_slice_error = 1;
+            ps_codec->s_parse.i4_cur_slice_idx--;
+            if(ps_codec->s_parse.i4_cur_slice_idx < 0)
+                ps_codec->s_parse.i4_cur_slice_idx = 0;
+
+            ps_slice_hdr_next = ps_codec->s_parse.ps_slice_hdr_base + ((ps_codec->s_parse.i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
+            ps_slice_hdr_next->i2_ctb_x = 0;
+            ps_slice_hdr_next->i2_ctb_y = ps_codec->s_parse.ps_sps->i2_pic_ht_in_ctb;
+            return ret;
+        }
+        else
+        {
+            ps_codec->i4_slice_error = 0;
+        }
+    }
+
+    if(first_slice_in_pic_flag)
+    {
+        ps_codec->s_parse.i4_cur_slice_idx = 0;
+    }
+    else
+    {
+        /* If the current slice is not the first slice in the pic,
+         * but the first one to be parsed, set the current slice indx to 1
+         * Treat the first slice to be missing and copy the current slice header
+         * to the first one */
+        if(0 == ps_codec->i4_pic_present)
+            ps_codec->s_parse.i4_cur_slice_idx = 1;
+    }
+
+    ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base + (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+
+#ifdef GPU_BUILD
+    /* OpenCL Ping Pong buffer */
+    // TODO GPU : Find out why this memcpy is required.
+    if(ps_codec->u4_parsing_view == 1)
+    {
+        //ps_slice_hdr += MAX_SLICE_HDR_CNT;
+        memcpy(ps_slice_hdr, ps_slice_hdr - MAX_SLICE_HDR_CNT, sizeof(slice_header_t));
+    }
+    else if(ps_codec->u4_parsing_view == 0)
+    {
+        if(1 != ps_codec->i4_num_cores)
+            memcpy(ps_slice_hdr, ps_slice_hdr + MAX_SLICE_HDR_CNT, sizeof(slice_header_t));
+    }
+#endif
+
+    if((ps_pps->i1_dependent_slice_enabled_flag) &&
+       (!first_slice_in_pic_flag))
+    {
+        BITS_PARSE("dependent_slice_flag", value, ps_bitstrm, 1);
+
+        /* If dependent slice, copy slice header from previous slice */
+        if(value && (ps_codec->s_parse.i4_cur_slice_idx > 0))
+        {
+            ihevcd_copy_slice_hdr(ps_codec,
+                                  (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1)),
+                                  ((ps_codec->s_parse.i4_cur_slice_idx - 1) & (MAX_SLICE_HDR_CNT - 1)));
+        }
+        ps_slice_hdr->i1_dependent_slice_flag = value;
+    }
+    else
+    {
+        ps_slice_hdr->i1_dependent_slice_flag = 0;
+    }
+    ps_slice_hdr->i1_nal_unit_type = i1_nal_unit_type;
+    ps_slice_hdr->i1_pps_id = pps_id;
+    ps_slice_hdr->i1_first_slice_in_pic_flag = first_slice_in_pic_flag;
+
+    ps_slice_hdr->i1_no_output_of_prior_pics_flag = 1;
+    if((NAL_BLA_W_LP <= i1_nal_unit_type) &&
+                    (NAL_RSV_RAP_VCL23          >= i1_nal_unit_type))
+    {
+        ps_slice_hdr->i1_no_output_of_prior_pics_flag = no_output_of_prior_pics_flag;
+    }
+    ps_slice_hdr->i1_pps_id = pps_id;
+
+    if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+    {
+        WORD32 num_bits;
+
+        /* Use CLZ to compute Ceil( Log2( PicSizeInCtbsY ) ) */
+        num_bits = 32 - CLZ(ps_sps->i4_pic_size_in_ctb - 1);
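+        /* Subtracting 1 before CLZ makes exact powers of two come out
+         * right: e.g. 64 CTBs need 32 - CLZ(63) = 6 bits, 65 need 7. */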
+        BITS_PARSE("slice_address", value, ps_bitstrm, num_bits);
+
+        slice_address = value;
+        /* If slice address is greater than or equal to the number of CTBs
+         * in the picture, ignore the slice */
+        if(value >= ps_sps->i4_pic_size_in_ctb)
+            return IHEVCD_IGNORE_SLICE;
+    }
+    else
+    {
+        slice_address = 0;
+    }
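+
+    /* Worked example (added note, not in the original source): the idiom
+     * 32 - CLZ(n - 1) above computes Ceil( Log2( n ) ) for n > 1. For a
+     * picture of 240 CTBs, 239 has 8 significant bits so CLZ(239) = 24 and
+     * num_bits = 8; slice_address is then read with 8 bits, since
+     * 2^8 = 256 >= 240 > 2^7. */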
+
+    if(!ps_slice_hdr->i1_dependent_slice_flag)
+    {
+        ps_slice_hdr->i1_pic_output_flag = 1;
+        ps_slice_hdr->i4_pic_order_cnt_lsb = 0;
+        ps_slice_hdr->i1_num_long_term_sps = 0;
+        ps_slice_hdr->i1_num_long_term_pics = 0;
+
+        for(i = 0; i < ps_pps->i1_num_extra_slice_header_bits; i++)
+        {
+            BITS_PARSE("slice_reserved_undetermined_flag[ i ]", value, ps_bitstrm, 1);
+            //slice_reserved_undetermined_flag[ i ]
+        }
+        UEV_PARSE("slice_type", value, ps_bitstrm);
+        ps_slice_hdr->i1_slice_type = value;
+
+        /* If the picture is IRAP, slice type must be equal to ISLICE */
+        if((ps_slice_hdr->i1_nal_unit_type >= NAL_BLA_W_LP) &&
+                        (ps_slice_hdr->i1_nal_unit_type <= NAL_RSV_RAP_VCL23))
+            ps_slice_hdr->i1_slice_type = ISLICE;
+
+        if((ps_slice_hdr->i1_slice_type < 0) ||
+                        (ps_slice_hdr->i1_slice_type > 2))
+            return IHEVCD_IGNORE_SLICE;
+
+        if(ps_pps->i1_output_flag_present_flag)
+        {
+            BITS_PARSE("pic_output_flag", value, ps_bitstrm, 1);
+            ps_slice_hdr->i1_pic_output_flag = value;
+        }
+        ps_slice_hdr->i1_colour_plane_id = 0;
+        if(1 == ps_sps->i1_separate_colour_plane_flag)
+        {
+            BITS_PARSE("colour_plane_id", value, ps_bitstrm, 2);
+            ps_slice_hdr->i1_colour_plane_id = value;
+        }
+        ps_slice_hdr->i1_slice_temporal_mvp_enable_flag = 0;
+
+        if(!idr_pic_flag)
+        {
+
+            WORD32 st_rps_idx;
+            WORD32 num_neg_pics;
+            WORD32 num_pos_pics;
+            WORD8 *pi1_used;
+
+            BITS_PARSE("pic_order_cnt_lsb", value, ps_bitstrm, ps_sps->i1_log2_max_pic_order_cnt_lsb);
+            //value = ihevcd_extend_sign_bit(value, ps_sps->i1_log2_max_pic_order_cnt_lsb);
+            ps_slice_hdr->i4_pic_order_cnt_lsb = value;
+
+            BITS_PARSE("short_term_ref_pic_set_sps_flag", value, ps_bitstrm, 1);
+            ps_slice_hdr->i1_short_term_ref_pic_set_sps_flag = value;
+
+            if(1 == ps_slice_hdr->i1_short_term_ref_pic_set_sps_flag)
+            {
+                WORD32 numbits;
+
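+                /* Use CLZ to compute Ceil( Log2( num_short_term_ref_pic_sets ) ) */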
+                numbits = 32 - CLZ(ps_sps->i1_num_short_term_ref_pic_sets - 1);
+                BITS_PARSE("short_term_ref_pic_set_idx", value, ps_bitstrm, numbits);
+                ps_slice_hdr->i1_short_term_ref_pic_set_idx = value;
+                ps_slice_hdr->i1_short_term_ref_pic_set_idx = CLIP3(ps_slice_hdr->i1_short_term_ref_pic_set_idx, 0, MAX_STREF_PICS_SPS - 1);
+
+                st_rps_idx = ps_slice_hdr->i1_short_term_ref_pic_set_idx;
+                num_neg_pics = ps_sps->as_stref_picset[st_rps_idx].i1_num_neg_pics;
+                num_pos_pics = ps_sps->as_stref_picset[st_rps_idx].i1_num_pos_pics;
+                pi1_used = ps_sps->as_stref_picset[st_rps_idx].ai1_used;
+            }
+            else
+            {
+                ihevcd_short_term_ref_pic_set(ps_bitstrm,
+                                              &ps_sps->as_stref_picset[0],
+                                              ps_sps->i1_num_short_term_ref_pic_sets,
+                                              ps_sps->i1_num_short_term_ref_pic_sets,
+                                              &ps_slice_hdr->s_stref_picset);
+
+                st_rps_idx = ps_sps->i1_num_short_term_ref_pic_sets;
+                num_neg_pics = ps_slice_hdr->s_stref_picset.i1_num_neg_pics;
+                num_pos_pics = ps_slice_hdr->s_stref_picset.i1_num_pos_pics;
+                pi1_used = ps_slice_hdr->s_stref_picset.ai1_used;
+            }
+
+            if(ps_sps->i1_long_term_ref_pics_present_flag)
+            {
+                if(ps_sps->i1_num_long_term_ref_pics_sps > 0)
+                {
+                    UEV_PARSE("num_long_term_sps", value, ps_bitstrm);
+                    ps_slice_hdr->i1_num_long_term_sps = value;
+
+                    ps_slice_hdr->i1_num_long_term_sps = CLIP3(ps_slice_hdr->i1_num_long_term_sps,
+                                                               0, MAX_DPB_SIZE - num_neg_pics - num_pos_pics);
+                }
+                UEV_PARSE("num_long_term_pics", value, ps_bitstrm);
+                ps_slice_hdr->i1_num_long_term_pics = value;
+                ps_slice_hdr->i1_num_long_term_pics = CLIP3(ps_slice_hdr->i1_num_long_term_pics,
+                                                            0, MAX_DPB_SIZE - num_neg_pics - num_pos_pics -
+                                                            ps_slice_hdr->i1_num_long_term_sps);
+
+                for(i = 0; i < (ps_slice_hdr->i1_num_long_term_sps +
+                                ps_slice_hdr->i1_num_long_term_pics); i++)
+                {
+                    if(i < ps_slice_hdr->i1_num_long_term_sps)
+                    {
+                        /* Use CLZ to compute Ceil( Log2( num_long_term_ref_pics_sps ) ) */
+                        WORD32 num_bits = 32 - CLZ(ps_sps->i1_num_long_term_ref_pics_sps);
+                        BITS_PARSE("lt_idx_sps[ i ]", value, ps_bitstrm, num_bits);
+                        ps_slice_hdr->ai4_poc_lsb_lt[i] = ps_sps->ai1_lt_ref_pic_poc_lsb_sps[value];
+                        ps_slice_hdr->ai1_used_by_curr_pic_lt_flag[i] = ps_sps->ai1_used_by_curr_pic_lt_sps_flag[value];
+
+                    }
+                    else
+                    {
+                        BITS_PARSE("poc_lsb_lt[ i ]", value, ps_bitstrm, ps_sps->i1_log2_max_pic_order_cnt_lsb);
+                        ps_slice_hdr->ai4_poc_lsb_lt[i] = value;
+
+                        BITS_PARSE("used_by_curr_pic_lt_flag[ i ]", value, ps_bitstrm, 1);
+                        ps_slice_hdr->ai1_used_by_curr_pic_lt_flag[i] = value;
+
+                    }
+                    BITS_PARSE("delta_poc_msb_present_flag[ i ]", value, ps_bitstrm, 1);
+                    ps_slice_hdr->ai1_delta_poc_msb_present_flag[i] = value;
+
+
+                    ps_slice_hdr->ai1_delta_poc_msb_cycle_lt[i] = 0;
+                    if(ps_slice_hdr->ai1_delta_poc_msb_present_flag[i])
+                    {
+
+                        UEV_PARSE("delta_poc_msb_cycle_lt[ i ]", value, ps_bitstrm);
+                        ps_slice_hdr->ai1_delta_poc_msb_cycle_lt[i] = value;
+                    }
+
+                    if((i != 0) && (i != ps_slice_hdr->i1_num_long_term_sps))
+                    {
+                        ps_slice_hdr->ai1_delta_poc_msb_cycle_lt[i] += ps_slice_hdr->ai1_delta_poc_msb_cycle_lt[i - 1];
+                    }
+
+                }
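+
+                /* Worked example (added note, not from the original source):
+                 * delta_poc_msb_cycle_lt is coded differentially within each of
+                 * the SPS and slice long-term ranges; if three slice-signalled
+                 * LT pics code cycle values 1, 1 and 2, the accumulation above
+                 * yields 1, 2 and 4, i.e. each entry becomes the absolute MSB
+                 * cycle distance from the current picture. */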
+            }
+
+            for(i = 0; i < num_neg_pics + num_pos_pics; i++)
+            {
+                if(pi1_used[i])
+                {
+                    num_poc_total_curr++;
+                }
+            }
+            for(i = 0; i < ps_slice_hdr->i1_num_long_term_sps + ps_slice_hdr->i1_num_long_term_pics; i++)
+            {
+                if(ps_slice_hdr->ai1_used_by_curr_pic_lt_flag[i])
+                {
+                    num_poc_total_curr++;
+                }
+            }
+
+
+            if(ps_sps->i1_sps_temporal_mvp_enable_flag)
+            {
+                BITS_PARSE("enable_temporal_mvp_flag", value, ps_bitstrm, 1);
+                ps_slice_hdr->i1_slice_temporal_mvp_enable_flag = value;
+            }
+
+        }
+        ps_slice_hdr->i1_slice_sao_luma_flag = 0;
+        ps_slice_hdr->i1_slice_sao_chroma_flag = 0;
+        if(ps_sps->i1_sample_adaptive_offset_enabled_flag)
+        {
+            BITS_PARSE("slice_sao_luma_flag", value, ps_bitstrm, 1);
+            ps_slice_hdr->i1_slice_sao_luma_flag = value;
+
+            BITS_PARSE("slice_sao_chroma_flag", value, ps_bitstrm, 1);
+            ps_slice_hdr->i1_slice_sao_chroma_flag = value;
+
+        }
+
+        ps_slice_hdr->i1_max_num_merge_cand = 1;
+        ps_slice_hdr->i1_cabac_init_flag = 0;
+
+        ps_slice_hdr->i1_num_ref_idx_l0_active = 0;
+        ps_slice_hdr->i1_num_ref_idx_l1_active = 0;
+        ps_slice_hdr->i1_slice_cb_qp_offset = 0;
+        ps_slice_hdr->i1_slice_cr_qp_offset = 0;
+        if((PSLICE == ps_slice_hdr->i1_slice_type) ||
+           (BSLICE == ps_slice_hdr->i1_slice_type))
+        {
+            BITS_PARSE("num_ref_idx_active_override_flag", value, ps_bitstrm, 1);
+            ps_slice_hdr->i1_num_ref_idx_active_override_flag = value;
+
+            if(ps_slice_hdr->i1_num_ref_idx_active_override_flag)
+            {
+                UEV_PARSE("num_ref_idx_l0_active_minus1", value, ps_bitstrm);
+                ps_slice_hdr->i1_num_ref_idx_l0_active = value + 1;
+
+                if(BSLICE == ps_slice_hdr->i1_slice_type)
+                {
+                    UEV_PARSE("num_ref_idx_l1_active_minus1", value, ps_bitstrm);
+                    ps_slice_hdr->i1_num_ref_idx_l1_active = value + 1;
+                }
+
+            }
+            else
+            {
+                ps_slice_hdr->i1_num_ref_idx_l0_active = ps_pps->i1_num_ref_idx_l0_default_active;
+
+                if(BSLICE == ps_slice_hdr->i1_slice_type)
+                {
+                    ps_slice_hdr->i1_num_ref_idx_l1_active = ps_pps->i1_num_ref_idx_l1_default_active;
+                }
+            }
+
+            ps_slice_hdr->i1_num_ref_idx_l0_active = CLIP3(ps_slice_hdr->i1_num_ref_idx_l0_active, 0, MAX_DPB_SIZE - 1);
+            ps_slice_hdr->i1_num_ref_idx_l1_active = CLIP3(ps_slice_hdr->i1_num_ref_idx_l1_active, 0, MAX_DPB_SIZE - 1);
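+
+            /* Note (added): CLIP3(x, min, max) clamps the parsed counts into
+             * [0, MAX_DPB_SIZE - 1], so a corrupted num_ref_idx_* value cannot
+             * index beyond the reference picture list arrays. */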
+
+            if(0 == num_poc_total_curr)
+                return IHEVCD_IGNORE_SLICE;
+            if((ps_pps->i1_lists_modification_present_flag) && (num_poc_total_curr > 1))
+            {
+                ihevcd_ref_pic_list_modification(ps_bitstrm,
+                                                 ps_slice_hdr, num_poc_total_curr);
+            }
+            else
+            {
+                ps_slice_hdr->s_rplm.i1_ref_pic_list_modification_flag_l0 = 0;
+                ps_slice_hdr->s_rplm.i1_ref_pic_list_modification_flag_l1 = 0;
+            }
+
+            if(BSLICE == ps_slice_hdr->i1_slice_type)
+            {
+                BITS_PARSE("mvd_l1_zero_flag", value, ps_bitstrm, 1);
+                ps_slice_hdr->i1_mvd_l1_zero_flag = value;
+            }
+
+            ps_slice_hdr->i1_cabac_init_flag = 0;
+            if(ps_pps->i1_cabac_init_present_flag)
+            {
+                BITS_PARSE("cabac_init_flag", value, ps_bitstrm, 1);
+                ps_slice_hdr->i1_cabac_init_flag = value;
+
+            }
+            ps_slice_hdr->i1_collocated_from_l0_flag = 1;
+            ps_slice_hdr->i1_collocated_ref_idx = 0;
+            if(ps_slice_hdr->i1_slice_temporal_mvp_enable_flag)
+            {
+                if(BSLICE == ps_slice_hdr->i1_slice_type)
+                {
+                    BITS_PARSE("collocated_from_l0_flag", value, ps_bitstrm, 1);
+                    ps_slice_hdr->i1_collocated_from_l0_flag = value;
+                }
+
+                if((ps_slice_hdr->i1_collocated_from_l0_flag  &&  (ps_slice_hdr->i1_num_ref_idx_l0_active > 1)) ||
+                   (!ps_slice_hdr->i1_collocated_from_l0_flag  && (ps_slice_hdr->i1_num_ref_idx_l1_active > 1)))
+                {
+                    UEV_PARSE("collocated_ref_idx", value, ps_bitstrm);
+                    ps_slice_hdr->i1_collocated_ref_idx = value;
+                }
+
+            }
+            ps_slice_hdr->i1_collocated_ref_idx = CLIP3(ps_slice_hdr->i1_collocated_ref_idx, 0, MAX_DPB_SIZE - 1);
+
+            if((ps_pps->i1_weighted_pred_flag  &&   (PSLICE == ps_slice_hdr->i1_slice_type)) ||
+               (ps_pps->i1_weighted_bipred_flag  &&  (BSLICE == ps_slice_hdr->i1_slice_type)))
+            {
+                ihevcd_parse_pred_wt_ofst(ps_bitstrm, ps_sps, ps_pps, ps_slice_hdr);
+            }
+            UEV_PARSE("five_minus_max_num_merge_cand", value, ps_bitstrm);
+            ps_slice_hdr->i1_max_num_merge_cand = 5 - value;
+
+        }
+        ps_slice_hdr->i1_max_num_merge_cand = CLIP3(ps_slice_hdr->i1_max_num_merge_cand, 1, 5);
+        SEV_PARSE("slice_qp_delta", value, ps_bitstrm);
+        ps_slice_hdr->i1_slice_qp_delta = value;
+
+        if(ps_pps->i1_pic_slice_level_chroma_qp_offsets_present_flag)
+        {
+            SEV_PARSE("slice_cb_qp_offset", value, ps_bitstrm);
+            ps_slice_hdr->i1_slice_cb_qp_offset = value;
+
+            SEV_PARSE("slice_cr_qp_offset", value, ps_bitstrm);
+            ps_slice_hdr->i1_slice_cr_qp_offset = value;
+
+        }
+        ps_slice_hdr->i1_deblocking_filter_override_flag = 0;
+        ps_slice_hdr->i1_slice_disable_deblocking_filter_flag  = ps_pps->i1_pic_disable_deblocking_filter_flag;
+        ps_slice_hdr->i1_beta_offset_div2 = ps_pps->i1_beta_offset_div2;
+        ps_slice_hdr->i1_tc_offset_div2 = ps_pps->i1_tc_offset_div2;
+
+        disable_deblocking_filter_flag = ps_pps->i1_pic_disable_deblocking_filter_flag;
+
+        if(ps_pps->i1_deblocking_filter_control_present_flag)
+        {
+
+            if(ps_pps->i1_deblocking_filter_override_enabled_flag)
+            {
+                BITS_PARSE("deblocking_filter_override_flag", value, ps_bitstrm, 1);
+                ps_slice_hdr->i1_deblocking_filter_override_flag = value;
+            }
+
+            if(ps_slice_hdr->i1_deblocking_filter_override_flag)
+            {
+                BITS_PARSE("slice_disable_deblocking_filter_flag", value, ps_bitstrm, 1);
+                ps_slice_hdr->i1_slice_disable_deblocking_filter_flag = value;
+                disable_deblocking_filter_flag = ps_slice_hdr->i1_slice_disable_deblocking_filter_flag;
+
+                if(!ps_slice_hdr->i1_slice_disable_deblocking_filter_flag)
+                {
+                    SEV_PARSE("beta_offset_div2", value, ps_bitstrm);
+                    ps_slice_hdr->i1_beta_offset_div2 = value;
+
+                    SEV_PARSE("tc_offset_div2", value, ps_bitstrm);
+                    ps_slice_hdr->i1_tc_offset_div2 = value;
+
+                }
+            }
+        }
+
+        ps_slice_hdr->i1_slice_loop_filter_across_slices_enabled_flag = ps_pps->i1_loop_filter_across_slices_enabled_flag;
+        if(ps_pps->i1_loop_filter_across_slices_enabled_flag  &&
+                        (ps_slice_hdr->i1_slice_sao_luma_flag  ||  ps_slice_hdr->i1_slice_sao_chroma_flag  || !disable_deblocking_filter_flag))
+        {
+            BITS_PARSE("slice_loop_filter_across_slices_enabled_flag", value, ps_bitstrm, 1);
+            ps_slice_hdr->i1_slice_loop_filter_across_slices_enabled_flag = value;
+        }
+
+    }
+
+    /* Check sanity of slice */
+    if((!first_slice_in_pic_flag) &&
+                    (ps_codec->i4_pic_present))
+    {
+#ifdef GPU_BUILD
+        //TODO GPU : Later define it for ARM only version as well
+        slice_header_t *ps_slice_hdr_base = ps_codec->aps_slice_hdr_base[ps_codec->u4_parsing_view];
+#else
+        slice_header_t *ps_slice_hdr_base = ps_codec->ps_slice_hdr_base;
+#endif
+
+#if 0
+        if((ps_slice_hdr_base->i1_pps_id != ps_slice_hdr->i1_pps_id) ||
+                        (ps_slice_hdr_base->i1_pic_output_flag != ps_slice_hdr->i1_pic_output_flag) ||
+                        (ps_slice_hdr_base->i1_no_output_of_prior_pics_flag != ps_slice_hdr->i1_no_output_of_prior_pics_flag) ||
+                        (ps_slice_hdr_base->i4_pic_order_cnt_lsb != ps_slice_hdr->i4_pic_order_cnt_lsb) ||
+                        (ps_slice_hdr_base->i1_short_term_ref_pic_set_sps_flag != ps_slice_hdr->i1_short_term_ref_pic_set_sps_flag) ||
+                        (ps_slice_hdr_base->i1_short_term_ref_pic_set_idx != ps_slice_hdr->i1_short_term_ref_pic_set_idx) ||
+                        (ps_slice_hdr_base->i1_num_long_term_sps != ps_slice_hdr->i1_num_long_term_sps) ||
+                        (ps_slice_hdr_base->i1_num_long_term_pics != ps_slice_hdr->i1_num_long_term_pics) ||
+                        (ps_slice_hdr_base->i1_slice_temporal_mvp_enable_flag != ps_slice_hdr->i1_slice_temporal_mvp_enable_flag))
+        {
+            return IHEVCD_IGNORE_SLICE;
+        }
+#else
+
+        /* According to the standard, the above conditions must be satisfied.
+         * But for error resilience, only the following conditions are checked */
+        if((ps_slice_hdr_base->i1_pps_id != ps_slice_hdr->i1_pps_id) ||
+                        (ps_slice_hdr_base->i4_pic_order_cnt_lsb != ps_slice_hdr->i4_pic_order_cnt_lsb))
+        {
+            return IHEVCD_IGNORE_SLICE;
+        }
+#endif
+
+    }
+
+
+    if(0 == ps_codec->i4_pic_present)
+    {
+        ps_slice_hdr->i4_abs_pic_order_cnt = ihevcd_calc_poc(ps_codec, ps_nal, ps_sps->i1_log2_max_pic_order_cnt_lsb, ps_slice_hdr->i4_pic_order_cnt_lsb);
+    }
+    else
+    {
+        ps_slice_hdr->i4_abs_pic_order_cnt = ps_codec->s_parse.i4_abs_pic_order_cnt;
+    }
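+
+    /* Worked example (added note, not from the original source): with
+     * log2_max_pic_order_cnt_lsb = 4 (MaxPocLsb = 16), a previous POC of 14
+     * followed by pic_order_cnt_lsb = 1 implies an MSB wrap-around, and
+     * ihevcd_calc_poc() returns an absolute POC of 16 + 1 = 17, following the
+     * LSB/MSB reconstruction in the standard's POC decoding process. */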
+
+
+    if(!first_slice_in_pic_flag)
+    {
+        /* Check if the current slice belongs to the same pic (Pic being parsed) */
+        if(ps_codec->s_parse.i4_abs_pic_order_cnt == ps_slice_hdr->i4_abs_pic_order_cnt)
+        {
+
+            /* If the next CTB's index is less than the slice address,
+             * the previous slice is incomplete.
+             * Indicate slice error, and treat the remaining CTBs as skip */
+            if(slice_address > ps_codec->s_parse.i4_next_ctb_indx)
+            {
+                if(ps_codec->i4_pic_present)
+                {
+                    ps_codec->i4_slice_error = 1;
+                    ps_codec->s_parse.i4_cur_slice_idx--;
+                    if(ps_codec->s_parse.i4_cur_slice_idx < 0)
+                        ps_codec->s_parse.i4_cur_slice_idx = 0;
+
+                    return ret;
+                }
+                else
+                {
+                    return IHEVCD_IGNORE_SLICE;
+                }
+            }
+            /* If the slice address is less than the next CTB's index,
+             * extra CTBs have been decoded in the previous slice.
+             * Ignore the current slice and treat it as incomplete */
+            else if(slice_address < ps_codec->s_parse.i4_next_ctb_indx)
+            {
+                return IHEVCD_IGNORE_SLICE;
+            }
+            else
+            {
+                ps_codec->i4_slice_error = 0;
+            }
+        }
+
+        /* The current slice does not belong to the pic that is being parsed */
+        else
+        {
+            /* The previous pic is incomplete.
+             * Treat the remaining CTBs as skip */
+            if(ps_codec->i4_pic_present)
+            {
+                slice_header_t *ps_slice_hdr_next;
+                ps_codec->i4_slice_error = 1;
+                ps_codec->s_parse.i4_cur_slice_idx--;
+                if(ps_codec->s_parse.i4_cur_slice_idx < 0)
+                    ps_codec->s_parse.i4_cur_slice_idx = 0;
+
+                ps_slice_hdr_next = ps_codec->s_parse.ps_slice_hdr_base + ((ps_codec->s_parse.i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
+                ps_slice_hdr_next->i2_ctb_x = 0;
+                ps_slice_hdr_next->i2_ctb_y = ps_codec->s_parse.ps_sps->i2_pic_ht_in_ctb;
+                return ret;
+            }
+
+            /* If the previous pic is complete,
+             * ignore the current slice if it is dependent;
+             * otherwise, update the parse context's POC */
+            else
+            {
+                if(ps_slice_hdr->i1_dependent_slice_flag)
+                    return IHEVCD_IGNORE_SLICE;
+
+                ps_codec->s_parse.i4_abs_pic_order_cnt = ps_slice_hdr->i4_abs_pic_order_cnt;
+            }
+        }
+    }
+
+    /* If the slice is the first slice in the pic, update the parse context's POC */
+    else
+    {
+        /* If the first slice is repeated, ignore the second occurrence.
+         * If any other slice is repeated, the CTB addr will be greater than the slice addr,
+         * and hence the second occurrence is ignored */
+        if(ps_codec->s_parse.i4_abs_pic_order_cnt == ps_slice_hdr->i4_abs_pic_order_cnt)
+            return IHEVCD_IGNORE_SLICE;
+
+        ps_codec->s_parse.i4_abs_pic_order_cnt = ps_slice_hdr->i4_abs_pic_order_cnt;
+    }
+
+    // printf("POC: %d\n", ps_slice_hdr->i4_abs_pic_order_cnt);
+    // AEV_TRACE("POC", ps_slice_hdr->i4_abs_pic_order_cnt, 0);
+    ps_slice_hdr->i4_num_entry_point_offsets = 0;
+    if((ps_pps->i1_tiles_enabled_flag) ||
+       (ps_pps->i1_entropy_coding_sync_enabled_flag))
+    {
+        UEV_PARSE("num_entry_point_offsets", value, ps_bitstrm);
+        ps_slice_hdr->i4_num_entry_point_offsets = value;
+
+        {
+            WORD32 max_num_entry_point_offsets;
+            if((ps_pps->i1_tiles_enabled_flag) &&
+                            (ps_pps->i1_entropy_coding_sync_enabled_flag))
+            {
+                max_num_entry_point_offsets = ps_pps->i1_num_tile_columns * (ps_sps->i2_pic_ht_in_ctb - 1);
+            }
+            else if(ps_pps->i1_tiles_enabled_flag)
+            {
+                max_num_entry_point_offsets = ps_pps->i1_num_tile_columns * ps_pps->i1_num_tile_rows;
+            }
+            else
+            {
+                max_num_entry_point_offsets = (ps_sps->i2_pic_ht_in_ctb - 1);
+            }
+
+            ps_slice_hdr->i4_num_entry_point_offsets = CLIP3(ps_slice_hdr->i4_num_entry_point_offsets,
+                                                             0, max_num_entry_point_offsets);
+        }
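+
+        /* Worked example (added note): with both tiles and entropy coding
+         * sync enabled, a stream with 4 tile columns and 17 CTB rows allows
+         * at most 4 * (17 - 1) = 64 entry point offsets; larger parsed values
+         * are clipped above so the offset loop below stays bounded. */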
+
+        if(ps_slice_hdr->i4_num_entry_point_offsets > 0)
+        {
+            UEV_PARSE("offset_len_minus1", value, ps_bitstrm);
+            ps_slice_hdr->i1_offset_len = value + 1;
+
+            for(i = 0; i < ps_slice_hdr->i4_num_entry_point_offsets; i++)
+            {
+                BITS_PARSE("entry_point_offset", value, ps_bitstrm, ps_slice_hdr->i1_offset_len);
+
+                /* TODO: pu4_entry_point_offset needs to be initialized */
+                //ps_slice_hdr->pu4_entry_point_offset[i] = value;
+            }
+
+        }
+    }
+
+    if(ps_pps->i1_slice_header_extension_present_flag)
+    {
+        UEV_PARSE("slice_header_extension_length", value, ps_bitstrm);
+        ps_slice_hdr->i2_slice_header_extension_length = value;
+
+
+        for(i = 0; i < ps_slice_hdr->i2_slice_header_extension_length; i++)
+        {
+            BITS_PARSE("slice_header_extension_data_byte", value, ps_bitstrm, 8);
+        }
+
+    }
+
+    ihevcd_bits_flush_to_byte_boundary(ps_bitstrm);
+
+    {
+        dpb_mgr_t *ps_dpb_mgr = (dpb_mgr_t *)ps_codec->pv_dpb_mgr;
+        WORD32 r_idx;
+
+        if((NAL_IDR_W_LP == ps_slice_hdr->i1_nal_unit_type) ||
+                        (NAL_IDR_N_LP == ps_slice_hdr->i1_nal_unit_type)  ||
+                        (NAL_BLA_N_LP == ps_slice_hdr->i1_nal_unit_type)  ||
+                        (NAL_BLA_W_DLP == ps_slice_hdr->i1_nal_unit_type) ||
+                        (NAL_BLA_W_LP == ps_slice_hdr->i1_nal_unit_type)  ||
+                        (0 == ps_codec->u4_pic_cnt))
+        {
+#ifdef GPU_BUILD
+            /* TODO GPU : Following fix not tested. */
+            for(i = 0; i < MAX_DPB_BUFS; i++)
+            {
+                if(ps_dpb_mgr->as_dpb_info[i].ps_pic_buf)
+                    ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref = UNUSED_FOR_REF;
+            }
+
+#else
+            for(i = 0; i < MAX_DPB_BUFS; i++)
+            {
+                if(ps_dpb_mgr->as_dpb_info[i].ps_pic_buf)
+                {
+                    pic_buf_t *ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+                    mv_buf_t *ps_mv_buf;
+                    WORD32 mv_buf_idx;
+
+                    /* Long term index is set to MAX_DPB_BUFS to ensure it is not added as LT */
+                    ihevc_dpb_mgr_del_ref((dpb_mgr_t *)ps_codec->pv_dpb_mgr, (buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_pic_buf->i4_abs_poc);
+                    /* Find buffer id of the MV bank corresponding to the buffer being freed (buffer with POC of i4_abs_poc) */
+                    /* Use a separate index here; reusing 'i' would clobber the outer DPB loop counter */
+                    ps_mv_buf = (mv_buf_t *)ps_codec->ps_mv_buf;
+                    for(mv_buf_idx = 0; mv_buf_idx < BUF_MGR_MAX_CNT; mv_buf_idx++)
+                    {
+                        if(ps_mv_buf->i4_abs_poc == ps_pic_buf->i4_abs_poc)
+                        {
+                            ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, mv_buf_idx, BUF_MGR_REF);
+                            break;
+                        }
+                        ps_mv_buf++;
+                    }
+
+                }
+
+            }
+
+            /* Initialize the reference lists to NULL.
+             * This handles the case where the first pic is not an IDR,
+             * but its reference list is never created because the pic count
+             * is zero, which would otherwise leave the list uninitialized */
+            for(r_idx = 0; r_idx < MAX_DPB_SIZE; r_idx++)
+            {
+                ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf = NULL;
+                ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = NULL;
+
+                ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf = NULL;
+                ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = NULL;
+            }
+
+#endif
+        }
+        else
+        {
+            WORD32 ret;
+            ret = ihevcd_ref_list(ps_codec, ps_pps, ps_sps, ps_slice_hdr);
+
+            if(IHEVCD_REF_PIC_NOT_FOUND == ret)
+                return IHEVCD_IGNORE_SLICE;
+        }
+
+    }
+
+    /* Fill the remaining entries of the reference lists with the nearest POC.
+     * This is done to handle cases where the reference index itself is corrupted */
+    if(ps_codec->i4_pic_present)
+    {
+        pic_buf_t *ps_pic_buf_ref;
+        mv_buf_t *ps_mv_buf_ref;
+        WORD32 r_idx;
+        dpb_mgr_t *ps_dpb_mgr = (dpb_mgr_t *)ps_codec->pv_dpb_mgr;
+        buf_mgr_t *ps_mv_buf_mgr = (buf_mgr_t *)ps_codec->pv_mv_buf_mgr;
+
+        ps_pic_buf_ref = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ps_slice_hdr->i4_abs_pic_order_cnt);
+        if(NULL == ps_pic_buf_ref)
+        {
+            ps_pic_buf_ref = ps_codec->as_process[0].ps_cur_pic;
+            ps_mv_buf_ref = ps_codec->s_parse.ps_cur_mv_buf;
+        }
+        else
+        {
+            ps_mv_buf_ref = ihevcd_mv_mgr_get_poc(ps_mv_buf_mgr, ps_pic_buf_ref->i4_abs_poc);
+        }
+
+        for(r_idx = 0; r_idx < ps_slice_hdr->i1_num_ref_idx_l0_active; r_idx++)
+        {
+            if(NULL == ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf)
+            {
+                ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+                ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+            }
+        }
+
+        for(r_idx = ps_slice_hdr->i1_num_ref_idx_l0_active; r_idx < MAX_DPB_SIZE; r_idx++)
+        {
+            ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+            ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+        }
+
+        for(r_idx = 0; r_idx < ps_slice_hdr->i1_num_ref_idx_l1_active; r_idx++)
+        {
+            if(NULL == ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf)
+            {
+                ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+                ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+            }
+        }
+
+        for(r_idx = ps_slice_hdr->i1_num_ref_idx_l1_active; r_idx < MAX_DPB_SIZE; r_idx++)
+        {
+            ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+            ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+        }
+    }
+
+    /* Update slice address in the header */
+    if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+    {
+        ps_slice_hdr->i2_ctb_x = slice_address % ps_sps->i2_pic_wd_in_ctb;
+        ps_slice_hdr->i2_ctb_y = slice_address / ps_sps->i2_pic_wd_in_ctb;
+
+        if(!ps_slice_hdr->i1_dependent_slice_flag)
+        {
+            ps_slice_hdr->i2_independent_ctb_x = ps_slice_hdr->i2_ctb_x;
+            ps_slice_hdr->i2_independent_ctb_y = ps_slice_hdr->i2_ctb_y;
+        }
+    }
+    else
+    {
+        ps_slice_hdr->i2_ctb_x = 0;
+        ps_slice_hdr->i2_ctb_y = 0;
+
+        ps_slice_hdr->i2_independent_ctb_x = 0;
+        ps_slice_hdr->i2_independent_ctb_y = 0;
+    }
+
+    /* If the first slice in the pic is missing, copy the current slice header to
+     * the first slice's header */
+    if((!first_slice_in_pic_flag) &&
+                    (0 == ps_codec->i4_pic_present))
+    {
+        slice_header_t *ps_slice_hdr_prev = ps_codec->s_parse.ps_slice_hdr_base;
+        ihevcd_copy_slice_hdr(ps_codec, 0, (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1)));
+
+        ps_codec->i4_slice_error = 1;
+
+        ps_slice_hdr_prev->i2_ctb_x = 0;
+        ps_slice_hdr_prev->i2_ctb_y = 0;
+
+        ps_codec->s_parse.i4_ctb_x = 0;
+        ps_codec->s_parse.i4_ctb_y = 0;
+
+        ps_codec->s_parse.i4_cur_slice_idx = 0;
+
+        if((ps_slice_hdr->i2_ctb_x == 0) &&
+                        (ps_slice_hdr->i2_ctb_y == 0))
+        {
+            ps_slice_hdr->i2_ctb_x++;
+        }
+    }
+
+    {
+        /* If skip B is enabled,
+         * ignore pictures that are non-reference.
+         * TODO: (i1_nal_unit_type < NAL_BLA_W_LP) && (i1_nal_unit_type % 2 == 0) only says it is a
+         * sub-layer non-reference slice. May need to find a way to detect actual non-reference pictures */
+
+        if((i1_nal_unit_type < NAL_BLA_W_LP) &&
+                        (i1_nal_unit_type % 2 == 0))
+        {
+            if(IVD_SKIP_B == ps_codec->e_pic_skip_mode)
+                return IHEVCD_IGNORE_SLICE;
+        }
+
+        /* If skip PB is enabled,
+         * decode only I slices */
+        if((IVD_SKIP_PB == ps_codec->e_pic_skip_mode) &&
+                        (ISLICE != ps_slice_hdr->i1_slice_type))
+        {
+            return IHEVCD_IGNORE_SLICE;
+        }
+    }
+
+    return ret;
+}
diff --git a/decoder/ihevcd_parse_slice_header.h b/decoder/ihevcd_parse_slice_header.h
new file mode 100644
index 0000000..6f085b7
--- /dev/null
+++ b/decoder/ihevcd_parse_slice_header.h
@@ -0,0 +1,53 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_parse_slice_header.h
+*
+* @brief
+*  Parsing of slice header
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PARSE_SLICE_HEADER_H_
+#define _IHEVCD_PARSE_SLICE_HEADER_H_
+
+IHEVCD_ERROR_T ihevcd_short_term_ref_pic_set(bitstrm_t *ps_bitstrm,
+                                             stref_picset_t *ps_stref_picset_base,
+                                             WORD32 num_short_term_ref_pic_sets,
+                                             WORD32 idx,
+                                             stref_picset_t *ps_stref_picset);
+
+WORD32 ihevcd_parse_pred_wt_ofst(bitstrm_t *ps_bitstrm,
+                                 sps_t *ps_sps,
+                                 pps_t *ps_pps,
+                                 slice_header_t *ps_slice_hdr);
+
+WORD32 ihevcd_calc_poc(codec_t *ps_codec, nal_header_t *ps_nal, WORD8 i1_log2_max_poc_lsb, WORD32 i2_poc_lsb);
+
+
+
+#endif /* _IHEVCD_PARSE_SLICE_HEADER_H_ */
diff --git a/decoder/ihevcd_process_slice.c b/decoder/ihevcd_process_slice.c
new file mode 100644
index 0000000..83aed05
--- /dev/null
+++ b/decoder/ihevcd_process_slice.c
@@ -0,0 +1,1738 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_process_slice.c
+ *
+ * @brief
+ *  Contains functions for processing slice data
+ *
+ * @author
+ *  Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_padding.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_chroma_iquant_itrans_recon.h"
+#include "ihevc_recon.h"
+#include "ihevc_chroma_recon.h"
+#include "ihevc_iquant_recon.h"
+#include "ihevc_chroma_iquant_recon.h"
+#include "ihevc_intra_pred.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_quant_tables.h"
+#include "ihevcd_common_tables.h"
+
+#include "ihevcd_profile.h"
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_get_mv.h"
+#include "ihevcd_inter_pred.h"
+#include "ihevcd_iquant_itrans_recon_ctb.h"
+#include "ihevcd_boundary_strength.h"
+#include "ihevcd_deblk.h"
+#include "ihevcd_fmt_conv.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+#include "ihevcd_sao.h"
+
+IHEVCD_ERROR_T ihevcd_fmt_conv(codec_t *ps_codec,
+                               process_ctxt_t *ps_proc,
+                               UWORD8 *pu1_y_dst,
+                               UWORD8 *pu1_u_dst,
+                               UWORD8 *pu1_v_dst,
+                               WORD32 cur_row,
+                               WORD32 num_rows);
+
+typedef enum
+{
+    PROC_ALL,
+    PROC_INTER_PRED,
+    PROC_RECON,
+    PROC_DEBLK,
+    PROC_SAO
+}proc_type_t;
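+
+/* Note (added for clarity): each CTB owns one status byte in the process map,
+ * and each proc_type_t stage is tracked in bit (1 << proc_type), e.g.
+ * PROC_RECON is signalled via bit 2. ihevcd_proc_map_check() below spins until
+ * the dependency CTBs have the stage bit set, and ihevcd_proc_map_update()
+ * ORs the bit in once the current CTBs are done. */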
+
+void ihevcd_proc_map_check(process_ctxt_t *ps_proc, proc_type_t proc_type, WORD32 nctb)
+{
+    tile_t *ps_tile = ps_proc->ps_tile;
+    sps_t *ps_sps = ps_proc->ps_sps;
+    pps_t *ps_pps = ps_proc->ps_pps;
+    codec_t *ps_codec = ps_proc->ps_codec;
+    WORD32 idx;
+    WORD32 nop_cnt;
+    WORD32 bit_pos = proc_type;
+    WORD32 bit_mask = (1 << bit_pos);
+
+    if(ps_proc->i4_check_proc_status)
+    {
+        nop_cnt = PROC_NOP_CNT;
+        while(1)
+        {
+            volatile UWORD8 *pu1_buf;
+            volatile WORD32 status;
+            status = 1;
+            /* Check if all dependencies for the next nCTBs are met */
+            {
+                WORD32 x_pos;
+
+                {
+                    /* Check if the top right of next nCTBs are processed */
+                    if(ps_proc->i4_ctb_y > 0)
+                    {
+                        x_pos = (ps_proc->i4_ctb_tile_x + nctb);
+                        idx = MIN(x_pos, (ps_tile->u2_wd - 1));
+
+                        /* Check if top-right CTB for the last CTB in nCTB is within the tile */
+                        {
+                            idx += ps_tile->u1_pos_x;
+                            idx += ((ps_proc->i4_ctb_y - 1)
+                                            * ps_sps->i2_pic_wd_in_ctb);
+#ifdef GPU_BUILD
+                            //TODO GPU : Later define it for ARM only version as well
+                            pu1_buf = (ps_proc->pu1_proc_map + idx);
+#else
+                            pu1_buf = (ps_codec->pu1_proc_map + idx);
+#endif
+                            status = *pu1_buf & bit_mask;
+                        }
+                    }
+                }
+
+                /* If tiles are enabled, then test left and top-left as well */
+                ps_pps = ps_proc->ps_pps;
+                if(ps_pps->i1_tiles_enabled_flag)
+                {
+                    /*Check if left ctb is processed*/
+                    if((ps_proc->i4_ctb_x > 0) && ((0 != status)))
+                    {
+                        x_pos   = ps_tile->u1_pos_x + ps_proc->i4_ctb_tile_x - 1;
+                        idx     = x_pos + (ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+#ifdef GPU_BUILD
+                        //TODO GPU : Later define it for ARM only version as well
+                        pu1_buf = (ps_proc->pu1_proc_map + idx);
+#else
+                        pu1_buf = (ps_codec->pu1_proc_map + idx);
+#endif
+                        status  = *pu1_buf & bit_mask;
+                    }
+
+                    /*Check if top left ctb is processed*/
+                    if((ps_proc->i4_ctb_x > 0) && (0 != status) && (ps_proc->i4_ctb_y > 0))
+                    {
+                        x_pos   = ps_tile->u1_pos_x + ps_proc->i4_ctb_tile_x - 1;
+                        idx     = x_pos + ((ps_proc->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb);
+#ifdef GPU_BUILD
+                        //TODO GPU : Later define it for ARM only version as well
+                        pu1_buf = (ps_proc->pu1_proc_map + idx);
+#else
+                        pu1_buf = (ps_codec->pu1_proc_map + idx);
+#endif
+                        status  = *pu1_buf & bit_mask;
+                    }
+                }
+            }
+
+            if(status)
+                break;
+
+            /* If dependencies are not met, then wait for a few cycles.
+             * If they are still not met after a few iterations, yield the thread
+             */
+            if(nop_cnt > 0)
+            {
+                NOP(128);
+                nop_cnt -= 128;
+            }
+            else
+            {
+                nop_cnt = PROC_NOP_CNT;
+                ithread_yield();
+                //NOP(128 * 16);
+            }
+        }
+    }
+}
+
+void ihevcd_proc_map_update(process_ctxt_t *ps_proc, proc_type_t proc_type, WORD32 nctb)
+{
+    codec_t *ps_codec = ps_proc->ps_codec;
+    WORD32 i, idx;
+    WORD32 bit_pos = proc_type;
+    WORD32 bit_mask = (1 << bit_pos);
+
+    /* Update the current CTBs processing status */
+    if(ps_proc->i4_check_proc_status)
+    {
+        for(i = 0; i < nctb; i++)
+        {
+            sps_t *ps_sps = ps_proc->ps_sps;
+            UWORD8 *pu1_buf;
+            idx = (ps_proc->i4_ctb_x + i);
+            idx += ((ps_proc->i4_ctb_y) * ps_sps->i2_pic_wd_in_ctb);
+#ifdef GPU_BUILD
+            //TODO GPU : Later define it for ARM only version as well
+            pu1_buf = (ps_proc->pu1_proc_map + idx);
+#else
+            pu1_buf = (ps_codec->pu1_proc_map + idx);
+#endif
+            *pu1_buf = *pu1_buf | bit_mask;
+        }
+    }
+}
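+
+/* Usage sketch (added, hypothetical call site, not from the original source):
+ * a stage worker would bracket its per-CTB work as
+ *
+ *     ihevcd_proc_map_check(ps_proc, PROC_DEBLK, nctb);   // wait on neighbours
+ *     ... deblock nctb CTBs ...
+ *     ihevcd_proc_map_update(ps_proc, PROC_DEBLK, nctb);  // publish completion
+ *
+ * so that threads working on adjacent CTB rows observe completion in order. */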
+
+
+void ihevcd_slice_hdr_update(process_ctxt_t *ps_proc)
+{
+
+    /* Slice x and y are initialized in proc_init. But the slice x and y counts
+     * must also be initialized here, since a new slice can begin in the middle
+     * of a row and proc_init is invoked only at the beginning of each row */
+    if(!((ps_proc->i4_ctb_x == 0) && (ps_proc->i4_ctb_y == 0)))
+    {
+#ifdef GPU_BUILD
+        //TODO GPU : Later define it for ARM only version as well
+        slice_header_t *ps_slice_hdr_next = ps_proc->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
+#else
+        slice_header_t *ps_slice_hdr_next = ps_proc->ps_codec->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
+#endif
+
+        if((ps_slice_hdr_next->i2_ctb_x == ps_proc->i4_ctb_x)
+                        && (ps_slice_hdr_next->i2_ctb_y == ps_proc->i4_ctb_y))
+        {
+            if(0 == ps_slice_hdr_next->i1_dependent_slice_flag)
+            {
+                ps_proc->i4_ctb_slice_x = 0;
+                ps_proc->i4_ctb_slice_y = 0;
+            }
+
+            ps_proc->i4_cur_slice_idx++;
+            ps_proc->ps_slice_hdr = ps_slice_hdr_next;
+        }
+
+    }
+}
+
+void ihevcd_ctb_pos_update(process_ctxt_t *ps_proc, WORD32 nctb)
+{
+    WORD32 tile_start_ctb_idx, slice_start_ctb_idx;
+    slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
+    tile_t *ps_tile = ps_proc->ps_tile;
+    sps_t *ps_sps = ps_proc->ps_sps;
+
+    /* Update x and y positions */
+    ps_proc->i4_ctb_tile_x += nctb;
+    ps_proc->i4_ctb_x += nctb;
+
+    ps_proc->i4_ctb_slice_x += nctb;
+    /* If tiles are enabled, then handle the tile & slice counters differently */
+    if(ps_proc->ps_pps->i1_tiles_enabled_flag)
+    {
+        /* Update slice counters*/
+        slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x + (ps_slice_hdr->i2_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+        tile_start_ctb_idx = ps_tile->u1_pos_x + (ps_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+        /*
+         * There are 2 cases where the slice counters must be handled differently.
+         * 1 - Multiple tiles span a single slice (or one of many slices).
+         * 2 - Multiple slices span a single tile (or one of many tiles).
+         */
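+
+        /* Illustrative example (added note): Case 1 occurs when one slice
+         * covers two side-by-side tiles; the slice starts before the current
+         * tile, so crossing the tile's right edge must advance the
+         * slice-relative row counter. Case 2 is the converse: several slices
+         * within one tile, where the tile width alone decides when the row
+         * counter advances. */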
+
+        /*Case 1 */
+        if(slice_start_ctb_idx < tile_start_ctb_idx)
+        {
+            /*End of tile row*/
+            if(ps_proc->i4_ctb_x > ps_slice_hdr->i2_ctb_x)
+            {
+                if(ps_proc->i4_ctb_slice_x >= (ps_tile->u2_wd + ps_tile->u1_pos_x))
+                {
+                    ps_proc->i4_ctb_slice_y++;
+                    ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_slice_x
+                                    - ps_tile->u2_wd;
+                }
+            }
+            else
+            {
+                WORD32 temp_stride = (ps_sps->i2_pic_wd_in_ctb - ps_slice_hdr->i2_ctb_x);
+                if(ps_proc->i4_ctb_slice_x >= (temp_stride + ps_tile->u2_wd + ps_tile->u1_pos_x))
+                {
+                    ps_proc->i4_ctb_slice_y++;
+                    ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_slice_x
+                                    - ps_tile->u2_wd;
+                }
+            }
+        }
+        /*Case 2*/
+        else if(ps_proc->i4_ctb_slice_x >= (ps_tile->u2_wd))
+        {
+            /*End of tile row*/
+            ps_proc->i4_ctb_slice_y++;
+            ps_proc->i4_ctb_slice_x = 0;
+        }
+    }
+    else
+    {
+        if(ps_proc->i4_ctb_slice_x >= ps_tile->u2_wd)
+        {
+            ps_proc->i4_ctb_slice_y++;
+            ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_slice_x
+                            - ps_tile->u2_wd;
+        }
+    }
+}
+
+void ihevcd_ctb_avail_update(process_ctxt_t *ps_proc)
+{
+    slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
+    sps_t *ps_sps = ps_proc->ps_sps;
+    tile_t *ps_tile_prev;
+    tile_t *ps_tile = ps_proc->ps_tile;
+    WORD32 cur_pu_idx;
+    WORD32 tile_start_ctb_idx, slice_start_ctb_idx;
+    WORD16 i2_wd_in_ctb;
+    WORD32 continuous_tiles = 0;
+    WORD32 cur_ctb_idx;
+    WORD32 check_tile_wd;
+
+    if((0 != ps_tile->u1_pos_x) && (0 != ps_tile->u1_pos_y))
+    {
+        ps_tile_prev = ps_tile - 1;
+    }
+    else
+    {
+        ps_tile_prev = ps_tile;
+    }
+
+
+    check_tile_wd = ps_slice_hdr->i2_ctb_x + ps_tile_prev->u2_wd;
+    if(!(((check_tile_wd >= ps_sps->i2_pic_wd_in_ctb) && (check_tile_wd % ps_sps->i2_pic_wd_in_ctb == ps_tile->u1_pos_x))
+                                    || ((ps_slice_hdr->i2_ctb_x == ps_tile->u1_pos_x))))
+    {
+        continuous_tiles = 1;
+    }
+
+    slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x + (ps_slice_hdr->i2_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+    tile_start_ctb_idx = ps_tile->u1_pos_x + (ps_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+
+    if((slice_start_ctb_idx < tile_start_ctb_idx) && (continuous_tiles))
+    {
+        //Slices span across multiple tiles.
+        i2_wd_in_ctb = ps_sps->i2_pic_wd_in_ctb;
+    }
+    else
+    {
+        i2_wd_in_ctb = ps_tile->u2_wd;
+    }
+    cur_ctb_idx = ps_proc->i4_ctb_x
+                    + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+    /* Ctb level availability */
+    /* Bottom left will not be available at a CTB level, so there is no need to pass it */
+    ps_proc->u1_top_ctb_avail = 1;
+    ps_proc->u1_left_ctb_avail = 1;
+    ps_proc->u1_top_lt_ctb_avail = 1;
+    ps_proc->u1_top_rt_ctb_avail = 1;
+    /* slice and tile boundaries */
+
+    if((0 == ps_proc->i4_ctb_y) || (0 == ps_proc->i4_ctb_tile_y))
+    {
+        ps_proc->u1_top_ctb_avail = 0;
+        ps_proc->u1_top_lt_ctb_avail = 0;
+        ps_proc->u1_top_rt_ctb_avail = 0;
+    }
+
+    if((0 == ps_proc->i4_ctb_x) || (0 == ps_proc->i4_ctb_tile_x))
+    {
+        ps_proc->u1_left_ctb_avail = 0;
+        ps_proc->u1_top_lt_ctb_avail = 0;
+        if((0 == ps_proc->i4_ctb_slice_y) || (0 == ps_proc->i4_ctb_tile_y))
+        {
+            ps_proc->u1_top_ctb_avail = 0;
+            if((i2_wd_in_ctb - 1) != ps_proc->i4_ctb_slice_x)
+            {
+                ps_proc->u1_top_rt_ctb_avail = 0;
+            }
+        }
+    }
+    /* For slices not beginning at the start of a CTB row */
+    else if(ps_proc->i4_ctb_x > 0)
+    {
+        if((0 == ps_proc->i4_ctb_slice_y) || (0 == ps_proc->i4_ctb_tile_y))
+        {
+            ps_proc->u1_top_ctb_avail = 0;
+            ps_proc->u1_top_lt_ctb_avail = 0;
+            if(0 == ps_proc->i4_ctb_slice_x)
+            {
+                ps_proc->u1_left_ctb_avail = 0;
+            }
+            if((i2_wd_in_ctb - 1) != ps_proc->i4_ctb_slice_x)
+            {
+                ps_proc->u1_top_rt_ctb_avail = 0;
+            }
+        }
+        else if((1 == ps_proc->i4_ctb_slice_y) && (0 == ps_proc->i4_ctb_slice_x))
+        {
+            ps_proc->u1_top_lt_ctb_avail = 0;
+        }
+    }
+
+    if((ps_proc->i4_ctb_x == (ps_sps->i2_pic_wd_in_ctb - 1)) || ((ps_tile->u2_wd - 1) == ps_proc->i4_ctb_tile_x))
+    {
+        ps_proc->u1_top_rt_ctb_avail = 0;
+    }
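+
+    /* Illustrative example (added note): for the first CTB of a slice that
+     * starts mid-row, i4_ctb_slice_x and i4_ctb_slice_y are both 0, so the
+     * left, top and top-left neighbours are flagged unavailable even though
+     * they exist in the picture; prediction then falls back to the standard
+     * unavailable-neighbour substitution rather than crossing the slice
+     * boundary. */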
+
+
+#if 0
+    if((((0 == ps_proc->i4_ctb_slice_x)
+         && (0 == ps_proc->i4_ctb_slice_y))
+        || (0 == ps_proc->i4_ctb_tile_x)))
+    {
+        ps_proc->u1_left_ctb_avail = 0;
+        ps_proc->u1_top_lt_ctb_avail = 0;
+    }
+    if((0 == ps_proc->i4_ctb_slice_y) || (0 == ps_proc->i4_ctb_tile_y))
+    {
+        ps_proc->u1_top_ctb_avail = 0;
+        ps_proc->u1_top_lt_ctb_avail = 0;
+        ps_proc->u1_top_rt_ctb_avail = 0;
+    }
+    /* Image boundaries */
+    if(ps_proc->i4_ctb_x == 0)
+    {
+        ps_proc->u1_left_ctb_avail = 0;
+        ps_proc->u1_top_lt_ctb_avail = 0;
+    }
+    if(ps_proc->i4_ctb_x == (ps_sps->i2_pic_wd_in_ctb - 1))
+    {
+        ps_proc->u1_top_rt_ctb_avail = 0;
+    }
+    if(ps_proc->i4_ctb_y == 0)
+    {
+        ps_proc->u1_top_ctb_avail = 0;
+        ps_proc->u1_top_lt_ctb_avail = 0;
+        ps_proc->u1_top_rt_ctb_avail = 0;
+    }
+#endif
+    {
+        WORD32 next_ctb_idx;
+        next_ctb_idx = cur_ctb_idx + 1;
+
+        if(ps_tile->u2_wd == (ps_proc->i4_ctb_tile_x + 1))
+        {
+            if((ps_proc->i4_ctb_tile_y + 1) == ps_tile->u2_ht)
+            {
+                //Last tile
+                if(((ps_proc->i4_ctb_tile_y + 1 + ps_tile->u1_pos_y) == ps_sps->i2_pic_ht_in_ctb) && ((ps_proc->i4_ctb_tile_x + 1 + ps_tile->u1_pos_x) == ps_sps->i2_pic_wd_in_ctb))
+                {
+                    next_ctb_idx = cur_ctb_idx + 1;
+                }
+                else //Not last tile, but new tile
+                {
+                    tile_t *ps_tile_next = ps_proc->ps_tile + 1;
+                    next_ctb_idx = ps_tile_next->u1_pos_x + (ps_tile_next->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+                }
+            }
+            else //End of each tile row
+            {
+                next_ctb_idx = ((ps_tile->u1_pos_y + ps_proc->i4_ctb_tile_y + 1) * ps_sps->i2_pic_wd_in_ctb) + ps_tile->u1_pos_x;
+            }
+        }
+        ps_proc->i4_next_pu_ctb_cnt = next_ctb_idx;
+        ps_proc->i4_ctb_pu_cnt =
+                        ps_proc->pu4_pic_pu_idx[next_ctb_idx]
+                        - ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+        cur_pu_idx = ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+        ps_proc->i4_ctb_start_pu_idx = cur_pu_idx;
+        ps_proc->ps_pu = &ps_proc->ps_pic_pu[cur_pu_idx];
+    }
+}
+
+void ihevcd_update_ctb_tu_cnt(process_ctxt_t *ps_proc)
+{
+    sps_t *ps_sps = ps_proc->ps_sps;
+    codec_t *ps_codec = ps_proc->ps_codec;
+    WORD32 cur_ctb_idx;
+
+    cur_ctb_idx = ps_proc->i4_ctb_x
+                    + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+    {
+        tile_t *ps_tile;
+        WORD32 next_ctb_tu_idx;
+        ps_tile = ps_proc->ps_tile;
+
+
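+        /* Note (added, inferred from the modulo arithmetic below): in
+         * single-core mode the TU buffer is recycled every RESET_TU_BUF_NCTB
+         * CTBs, so TU indices are taken modulo RESET_TU_BUF_NCTB; with
+         * multiple cores the whole picture's TU buffer is retained and
+         * absolute CTB indices are used. */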
+        if(1 == ps_codec->i4_num_cores)
+        {
+            next_ctb_tu_idx = cur_ctb_idx % RESET_TU_BUF_NCTB + 1;
+            if(ps_tile->u2_wd == (ps_proc->i4_ctb_tile_x + 1))
+            {
+                if((ps_proc->i4_ctb_tile_y + 1) == ps_tile->u2_ht)
+                {
+                    //Last tile
+                    if(((ps_proc->i4_ctb_tile_y + 1 + ps_tile->u1_pos_y) == ps_sps->i2_pic_ht_in_ctb) && ((ps_proc->i4_ctb_tile_x + 1 + ps_tile->u1_pos_x) == ps_sps->i2_pic_wd_in_ctb))
+                    {
+                        next_ctb_tu_idx = (cur_ctb_idx % RESET_TU_BUF_NCTB) + 1;
+                    }
+                    else //Not last tile, but new tile
+                    {
+                        tile_t *ps_tile_next = ps_proc->ps_tile + 1;
+                        next_ctb_tu_idx = ps_tile_next->u1_pos_x + (ps_tile_next->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+                    }
+                }
+                else //End of each tile row
+                {
+                    next_ctb_tu_idx = ((ps_tile->u1_pos_y + ps_proc->i4_ctb_tile_y + 1) * ps_sps->i2_pic_wd_in_ctb) + ps_tile->u1_pos_x;
+                }
+            }
+            ps_proc->i4_next_tu_ctb_cnt = next_ctb_tu_idx;
+            ps_proc->i4_ctb_tu_cnt = ps_proc->pu4_pic_tu_idx[next_ctb_tu_idx] - ps_proc->pu4_pic_tu_idx[cur_ctb_idx % RESET_TU_BUF_NCTB];
+        }
+        else
+        {
+            next_ctb_tu_idx = cur_ctb_idx + 1;
+            if(ps_tile->u2_wd == (ps_proc->i4_ctb_tile_x + 1))
+            {
+                if((ps_proc->i4_ctb_tile_y + 1) == ps_tile->u2_ht)
+                {
+                    //Last tile
+                    if(((ps_proc->i4_ctb_tile_y + 1 + ps_tile->u1_pos_y) == ps_sps->i2_pic_ht_in_ctb) && ((ps_proc->i4_ctb_tile_x + 1 + ps_tile->u1_pos_x) == ps_sps->i2_pic_wd_in_ctb))
+                    {
+                        next_ctb_tu_idx = (cur_ctb_idx % RESET_TU_BUF_NCTB) + 1;
+                    }
+                    else //Not last tile, but new tile
+                    {
+                        tile_t *ps_tile_next = ps_proc->ps_tile + 1;
+                        next_ctb_tu_idx = ps_tile_next->u1_pos_x + (ps_tile_next->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+                    }
+                }
+                else //End of each tile row
+                {
+                    next_ctb_tu_idx = ((ps_tile->u1_pos_y + ps_proc->i4_ctb_tile_y + 1) * ps_sps->i2_pic_wd_in_ctb) + ps_tile->u1_pos_x;
+                }
+            }
+            ps_proc->i4_next_tu_ctb_cnt = next_ctb_tu_idx;
+            ps_proc->i4_ctb_tu_cnt = ps_proc->pu4_pic_tu_idx[next_ctb_tu_idx] -
+                            ps_proc->pu4_pic_tu_idx[cur_ctb_idx];
+        }
+    }
+}
+
+IHEVCD_ERROR_T ihevcd_process(process_ctxt_t *ps_proc)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    codec_t *ps_codec;
+    sps_t *ps_sps = ps_proc->ps_sps;
+
+    WORD32 nctb;
+    WORD32 i;
+    WORD32 idx;
+    WORD32 nop_cnt;
+    WORD32 num_minpu_in_ctb;
+    WORD32 cur_slice_idx, cur_ctb_tile_x, cur_ctb_slice_x, cur_ctb_tile_y, cur_ctb_slice_y;
+    WORD32 nxt_ctb_slice_y, nxt_ctb_slice_x;
+    tu_t *ps_tu_cur, *ps_tu_nxt;
+    UWORD8 *pu1_pu_map_cur, *pu1_pu_map_nxt;
+    WORD32 num_ctb, num_ctb_tmp;
+    proc_type_t proc_type;
+
+
+    WORD32 ctb_size = 1 << ps_sps->i1_log2_ctb_size;
+
+    PROFILE_DISABLE_PROCESS_CTB();
+
+    ps_codec = ps_proc->ps_codec;
+    num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+
+    nctb = MIN(ps_codec->i4_proc_nctb, ps_proc->i4_ctb_cnt);
+    nctb = MIN(nctb, (ps_proc->ps_tile->u2_wd - ps_proc->i4_ctb_tile_x));
+
+    if(ps_proc->i4_cur_slice_idx > (MAX_SLICE_HDR_CNT - 2 * ps_sps->i2_pic_wd_in_ctb))
+    {
+        num_ctb = 1;
+    }
+    else
+    {
+        num_ctb = ps_proc->i4_nctb;
+    }
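+
+    /* Note (added): slice headers live in a circular buffer of
+     * MAX_SLICE_HDR_CNT entries; when the slice index approaches that limit
+     * the batch size drops to a single CTB, so slice header transitions are
+     * still tracked at CTB granularity. */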
+    nxt_ctb_slice_y = ps_proc->i4_ctb_slice_y;
+    nxt_ctb_slice_x = ps_proc->i4_ctb_slice_x;
+    pu1_pu_map_nxt = ps_proc->pu1_pu_map;
+    ps_tu_nxt = ps_proc->ps_tu;
+
+    while(ps_proc->i4_ctb_cnt)
+    {
+        ps_proc->i4_ctb_slice_y = nxt_ctb_slice_y;
+        ps_proc->i4_ctb_slice_x = nxt_ctb_slice_x;
+        ps_proc->pu1_pu_map = pu1_pu_map_nxt;
+        ps_proc->ps_tu = ps_tu_nxt;
+
+        cur_ctb_tile_x = ps_proc->i4_ctb_tile_x;
+        cur_ctb_tile_y = ps_proc->i4_ctb_tile_y;
+        cur_ctb_slice_x = ps_proc->i4_ctb_slice_x;
+        cur_ctb_slice_y = ps_proc->i4_ctb_slice_y;
+        cur_slice_idx = ps_proc->i4_cur_slice_idx;
+        ps_tu_cur = ps_proc->ps_tu;
+        pu1_pu_map_cur = ps_proc->pu1_pu_map;
+        proc_type = PROC_INTER_PRED;
+
+        if(ps_proc->i4_ctb_cnt < num_ctb)
+        {
+            num_ctb = ps_proc->i4_ctb_cnt;
+        }
+#ifdef GPU_BUILD
+        num_ctb = MIN(num_ctb, (ps_proc->ps_tile->u2_wd - ps_proc->i4_ctb_tile_x));
+#endif
+        num_ctb_tmp = num_ctb;
+
+        while(num_ctb_tmp)
+        {
+            slice_header_t *ps_slice_hdr;
+            tile_t *ps_tile = ps_proc->ps_tile;
+
+            /* Wait for parsing to be done */
+            {
+
+
+                nop_cnt = PROC_NOP_CNT;
+                if(ps_proc->i4_check_parse_status || ps_proc->i4_check_proc_status)
+                {
+                    while(1)
+                    {
+                        volatile UWORD8 *pu1_buf;
+                        volatile WORD32 status;
+                        status = 1;
+#ifdef GPU_BUILD
+                        /* If GPU is enabled, don't check for the status of parsing,
+                         * since processing starts only after waiting for MC, which
+                         * means parsing is already done. */
+                        //TODO GPU : Also remove the flag being updated in parsing
+#endif
+                        /* Check if all dependencies for the next nCTBs are met */
+#ifndef GPU_BUILD
+                        /* Check if the next nCTBs are parsed */
+                        if(ps_proc->i4_check_parse_status)
+                        {
+                            idx = (ps_proc->i4_ctb_x + nctb - 1);
+                            idx += (ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+                            pu1_buf = (ps_codec->pu1_parse_map + idx);
+                            status = *pu1_buf;
+                        }
+#endif
+
+                        if(status)
+                            break;
+
+                        /* If dependencies are not met, then wait for a few cycles.
+                         * If they are still not met after a few iterations, yield the thread
+                         */
+                        if(nop_cnt > 0)
+                        {
+                            NOP(128);
+                            nop_cnt -= 128;
+                        }
+                        else
+                        {
+                            nop_cnt = PROC_NOP_CNT;
+                            ithread_yield();
+                        }
+                    }
+                }
+            }
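+            /* A minimal sketch (not decoder code) of the spin-then-yield wait
+             * pattern used above; is_ready() stands in for the parse-status
+             * check and is an assumption for illustration:
+             *
+             *     WORD32 budget = PROC_NOP_CNT;
+             *     while(!is_ready())
+             *     {
+             *         if(budget > 0) { NOP(128); budget -= 128; }
+             *         else { budget = PROC_NOP_CNT; ithread_yield(); }
+             *     }
+             */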
+
+            /* Check proc map to ensure dependencies for inter pred are met */
+            ihevcd_proc_map_check(ps_proc, proc_type, nctb);
+
+            ihevcd_slice_hdr_update(ps_proc);
+            ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+            //ihevcd_mv_prediction();
+            //ihevcd_lvl_unpack();
+            //ihevcd_inter_iq_it_recon();
+            //Following does prediction, iq, it and recon on a TU by TU basis for intra TUs
+            //ihevcd_intra_process();
+            //ihevcd_ctb_boundary_strength_islice(ps_proc, ctb_size);
+            //ihevcd_deblk_ctb(ps_proc);
+
+            /* CTB availability update, MV prediction and inter prediction */
+            {
+                UWORD32 *pu4_ctb_top_pu_idx, *pu4_ctb_left_pu_idx, *pu4_ctb_top_left_pu_idx;
+                WORD32 cur_ctb_idx;
+
+                ihevcd_ctb_avail_update(ps_proc);
+
+#if DEBUG_DUMP_FRAME_BUFFERS_INFO
+                au1_pic_avail_ctb_flags[ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb] =
+                                ((ps_proc->u1_top_ctb_avail << 3) | (ps_proc->u1_left_ctb_avail << 2) | (ps_proc->u1_top_lt_ctb_avail << 1) | (ps_proc->u1_top_rt_ctb_avail));
+                au4_pic_ctb_slice_xy[ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb] =
+                                (((UWORD16)ps_proc->i4_ctb_slice_x << 16) | ((UWORD16)ps_proc->i4_ctb_slice_y));
+#endif
+
+                /*************************************************/
+                /****************   MV pred **********************/
+                /*************************************************/
+                if(PSLICE == ps_slice_hdr->i1_slice_type
+                                || BSLICE == ps_slice_hdr->i1_slice_type)
+                {
+                    mv_ctxt_t s_mv_ctxt;
+
+                    pu4_ctb_top_pu_idx = ps_proc->pu4_pic_pu_idx_top
+                                    + (ps_proc->i4_ctb_x * ctb_size / MIN_PU_SIZE);
+                    pu4_ctb_left_pu_idx = ps_proc->pu4_pic_pu_idx_left;
+                    pu4_ctb_top_left_pu_idx = &ps_proc->u4_ctb_top_left_pu_idx;
+
+                    /* Initializing s_mv_ctxt */
+                    if(ps_codec->i4_num_cores > MV_PRED_NUM_CORES_THRESHOLD)
+                    {
+                        s_mv_ctxt.ps_pps = ps_proc->ps_pps;
+                        s_mv_ctxt.ps_sps = ps_proc->ps_sps;
+                        s_mv_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
+                        s_mv_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
+                        s_mv_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
+                        s_mv_ctxt.ps_pu = ps_proc->ps_pu;
+                        s_mv_ctxt.ps_pic_pu = ps_proc->ps_pic_pu;
+                        s_mv_ctxt.ps_tile = ps_tile;
+                        s_mv_ctxt.pu4_pic_pu_idx_map = ps_proc->pu4_pic_pu_idx_map;
+                        s_mv_ctxt.pu4_pic_pu_idx = ps_proc->pu4_pic_pu_idx;
+                        s_mv_ctxt.pu1_pic_pu_map = ps_proc->pu1_pic_pu_map;
+                        s_mv_ctxt.i4_ctb_pu_cnt = ps_proc->i4_ctb_pu_cnt;
+                        s_mv_ctxt.i4_ctb_start_pu_idx = ps_proc->i4_ctb_start_pu_idx;
+                        s_mv_ctxt.u1_top_ctb_avail = ps_proc->u1_top_ctb_avail;
+                        s_mv_ctxt.u1_top_rt_ctb_avail = ps_proc->u1_top_rt_ctb_avail;
+                        s_mv_ctxt.u1_top_lt_ctb_avail = ps_proc->u1_top_lt_ctb_avail;
+                        s_mv_ctxt.u1_left_ctb_avail = ps_proc->u1_left_ctb_avail;
+
+                        ihevcd_get_mv_ctb(&s_mv_ctxt, pu4_ctb_top_pu_idx,
+                                          pu4_ctb_left_pu_idx, pu4_ctb_top_left_pu_idx);
+                    }
+
+                    ihevcd_inter_pred_ctb(ps_proc);
+                }
+                else if(ps_codec->i4_num_cores > MV_PRED_NUM_CORES_THRESHOLD)
+                {
+                    WORD32 next_ctb_idx, num_pu_per_ctb, ctb_start_pu_idx, pu_cnt;
+                    pu_t *ps_pu;
+                    WORD32 num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+                    UWORD8 *pu1_pic_pu_map_ctb = ps_proc->pu1_pic_pu_map +
+                                    (ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb) * num_minpu_in_ctb;
+                    WORD32 row, col;
+                    UWORD32 *pu4_nbr_pu_idx = ps_proc->pu4_pic_pu_idx_map;
+                    WORD32 nbr_pu_idx_strd = MAX_CTB_SIZE / MIN_PU_SIZE + 2;
+
+                    for(row = 0; row < ctb_size / MIN_PU_SIZE; row++)
+                    {
+                        for(col = 0; col < ctb_size / MIN_PU_SIZE; col++)
+                        {
+                            pu1_pic_pu_map_ctb[row * ctb_size / MIN_PU_SIZE + col] = 0;
+                        }
+                    }
+                    /* Neighbor PU idx update inside CTB */
+                    /* 1byte per 4x4. Indicates the PU idx that 4x4 block belongs to */
+
+                    cur_ctb_idx = ps_proc->i4_ctb_x
+                                    + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+                    next_ctb_idx = ps_proc->i4_next_pu_ctb_cnt;
+                    num_pu_per_ctb = ps_proc->pu4_pic_pu_idx[next_ctb_idx]
+                                    - ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+                    ctb_start_pu_idx = ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+                    ps_pu = &ps_proc->ps_pic_pu[ctb_start_pu_idx];
+
+                    for(pu_cnt = 0; pu_cnt < num_pu_per_ctb; pu_cnt++, ps_pu++)
+                    {
+                        UWORD32 cur_pu_idx;
+                        WORD32 pu_ht = (ps_pu->b4_ht + 1) << 2;
+                        WORD32 pu_wd = (ps_pu->b4_wd + 1) << 2;
+
+                        cur_pu_idx = ctb_start_pu_idx + pu_cnt;
+
+                        for(row = 0; row < pu_ht / MIN_PU_SIZE; row++)
+                            for(col = 0; col < pu_wd / MIN_PU_SIZE; col++)
+                                pu4_nbr_pu_idx[(1 + ps_pu->b4_pos_x + col)
+                                                + (1 + ps_pu->b4_pos_y + row)
+                                                * nbr_pu_idx_strd] =
+                                                cur_pu_idx;
+                    }
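+                    /* Worked example (assuming MAX_CTB_SIZE = 64, MIN_PU_SIZE = 4):
+                     * the neighbour map is an 18x18 grid (16x16 4x4 blocks plus a
+                     * one-entry top/left border, so nbr_pu_idx_strd = 18). A PU at
+                     * b4_pos_x = 4, b4_pos_y = 0 with pu_wd = pu_ht = 8 fills the
+                     * 2x2 entries at columns 5..6, rows 1..2 with its PU index. */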
+
+                    /* Updating Top and Left pointers */
+                    {
+                        WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples
+                                        - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size);
+                        WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
+
+                        /* Top Left */
+                        /* saving top left before updating top ptr, as updating top ptr will overwrite the top left for the next ctb */
+                        ps_proc->u4_ctb_top_left_pu_idx = ps_proc->pu4_pic_pu_idx_top[((ps_proc->i4_ctb_x + 1) * ctb_size / MIN_PU_SIZE) - 1];
+                        for(i = 0; i < ctb_size / MIN_PU_SIZE; i++)
+                        {
+                            /* Left */
+                            /* Last column of au4_nbr_pu_idx */
+                            ps_proc->pu4_pic_pu_idx_left[i] =
+                                            pu4_nbr_pu_idx[(ctb_size / MIN_PU_SIZE) + (i + 1) * nbr_pu_idx_strd];
+                            /* Top */
+                            /* Last row of au4_nbr_pu_idx */
+                            ps_proc->pu4_pic_pu_idx_top[(ps_proc->i4_ctb_x * ctb_size / MIN_PU_SIZE) + i] =
+                                            pu4_nbr_pu_idx[(ctb_size_left / MIN_PU_SIZE) * nbr_pu_idx_strd + i + 1];
+
+                        }
+                    }
+                }
+            }
+
+            if(ps_proc->ps_pps->i1_tiles_enabled_flag)
+            {
+                /* Update the tile index buffer with tile information for the current CTB */
+                UWORD16 *pu2_tile_idx = ps_proc->pu1_tile_idx;
+                pu2_tile_idx[(ps_proc->i4_ctb_x + (ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb))]
+                                = ps_proc->i4_cur_tile_idx;
+            }
+
+            /*************************************************/
+            /*********** BS, QP and Deblocking  **************/
+            /*************************************************/
+            /* Boundary strength call has to be after IQ IT recon since QP population needs ps_proc->i4_qp_const_inc_ctb flag */
+
+            {
+                slice_header_t *ps_slice_hdr;
+                ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+
+                /* Check if deblock is disabled for the current slice or if it is disabled for the current picture
+                 * because of disable deblock api
+                 */
+                if(0 == ps_codec->i4_disable_deblk_pic)
+                {
+                    if(ps_codec->i4_num_cores > MV_PRED_NUM_CORES_THRESHOLD)
+                    {
+                        if((0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag) &&
+                                        (0 == ps_codec->i4_slice_error))
+                        {
+                            ihevcd_update_ctb_tu_cnt(ps_proc);
+                            ps_proc->s_bs_ctxt.ps_pps = ps_proc->ps_pps;
+                            ps_proc->s_bs_ctxt.ps_sps = ps_proc->ps_sps;
+                            ps_proc->s_bs_ctxt.ps_codec = ps_proc->ps_codec;
+                            ps_proc->s_bs_ctxt.i4_ctb_tu_cnt = ps_proc->i4_ctb_tu_cnt;
+                            ps_proc->s_bs_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
+                            ps_proc->s_bs_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
+                            ps_proc->s_bs_ctxt.i4_ctb_tile_x = ps_proc->i4_ctb_tile_x;
+                            ps_proc->s_bs_ctxt.i4_ctb_tile_y = ps_proc->i4_ctb_tile_y;
+                            ps_proc->s_bs_ctxt.i4_ctb_slice_x = ps_proc->i4_ctb_slice_x;
+                            ps_proc->s_bs_ctxt.i4_ctb_slice_y = ps_proc->i4_ctb_slice_y;
+                            ps_proc->s_bs_ctxt.ps_tu = ps_proc->ps_tu;
+                            ps_proc->s_bs_ctxt.ps_pu = ps_proc->ps_pu;
+                            ps_proc->s_bs_ctxt.pu4_pic_pu_idx_map = ps_proc->pu4_pic_pu_idx_map;
+                            ps_proc->s_bs_ctxt.i4_next_pu_ctb_cnt = ps_proc->i4_next_pu_ctb_cnt;
+                            ps_proc->s_bs_ctxt.i4_next_tu_ctb_cnt = ps_proc->i4_next_tu_ctb_cnt;
+                            ps_proc->s_bs_ctxt.pu1_slice_idx = ps_proc->pu1_slice_idx;
+                            ps_proc->s_bs_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
+                            ps_proc->s_bs_ctxt.ps_tile = ps_proc->ps_tile;
+
+                            if(ISLICE == ps_slice_hdr->i1_slice_type)
+                            {
+                                ihevcd_ctb_boundary_strength_islice(&ps_proc->s_bs_ctxt);
+                            }
+                            else
+                            {
+                                ihevcd_ctb_boundary_strength_pbslice(&ps_proc->s_bs_ctxt);
+                            }
+                        }
+                        else
+                        {
+                            WORD32 vert_bs_strd = ps_sps->i2_pic_wd_in_ctb * (ctb_size * ctb_size / 8 / 16);
+                            WORD32 horz_bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) * (ctb_size * ctb_size / 8 / 16);
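+                            /* BS storage arithmetic: boundary strength uses 2 bits per
+                             * 4-sample edge segment on an 8-sample edge grid, i.e.
+                             * (ctb_size / 8) * (ctb_size / 4) segments * 2 bits per CTB
+                             * = ctb_size * ctb_size / 8 / 16 bytes (32 bytes for a 64x64
+                             * CTB); the vertical buffer carries one extra edge column. */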
+                            UWORD32 *pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_proc->s_bs_ctxt.pu4_pic_vert_bs +
+                                            ps_proc->i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
+                                            ps_proc->i4_ctb_y * vert_bs_strd);
+                            UWORD32 *pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_proc->s_bs_ctxt.pu4_pic_horz_bs +
+                                            ps_proc->i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
+                                            ps_proc->i4_ctb_y * horz_bs_strd);
+
+                            memset(pu4_vert_bs, 0, (ctb_size / 8 + 1) * (ctb_size / 4) / 8 * 2);
+                            memset(pu4_horz_bs, 0, (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
+
+                        }
+                    }
+                }
+            }
+
+            /* Per CTB update the following */
+            {
+                WORD32 cur_ctb_idx = ps_proc->i4_ctb_x
+                                + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+                cur_ctb_idx++;
+
+                ps_proc->pu1_pu_map += nctb * num_minpu_in_ctb;
+                ps_proc->ps_tu += ps_proc->i4_ctb_tu_cnt;
+                if((1 == ps_codec->i4_num_cores) &&
+                                (0 == cur_ctb_idx % RESET_TU_BUF_NCTB))
+                {
+                    ps_proc->ps_tu = ps_proc->ps_pic_tu;
+                }
+                ps_proc->ps_pu += ps_proc->i4_ctb_pu_cnt;
+            }
+
+            /* Update proc map for recon*/
+            ihevcd_proc_map_update(ps_proc, proc_type, nctb);
+
+            num_ctb_tmp -= nctb;
+            ihevcd_ctb_pos_update(ps_proc, nctb);
+
+        }
+
+        if(cur_slice_idx != ps_proc->i4_cur_slice_idx)
+        {
+#ifdef GPU_BUILD
+            //TODO GPU : Later define it for ARM only version as well
+            ps_proc->ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#else
+            ps_proc->ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#endif
+            ps_proc->i4_cur_slice_idx = cur_slice_idx;
+        }
+        /* Restore the saved variables  */
+        num_ctb_tmp = num_ctb;
+        ps_proc->i4_ctb_x -= num_ctb;
+        ps_proc->i4_ctb_tile_x = cur_ctb_tile_x;
+        ps_proc->i4_ctb_slice_x = cur_ctb_slice_x;
+        ps_proc->i4_ctb_tile_y = cur_ctb_tile_y;
+        ps_proc->i4_ctb_slice_y = cur_ctb_slice_y;
+        ps_proc->pu1_pu_map = pu1_pu_map_cur;
+        ps_proc->ps_tu = ps_tu_cur;
+        proc_type = PROC_RECON;
+
+        while(num_ctb_tmp)
+        {
+
+            /* Check proc map to ensure dependencies for recon are met */
+            ihevcd_proc_map_check(ps_proc, proc_type, nctb);
+
+            ihevcd_slice_hdr_update(ps_proc);
+
+            {
+
+                ihevcd_ctb_avail_update(ps_proc);
+
+                /*************************************************/
+                /**************** IQ IT RECON  *******************/
+                /*************************************************/
+
+                ihevcd_update_ctb_tu_cnt(ps_proc);
+
+                /* When the scaling matrix is not to be used (scaling_list_enable_flag is
+                 * zero in SPS), a default value of 16 has to be used. Since the value is
+                 * the same for all sizes, the same flat table is used for all cases.
+                 */
+                if(0 == ps_sps->i1_scaling_list_enable_flag)
+                {
+                    ps_proc->api2_dequant_intra_matrix[0] =
+                                    (WORD16 *)gi2_flat_scale_mat_32x32;
+                    ps_proc->api2_dequant_intra_matrix[1] =
+                                    (WORD16 *)gi2_flat_scale_mat_32x32;
+                    ps_proc->api2_dequant_intra_matrix[2] =
+                                    (WORD16 *)gi2_flat_scale_mat_32x32;
+                    ps_proc->api2_dequant_intra_matrix[3] =
+                                    (WORD16 *)gi2_flat_scale_mat_32x32;
+
+                    ps_proc->api2_dequant_inter_matrix[0] =
+                                    (WORD16 *)gi2_flat_scale_mat_32x32;
+                    ps_proc->api2_dequant_inter_matrix[1] =
+                                    (WORD16 *)gi2_flat_scale_mat_32x32;
+                    ps_proc->api2_dequant_inter_matrix[2] =
+                                    (WORD16 *)gi2_flat_scale_mat_32x32;
+                    ps_proc->api2_dequant_inter_matrix[3] =
+                                    (WORD16 *)gi2_flat_scale_mat_32x32;
+                }
+                else
+                {
+                    if(0 == ps_sps->i1_sps_scaling_list_data_present_flag)
+                    {
+                        ps_proc->api2_dequant_intra_matrix[0] =
+                                        (WORD16 *)gi2_flat_scale_mat_32x32;
+                        ps_proc->api2_dequant_intra_matrix[1] =
+                                        (WORD16 *)gi2_intra_default_scale_mat_8x8;
+                        ps_proc->api2_dequant_intra_matrix[2] =
+                                        (WORD16 *)gi2_intra_default_scale_mat_16x16;
+                        ps_proc->api2_dequant_intra_matrix[3] =
+                                        (WORD16 *)gi2_intra_default_scale_mat_32x32;
+
+                        ps_proc->api2_dequant_inter_matrix[0] =
+                                        (WORD16 *)gi2_flat_scale_mat_32x32;
+                        ps_proc->api2_dequant_inter_matrix[1] =
+                                        (WORD16 *)gi2_inter_default_scale_mat_8x8;
+                        ps_proc->api2_dequant_inter_matrix[2] =
+                                        (WORD16 *)gi2_inter_default_scale_mat_16x16;
+                        ps_proc->api2_dequant_inter_matrix[3] =
+                                        (WORD16 *)gi2_inter_default_scale_mat_32x32;
+                    }
+                    /*TODO: Add support for custom scaling matrices */
+                }
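+                /* Index mapping assumed by the tables above: api2_dequant_*_matrix[0..3]
+                 * correspond to 4x4, 8x8, 16x16 and 32x32 TUs respectively, which is why
+                 * the default matrices of those sizes are wired to indices 1..3 (4x4 has
+                 * no default list and always uses the flat matrix). */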
+
+
+                /* CTB Level pointers */
+                ps_proc->pu1_cur_ctb_luma = ps_proc->pu1_cur_pic_luma
+                                + (ps_proc->i4_ctb_x * ctb_size
+                                + ps_proc->i4_ctb_y * ctb_size
+                                * ps_codec->i4_strd);
+                ps_proc->pu1_cur_ctb_chroma = ps_proc->pu1_cur_pic_chroma
+                                + ps_proc->i4_ctb_x * ctb_size
+                                + (ps_proc->i4_ctb_y * ctb_size * ps_codec->i4_strd / 2);
+#if DEBUG_PRINT_IQ_IT_RECON
+                printf("\nCTB x=%d, y=%d", ps_proc->i4_ctb_x, ps_proc->i4_ctb_y);
+                printf("\n CTB size= %d,CTB level availability: L=%d,TL=%d,TR=%d,T=%d",
+                       ctb_size, ps_proc->u1_left_ctb_avail, ps_proc->u1_top_lt_ctb_avail, ps_proc->u1_top_rt_ctb_avail,
+                       ps_proc->u1_top_ctb_avail);
+#endif
+
+                ihevcd_iquant_itrans_recon_ctb(ps_proc);
+            }
+
+            /* Per CTB update the following */
+            {
+                WORD32 cur_ctb_idx = ps_proc->i4_ctb_x
+                                + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+                cur_ctb_idx++;
+
+                ps_proc->pu1_pu_map += nctb * num_minpu_in_ctb;
+                ps_proc->ps_tu += ps_proc->i4_ctb_tu_cnt;
+                if((1 == ps_codec->i4_num_cores) &&
+                                (0 == cur_ctb_idx % RESET_TU_BUF_NCTB))
+                {
+                    ps_proc->ps_tu = ps_proc->ps_pic_tu;
+                }
+                ps_proc->ps_pu += ps_proc->i4_ctb_pu_cnt;
+            }
+
+
+            /* Update proc map for recon*/
+            ihevcd_proc_map_update(ps_proc, proc_type, nctb);
+
+            num_ctb_tmp -= nctb;
+            ihevcd_ctb_pos_update(ps_proc, nctb);
+        }
+
+        if(cur_slice_idx != ps_proc->i4_cur_slice_idx)
+        {
+#ifdef GPU_BUILD
+            //TODO GPU : Later define it for ARM only version as well
+            ps_proc->ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#else
+            ps_proc->ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#endif
+            ps_proc->i4_cur_slice_idx = cur_slice_idx;
+        }
+        /* Restore the saved variables  */
+        num_ctb_tmp = num_ctb;
+        ps_proc->i4_ctb_x -= num_ctb;
+        ps_proc->i4_ctb_tile_x = cur_ctb_tile_x;
+        ps_proc->i4_ctb_slice_x = cur_ctb_slice_x;
+        ps_proc->i4_ctb_tile_y = cur_ctb_tile_y;
+        ps_proc->i4_ctb_slice_y = cur_ctb_slice_y;
+        pu1_pu_map_nxt = ps_proc->pu1_pu_map;
+        ps_tu_nxt = ps_proc->ps_tu;
+        ps_proc->pu1_pu_map = pu1_pu_map_cur;
+        ps_proc->ps_tu = ps_tu_cur;
+        proc_type = PROC_DEBLK;
+
+        while(num_ctb_tmp)
+        {
+            slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+            /* Check proc map to ensure dependencies for deblk are met */
+            ihevcd_proc_map_check(ps_proc, proc_type, nctb);
+
+            ihevcd_slice_hdr_update(ps_proc);
+            ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+            if((0 == FRAME_ILF_PAD || ps_codec->i4_num_cores != 1) &&
+               (0 == ps_codec->i4_disable_deblk_pic))
+            {
+                WORD32 i4_is_last_ctb_x = 0;
+                WORD32 i4_is_last_ctb_y = 0;
+
+                if(0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag ||
+                                (ps_proc->i4_ctb_slice_x == 0) ||
+                                (ps_proc->i4_ctb_slice_y == 0))
+                {
+                    ps_proc->s_deblk_ctxt.ps_pps = ps_proc->ps_pps;
+                    ps_proc->s_deblk_ctxt.ps_sps = ps_proc->ps_sps;
+                    ps_proc->s_deblk_ctxt.ps_codec = ps_proc->ps_codec;
+                    ps_proc->s_deblk_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
+                    ps_proc->s_deblk_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
+                    ps_proc->s_deblk_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
+                    ps_proc->s_deblk_ctxt.pu1_slice_idx = ps_proc->pu1_slice_idx;
+                    ps_proc->s_deblk_ctxt.is_chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
+
+                    /* Populating Current CTB's no_loop_filter flags */
+                    {
+                        WORD32 row;
+                        WORD32 log2_ctb_size = ps_sps->i1_log2_ctb_size;
+
+                        /* Loop filter stride in units of bits */
+                        WORD32 loop_filter_strd = ((ps_sps->i2_pic_width_in_luma_samples + 63) >> 6) << 3;
+                        /* Bit position is the current 8x8 block's bit offset wrt pic_no_loop_filter.
+                         * bit_pos has to be a WORD32 so that when it is negative, the downshift still keeps it negative */
+                        WORD32 bit_pos = ((ps_proc->i4_ctb_y << (log2_ctb_size - 3)) - 1) * loop_filter_strd + (ps_proc->i4_ctb_x << (log2_ctb_size - 3)) - 1;
+
+                        for(row = 0; row < (ctb_size >> 3) + 1; row++)
+                        {
+                            /* Go to the corresponding byte - read 32 bits and downshift */
+                            ps_proc->s_deblk_ctxt.au2_ctb_no_loop_filter_flag[row] = (*(UWORD32 *)(ps_proc->pu1_pic_no_loop_filter_flag + (bit_pos >> 3))) >> (bit_pos & 7);
+                            bit_pos += loop_filter_strd;
+                        }
+                    }
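+                    /* Worked example (assumed numbers): for a 1920-wide picture,
+                     * loop_filter_strd = ((1920 + 63) >> 6) << 3 = 240 bits per 8x8 row.
+                     * With log2_ctb_size = 6, CTB (x=2, y=1) gives
+                     * bit_pos = (8 - 1) * 240 + 16 - 1 = 1695, i.e. one 8x8 row above
+                     * and one 8x8 column left of the CTB, where filtering resumes. */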
+
+                    ihevcd_deblk_ctb(&ps_proc->s_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+
+                    /* If the last CTB in the row was a complete CTB then deblocking has to be called for the remaining pixels, since deblocking
+                     * is applied on a shifted CTB structure
+                     */
+                    if(ps_proc->i4_ctb_x == ps_sps->i2_pic_wd_in_ctb - 1)
+                    {
+                        WORD32 i4_is_last_ctb_x = 1;
+                        WORD32 i4_is_last_ctb_y = 0;
+
+                        WORD32 last_x_pos;
+                        last_x_pos = (ps_sps->i2_pic_wd_in_ctb << ps_sps->i1_log2_ctb_size);
+                        if(last_x_pos  ==  ps_sps->i2_pic_width_in_luma_samples)
+                        {
+                            ihevcd_deblk_ctb(&ps_proc->s_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+                        }
+                    }
+
+
+                    /* If the last CTB in the column was a complete CTB then deblocking has to be called for the remaining pixels, since deblocking
+                     * is applied on a shifted CTB structure
+                     */
+                    if(ps_proc->i4_ctb_y == ps_sps->i2_pic_ht_in_ctb - 1)
+                    {
+                        WORD32 i4_is_last_ctb_x = 0;
+                        WORD32 i4_is_last_ctb_y = 1;
+                        WORD32 last_y_pos;
+                        last_y_pos = (ps_sps->i2_pic_ht_in_ctb << ps_sps->i1_log2_ctb_size);
+                        if(last_y_pos == ps_sps->i2_pic_height_in_luma_samples)
+                        {
+                            ihevcd_deblk_ctb(&ps_proc->s_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+                        }
+                    }
+                }
+            }
+
+            /* Update proc map for deblk*/
+            ihevcd_proc_map_update(ps_proc, proc_type, nctb);
+
+            num_ctb_tmp -= nctb;
+            ihevcd_ctb_pos_update(ps_proc, nctb);
+        }
+
+        if(cur_slice_idx != ps_proc->i4_cur_slice_idx)
+        {
+#ifdef GPU_BUILD
+            //TODO GPU : Later define it for ARM only version as well
+            ps_proc->ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#else
+            ps_proc->ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#endif
+            ps_proc->i4_cur_slice_idx = cur_slice_idx;
+        }
+        /* Restore the saved variables  */
+        num_ctb_tmp = num_ctb;
+        ps_proc->i4_ctb_x -= num_ctb;
+        ps_proc->i4_ctb_tile_x = cur_ctb_tile_x;
+        ps_proc->i4_ctb_tile_y = cur_ctb_tile_y;
+        ps_proc->pu1_pu_map = pu1_pu_map_cur;
+        ps_proc->ps_tu = ps_tu_cur;
+        nxt_ctb_slice_y = ps_proc->i4_ctb_slice_y;
+        nxt_ctb_slice_x = ps_proc->i4_ctb_slice_x;
+        ps_proc->i4_ctb_slice_y = cur_ctb_slice_y;
+        ps_proc->i4_ctb_slice_x = cur_ctb_slice_x;
+        proc_type = PROC_SAO;
+
+        while(num_ctb_tmp)
+        {
+            slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+            /* Check proc map to ensure dependencies for SAO are met */
+            ihevcd_proc_map_check(ps_proc, proc_type, nctb);
+
+            ihevcd_slice_hdr_update(ps_proc);
+            ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+            if(0 == FRAME_ILF_PAD || ps_codec->i4_num_cores != 1)
+            {
+                if((0 == ps_codec->i4_disable_sao_pic) &&
+                                (ps_slice_hdr->i1_slice_sao_luma_flag || ps_slice_hdr->i1_slice_sao_chroma_flag))
+                {
+                    ps_proc->s_sao_ctxt.ps_pps = ps_proc->ps_pps;
+                    ps_proc->s_sao_ctxt.ps_sps = ps_proc->ps_sps;
+                    ps_proc->s_sao_ctxt.ps_tile = ps_proc->ps_tile;
+                    ps_proc->s_sao_ctxt.ps_codec = ps_proc->ps_codec;
+                    ps_proc->s_sao_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
+                    ps_proc->s_sao_ctxt.i4_cur_slice_idx = ps_proc->i4_cur_slice_idx;
+
+
+#if SAO_PROCESS_SHIFT_CTB
+                    ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
+                    ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
+                    ps_proc->s_sao_ctxt.is_chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
+
+                    ihevcd_sao_shift_ctb(&ps_proc->s_sao_ctxt);
+#else
+                    if(ps_proc->i4_ctb_x > 1 && ps_proc->i4_ctb_y > 0)
+                    {
+                        ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x - 2;
+                        ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y - 1;
+
+                        ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
+                    }
+
+                    if(ps_sps->i2_pic_wd_in_ctb - 1 == ps_proc->i4_ctb_x && ps_proc->i4_ctb_y > 0)
+                    {
+                        ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x - 1;
+                        ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y - 1;
+
+                        ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
+
+                        ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
+                        ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y - 1;
+
+                        ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
+
+                        if(ps_sps->i2_pic_ht_in_ctb - 1 == ps_proc->i4_ctb_y)
+                        {
+                            WORD32 i4_ctb_x;
+                            ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
+                            for(i4_ctb_x = 0; i4_ctb_x < ps_sps->i2_pic_wd_in_ctb; i4_ctb_x++)
+                            {
+                                ps_proc->s_sao_ctxt.i4_ctb_x = i4_ctb_x;
+                                ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
+                            }
+                        }
+                    }
+#endif
+                }
+
+
+                /* Call padding if required */
+                {
+#if SAO_PROCESS_SHIFT_CTB
+
+                    if(0 == ps_proc->i4_ctb_x)
+                    {
+                        WORD32 pad_ht_luma;
+                        WORD32 pad_ht_chroma;
+
+                        ps_proc->pu1_cur_ctb_luma = ps_proc->pu1_cur_pic_luma
+                                        + (ps_proc->i4_ctb_x * ctb_size
+                                        + ps_proc->i4_ctb_y * ctb_size
+                                        * ps_codec->i4_strd);
+                        ps_proc->pu1_cur_ctb_chroma = ps_proc->pu1_cur_pic_chroma
+                                        + ps_proc->i4_ctb_x * ctb_size
+                                        + (ps_proc->i4_ctb_y * ctb_size * ps_codec->i4_strd / 2);
+
+                        pad_ht_luma = ctb_size;
+                        pad_ht_luma += (ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y ? 8 : 0;
+                        pad_ht_chroma = ctb_size / 2;
+                        /* Pad left after 1st CTB is processed */
+                        ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(ps_proc->pu1_cur_ctb_luma - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_LEFT);
+                        ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_ctb_chroma - 16 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_LEFT);
+                    }
+
+                    if((ps_sps->i2_pic_wd_in_ctb - 1) == ps_proc->i4_ctb_x)
+                    {
+                        WORD32 pad_ht_luma;
+                        WORD32 pad_ht_chroma;
+                        WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size);
+
+                        ps_proc->pu1_cur_ctb_luma = ps_proc->pu1_cur_pic_luma
+                                        + (ps_proc->i4_ctb_x * ctb_size
+                                        + ps_proc->i4_ctb_y * ctb_size
+                                        * ps_codec->i4_strd);
+                        ps_proc->pu1_cur_ctb_chroma = ps_proc->pu1_cur_pic_chroma
+                                        + ps_proc->i4_ctb_x * ctb_size
+                                        + (ps_proc->i4_ctb_y * ctb_size * ps_codec->i4_strd / 2);
+
+                        pad_ht_luma = ctb_size;
+                        pad_ht_chroma = ctb_size / 2;
+                        if((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y)
+                        {
+                            pad_ht_luma += 8;
+                            pad_ht_chroma += 16;
+                            ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_pic_chroma + (ps_sps->i2_pic_height_in_luma_samples / 2 - 16) * ps_codec->i4_strd,
+                                                                                 ps_codec->i4_strd, 16, PAD_LEFT);
+                        }
+                        /* Pad right after last CTB in the current row is processed */
+                        ps_codec->s_func_selector.ihevc_pad_right_luma_fptr(ps_proc->pu1_cur_ctb_luma + cols_remaining - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_RIGHT);
+                        ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr(ps_proc->pu1_cur_ctb_chroma + cols_remaining - 16 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_RIGHT);
+
+                        if((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y)
+                        {
+                            UWORD8 *pu1_buf;
+                            /* Since SAO is shifted by 8x8, chroma padding cannot be done till the second row is processed */
+                            /* Hence top padding is moved to the end of the frame; moving it to the second row also causes problems when there is only one row */
+                            /* Pad top after padding left and right for the current rows */
+                            ihevc_pad_top(ps_proc->pu1_cur_pic_luma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP);
+                            ihevc_pad_top(ps_proc->pu1_cur_pic_chroma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP / 2);
+
+                            pu1_buf = ps_proc->pu1_cur_pic_luma + ps_codec->i4_strd * ps_sps->i2_pic_height_in_luma_samples - PAD_LEFT;
+                            /* Pad bottom after padding left and right for the current rows */
+                            ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT);
+
+                            pu1_buf = ps_proc->pu1_cur_pic_chroma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples / 2) - PAD_LEFT;
+                            ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT / 2);
+                        }
+                    }
+#else
+                    if(ps_proc->i4_ctb_y > 1)
+                    {
+                        if(0 == ps_proc->i4_ctb_x)
+                        {
+                            WORD32 pad_ht_luma;
+                            WORD32 pad_ht_chroma;
+
+                            pad_ht_luma = ctb_size;
+                            pad_ht_chroma = ctb_size / 2;
+                            /* Pad left after 1st CTB is processed */
+                            ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(ps_proc->pu1_cur_ctb_luma - 2 * ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_LEFT);
+                            ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_ctb_chroma - ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_LEFT);
+                        }
+                        else if((ps_sps->i2_pic_wd_in_ctb - 1) == ps_proc->i4_ctb_x)
+                        {
+                            WORD32 pad_ht_luma;
+                            WORD32 pad_ht_chroma;
+                            WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size);
+
+                            pad_ht_luma = ((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y) ? 3 * ctb_size : ctb_size;
+                            pad_ht_chroma = ((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y) ? 3 * ctb_size / 2 : ctb_size / 2;
+                            /* Pad right after last CTB in the current row is processed */
+                            ps_codec->s_func_selector.ihevc_pad_right_luma_fptr(ps_proc->pu1_cur_ctb_luma + cols_remaining - 2 * ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_RIGHT);
+                            ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr(ps_proc->pu1_cur_ctb_chroma + cols_remaining - ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_RIGHT);
+
+                            if((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y)
+                            {
+                                UWORD8 *pu1_buf;
+                                WORD32 pad_ht_luma;
+                                WORD32 pad_ht_chroma;
+
+                                pad_ht_luma = 2 * ctb_size;
+                                pad_ht_chroma = ctb_size;
+
+                                ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(ps_proc->pu1_cur_pic_luma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples - 2 * ctb_size),
+                                                                                   ps_codec->i4_strd, pad_ht_luma, PAD_LEFT);
+                                ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_pic_chroma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples / 2 - ctb_size),
+                                                                                     ps_codec->i4_strd, pad_ht_chroma, PAD_LEFT);
+
+                                /* Since SAO is shifted by 8x8, chroma padding cannot be done till the second row is processed */
+                                /* Hence top padding is moved to the end of the frame; moving it to the second row also causes problems when there is only one row */
+                                /* Pad top after padding left and right for the current rows */
+                                ihevc_pad_top(ps_proc->pu1_cur_pic_luma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP);
+                                ihevc_pad_top(ps_proc->pu1_cur_pic_chroma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP / 2);
+
+                                pu1_buf = ps_proc->pu1_cur_pic_luma + ps_codec->i4_strd * ps_sps->i2_pic_height_in_luma_samples - PAD_LEFT;
+                                /* Pad bottom after padding left and right for the current rows */
+                                ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT);
+
+                                pu1_buf = ps_proc->pu1_cur_pic_chroma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples / 2) - PAD_LEFT;
+                                ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT / 2);
+                            }
+                        }
+                    }
+#endif
+                }
+            }
+
+
+            /* Update proc map for SAO*/
+            ihevcd_proc_map_update(ps_proc, proc_type, nctb);
+            /* Update proc map for Completion of CTB*/
+            ihevcd_proc_map_update(ps_proc, PROC_ALL, nctb);
+            {
+                tile_t *ps_tile;
+
+                ps_tile = ps_proc->ps_tile;
+                num_ctb_tmp -= nctb;
+
+                ps_proc->i4_ctb_tile_x += nctb;
+                ps_proc->i4_ctb_x += nctb;
+
+                ps_proc->i4_ctb_slice_x += nctb;
+
+
+                /* Update tile counters */
+                if(ps_proc->i4_ctb_tile_x >= (ps_tile->u2_wd))
+                {
+                    /*End of tile row*/
+                    ps_proc->i4_ctb_tile_x = 0;
+                    ps_proc->i4_ctb_x = ps_tile->u1_pos_x;
+
+                    ps_proc->i4_ctb_tile_y++;
+                    ps_proc->i4_ctb_y++;
+                    if(ps_proc->i4_ctb_tile_y == ps_tile->u2_ht)
+                    {
+                        /* Reached End of Tile */
+                        ps_proc->i4_ctb_tile_y = 0;
+                        ps_proc->i4_ctb_tile_x = 0;
+                        ps_proc->ps_tile++;
+                        //End of picture
+                        if(!((ps_tile->u2_ht + ps_tile->u1_pos_y  ==  ps_sps->i2_pic_ht_in_ctb) && (ps_tile->u2_wd + ps_tile->u1_pos_x  ==  ps_sps->i2_pic_wd_in_ctb)))
+                        {
+                            ps_tile = ps_proc->ps_tile;
+                            ps_proc->i4_ctb_x = ps_tile->u1_pos_x;
+                            ps_proc->i4_ctb_y = ps_tile->u1_pos_y;
+
+                        }
+                    }
+                }
+            }
+        }
+
+        ps_proc->i4_ctb_cnt -= num_ctb;
+    }
+    return ret;
+}
+
+void ihevcd_init_proc_ctxt(process_ctxt_t *ps_proc, WORD32 tu_coeff_data_ofst)
+{
+    codec_t *ps_codec;
+    slice_header_t *ps_slice_hdr;
+    pps_t *ps_pps;
+    sps_t *ps_sps;
+    tile_t *ps_tile, *ps_tile_prev;
+    WORD32 tile_idx;
+    WORD32 ctb_size;
+    WORD32 num_minpu_in_ctb;
+    WORD32 num_ctb_in_row;
+    WORD32 ctb_addr;
+    WORD32 i4_wd_in_ctb;
+    WORD32 tile_start_ctb_idx;
+    WORD32 slice_start_ctb_idx;
+    WORD32 check_tile_wd;
+    WORD32 continuous_tiles = 0; //Refers to tiles that are continuous, within a slice, horizontally
+
+    ps_codec = ps_proc->ps_codec;
+
+#ifdef GPU_BUILD
+    //TODO GPU : Later define it for ARM only version as well
+    ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx) & (MAX_SLICE_HDR_CNT - 1));
+#else
+    ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx) & (MAX_SLICE_HDR_CNT - 1));
+#endif
+    ps_proc->ps_slice_hdr = ps_slice_hdr;
+    ps_proc->ps_pps = ps_codec->ps_pps_base + ps_slice_hdr->i1_pps_id;
+    ps_pps = ps_proc->ps_pps;
+    ps_proc->ps_sps = ps_codec->ps_sps_base + ps_pps->i1_sps_id;
+    ps_sps = ps_proc->ps_sps;
+    ps_proc->i4_init_done = 1;
+    ctb_size = 1 << ps_sps->i1_log2_ctb_size;
+    num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+    num_ctb_in_row = ps_sps->i2_pic_wd_in_ctb;
+
+    ps_proc->s_sao_ctxt.pu1_slice_idx = ps_proc->pu1_slice_idx;
+
+    ihevcd_get_tile_pos(ps_pps, ps_sps, ps_proc->i4_ctb_x, ps_proc->i4_ctb_y,
+                        &ps_proc->i4_ctb_tile_x, &ps_proc->i4_ctb_tile_y,
+                        &tile_idx);
+
+    ps_proc->ps_tile = ps_pps->ps_tile + tile_idx;
+    ps_proc->i4_cur_tile_idx = tile_idx;
+    ps_tile = ps_proc->ps_tile;
+
+    if(ps_pps->i1_tiles_enabled_flag)
+    {
+        if(tile_idx)
+            ps_tile_prev = ps_tile - 1;
+        else
+            ps_tile_prev = ps_tile;
+
+        slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x + (ps_slice_hdr->i2_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+        tile_start_ctb_idx = ps_tile->u1_pos_x + (ps_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+
+        /* Check if
+         * 1. The last tile that ends at the frame boundary and the 1st tile in the next row belong to the same slice
+         *    1.1. If they do, check whether the slice that holds these tiles spans across the frame row.
+         * 2. Vertical tiles are present within a slice */
+        if(((ps_slice_hdr->i2_ctb_x == ps_tile->u1_pos_x) && (ps_slice_hdr->i2_ctb_y != ps_tile->u1_pos_y)))
+        {
+            continuous_tiles = 1;
+        }
+        else
+        {
+            check_tile_wd = ps_slice_hdr->i2_ctb_x + ps_tile_prev->u2_wd;
+            if(!(((check_tile_wd >= ps_sps->i2_pic_wd_in_ctb) && (check_tile_wd % ps_sps->i2_pic_wd_in_ctb == ps_tile->u1_pos_x))
+                                            || ((ps_slice_hdr->i2_ctb_x == ps_tile->u1_pos_x))))
+            {
+                continuous_tiles = 1;
+            }
+        }
+
+        {
+            WORD32 i2_independent_ctb_x = ps_slice_hdr->i2_independent_ctb_x;
+            WORD32 i2_independent_ctb_y = ps_slice_hdr->i2_independent_ctb_y;
+
+            /* Handles cases where
+             * 1. Slices begin at the start of each tile
+             * 2. Tiles lie in the same slice row, i.e., starting tile_x > slice_x, but tile_y == slice_y
+             */
+            if(ps_proc->i4_ctb_x >= i2_independent_ctb_x)
+            {
+                ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_x - i2_independent_ctb_x;
+            }
+            else
+            {
+                /* Indicates a multiple-tiles-in-a-slice case, where
+                 * the new tile belongs to an older slice that started in one of the previous rows, not the present row,
+                 * i.e. (tile_y > slice_y and tile_x < slice_x)
+                 */
+                if((slice_start_ctb_idx < tile_start_ctb_idx) && (continuous_tiles))
+                {
+                    i4_wd_in_ctb = ps_sps->i2_pic_wd_in_ctb;
+                }
+                /* Indicates many-tiles-in-one-slice case, for slices that end without spanning the frame width*/
+                else
+                {
+                    i4_wd_in_ctb = ps_tile->u2_wd;
+                }
+
+                if(continuous_tiles)
+                {
+                    ps_proc->i4_ctb_slice_x = i4_wd_in_ctb
+                                    - (i2_independent_ctb_x - ps_proc->i4_ctb_x);
+                }
+                else
+                {
+                    ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_x - ps_tile->u1_pos_x;
+                }
+            }
+            /* Initialize ctb slice y to zero; at the start of a slice row, initialize it
+             * to the difference between ctb_y and the slice's start ctb y */
+
+            ps_proc->i4_ctb_slice_y = ps_proc->i4_ctb_y - i2_independent_ctb_y;
+
+            /*If beginning of tile, check if slice counters are set correctly*/
+            if((0 == ps_proc->i4_ctb_tile_x) && (0 == ps_proc->i4_ctb_tile_y))
+            {
+                if(ps_slice_hdr->i1_dependent_slice_flag)
+                {
+                    ps_proc->i4_ctb_slice_x = 0;
+                    ps_proc->i4_ctb_slice_y = 0;
+                }
+                /*For slices that span across multiple tiles*/
+                else if(slice_start_ctb_idx < tile_start_ctb_idx)
+                {
+                    ps_proc->i4_ctb_slice_y = ps_tile->u1_pos_y - i2_independent_ctb_y;
+                    /* Two cases:
+                     * 1 - Slice spans the frame width but does not start from the 1st column
+                     * 2 - Slice spans multiple tiles anywhere in a frame
+                     */
+                    /* TODO: In a multiple-slice clip, an independent slice that spans more than 2 tiles in a row is not supported */
+                    if(continuous_tiles) //Case 2: implemented for slices that span at most 2 tiles
+                    {
+                        if(i2_independent_ctb_y <= ps_tile->u1_pos_y)
+                        {
+                            //Check if ctb x is before or after
+                            if(i2_independent_ctb_x > ps_tile->u1_pos_x)
+                            {
+                                ps_proc->i4_ctb_slice_y -= 1;
+                            }
+                        }
+                    }
+                }
+            }
+            //Slice starts from a column which is not the starting tile-column, but is within the tile
+            if(((i2_independent_ctb_x - ps_tile->u1_pos_x) != 0) && ((ps_proc->i4_ctb_slice_y != 0))
+                            && ((i2_independent_ctb_x >= ps_tile->u1_pos_x) && (i2_independent_ctb_x < ps_tile->u1_pos_x + ps_tile->u2_wd)))
+            {
+                ps_proc->i4_ctb_slice_y -= 1;
+            }
+        }
+    }
+    else
+    {
+        WORD32 i2_independent_ctb_x = ps_slice_hdr->i2_independent_ctb_x;
+        WORD32 i2_independent_ctb_y = ps_slice_hdr->i2_independent_ctb_y;
+
+
+        {
+            ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_x - i2_independent_ctb_x;
+            ps_proc->i4_ctb_slice_y = ps_proc->i4_ctb_y - i2_independent_ctb_y;
+            if(ps_proc->i4_ctb_slice_x < 0)
+            {
+                ps_proc->i4_ctb_slice_x += ps_sps->i2_pic_wd_in_ctb;
+                ps_proc->i4_ctb_slice_y -= 1;
+            }
+
+        }
+    }
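+    /* Worked example for the non-tile path above (assumed numbers): with a
+     * picture 10 CTBs wide and a slice starting at CTB (7, 2), CTB (3, 4)
+     * yields ctb_slice_x = 3 - 7 = -4 -> -4 + 10 = 6 and
+     * ctb_slice_y = (4 - 2) - 1 = 1, i.e. the 1 * 10 + 6 = 16th CTB of the
+     * slice in raster order. */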
+
+    /* Compute TU offset for the current CTB set */
+    {
+
+        WORD32 ctb_luma_min_tu_cnt;
+        WORD32 ctb_addr;
+
+        ctb_addr = ps_proc->i4_ctb_y * num_ctb_in_row + ps_proc->i4_ctb_x;
+
+        ctb_luma_min_tu_cnt = (1 << ps_sps->i1_log2_ctb_size) / MIN_TU_SIZE;
+        ctb_luma_min_tu_cnt *= ctb_luma_min_tu_cnt;
+
+        ps_proc->pu1_tu_map = ps_proc->pu1_pic_tu_map
+                        + ctb_luma_min_tu_cnt * ctb_addr;
+        if(1 == ps_codec->i4_num_cores)
+        {
+            ps_proc->ps_tu = ps_proc->ps_pic_tu + ps_proc->pu4_pic_tu_idx[ctb_addr % RESET_TU_BUF_NCTB];
+        }
+        else
+        {
+            ps_proc->ps_tu = ps_proc->ps_pic_tu + ps_proc->pu4_pic_tu_idx[ctb_addr];
+        }
+        ps_proc->pv_tu_coeff_data = (UWORD8 *)ps_proc->pv_pic_tu_coeff_data
+                        + tu_coeff_data_ofst;
+
+    }
+
+    /* Compute PU related elements for the current CTB set */
+    {
+        WORD32 pu_idx;
+        ctb_addr = ps_proc->i4_ctb_y * num_ctb_in_row + ps_proc->i4_ctb_x;
+        pu_idx = ps_proc->pu4_pic_pu_idx[ctb_addr];
+        ps_proc->pu1_pu_map = ps_proc->pu1_pic_pu_map
+                        + ctb_addr * num_minpu_in_ctb;
+        ps_proc->ps_pu = ps_proc->ps_pic_pu + pu_idx;
+    }
+
+    /* Number of ctbs processed in one loop of process function */
+    {
+        ps_proc->i4_nctb = MIN(ps_codec->u4_nctb, ps_tile->u2_wd);
+    }
+
+}
+
+void ihevcd_process_thread(process_ctxt_t *ps_proc)
+{
+#ifdef GPU_BUILD
+    codec_t *ps_codec = ps_proc->ps_codec;
+#endif
+    {
+        ithread_set_affinity(ps_proc->i4_id + 1);
+    }
+    while(1)
+    {
+        IHEVCD_ERROR_T ret;
+        proc_job_t s_job;
+
+        ret = ihevcd_jobq_dequeue((jobq_t *)ps_proc->pv_proc_jobq, &s_job,
+                                  sizeof(proc_job_t), 1);
+        if((IHEVCD_ERROR_T)IHEVCD_SUCCESS != ret)
+            break;
+
+        ps_proc->i4_ctb_cnt = s_job.i2_ctb_cnt;
+        ps_proc->i4_ctb_x = s_job.i2_ctb_x;
+        ps_proc->i4_ctb_y = s_job.i2_ctb_y;
+        ps_proc->i4_cur_slice_idx = s_job.i2_slice_idx;
+
+
+
+        if(CMD_PROCESS == s_job.i4_cmd)
+        {
+            ihevcd_init_proc_ctxt(ps_proc, s_job.i4_tu_coeff_data_ofst);
+#ifdef GPU_BUILD
+            if(1) /* was: if(g_enable_gpu == 1) */
+            {
+
+                if(s_job.i2_wait)
+                {
+                    //long long start_time, stop_time;
+                    //start_time = itGetUs();
+                    //printf("Before MC wait\n");
+                    ihevcd_gpu_mc_wait(ps_proc, s_job.i2_granularity_idx);
+                    //printf("After MC wait\n");
+                    //stop_time = itGetUs();
+                    //printf("CL Wait time time = %lld us\n", (stop_time - start_time));
+                }
+
+            }
+#endif
+            ihevcd_process(ps_proc);
+        }
+        else if(CMD_FMTCONV == s_job.i4_cmd)
+        {
+            sps_t *ps_sps;
+            codec_t *ps_codec;
+            ivd_out_bufdesc_t *ps_out_buffer;
+            WORD32 num_rows;
+
+            if(0 == ps_proc->i4_init_done)
+            {
+                ihevcd_init_proc_ctxt(ps_proc, 0);
+            }
+            ps_sps = ps_proc->ps_sps;
+            ps_codec = ps_proc->ps_codec;
+            ps_out_buffer = ps_proc->ps_out_buffer;
+            num_rows = 1 << ps_sps->i1_log2_ctb_size;
+
+            num_rows = MIN(num_rows, (ps_codec->i4_disp_ht - (s_job.i2_ctb_y << ps_sps->i1_log2_ctb_size)));
+
+            if(num_rows < 0)
+                num_rows = 0;
+
+            ihevcd_fmt_conv(ps_proc->ps_codec, ps_proc, ps_out_buffer->pu1_bufs[0], ps_out_buffer->pu1_bufs[1], ps_out_buffer->pu1_bufs[2],
+                            s_job.i2_ctb_y << ps_sps->i1_log2_ctb_size, num_rows);
+        }
+    }
+    //ithread_exit(0);
+    return;
+}
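+
+/* Illustrative producer-side sketch (a sketch under assumptions, not decoder
+ * code: the enqueue counterpart of ihevcd_jobq_dequeue() and the field values
+ * shown are assumed): the parse thread fills a proc_job_t and queues it, and
+ * this worker picks it up above.
+ *
+ *     proc_job_t s_job = {0};
+ *     s_job.i4_cmd = CMD_PROCESS;
+ *     s_job.i2_ctb_x = 0;
+ *     s_job.i2_ctb_y = ctb_row;                    // assumed row index
+ *     s_job.i2_ctb_cnt = ps_sps->i2_pic_wd_in_ctb;
+ *     s_job.i2_slice_idx = slice_idx;              // assumed
+ *     s_job.i4_tu_coeff_data_ofst = tu_coeff_ofst; // assumed
+ *     ihevcd_jobq_queue(pv_proc_jobq, &s_job, sizeof(s_job), 1); // assumed API
+ */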
+
diff --git a/decoder/ihevcd_process_slice.h b/decoder/ihevcd_process_slice.h
new file mode 100644
index 0000000..367a243
--- /dev/null
+++ b/decoder/ihevcd_process_slice.h
@@ -0,0 +1,44 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_process_slice.h
+*
+* @brief
+*  Processing of slice level data
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PROCESS_SLICE_H_
+#define _IHEVCD_PROCESS_SLICE_H_
+
+IHEVCD_ERROR_T ihevcd_process(process_ctxt_t *ps_proc);
+void ihevcd_init_proc_ctxt(process_ctxt_t *ps_proc, WORD32 tu_coeff_data_ofst);
+void ihevcd_process_thread(process_ctxt_t *ps_proc);
+
+#endif /* _IHEVCD_PROCESS_SLICE_H_ */
diff --git a/decoder/ihevcd_profile.h b/decoder/ihevcd_profile.h
new file mode 100644
index 0000000..2e95e5c
--- /dev/null
+++ b/decoder/ihevcd_profile.h
@@ -0,0 +1,116 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_profile.h
+*
+* @brief
+*  Contains macros for profiling individual modules of decoder
+*
+* @author
+*  Naveen SR
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PROFILE_H_
+#define _IHEVCD_PROFILE_H_
+
+#include "ihevc_defs.h"
+/* Define return; to disable individual module */
+#ifdef PROFILE_DIS_SAO_LEAF_LEVEL
+#define PROFILE_DISABLE_SAO_LEAF_LEVEL() return;
+#else
+#define PROFILE_DISABLE_SAO_LEAF_LEVEL() ;
+#endif
+
+#ifdef PROFILE_DIS_SAO
+#define PROFILE_DISABLE_SAO() return;
+#else
+#define PROFILE_DISABLE_SAO() ;
+#endif
+
+#ifdef PROFILE_DIS_DEBLK
+#define PROFILE_DISABLE_DEBLK() return;
+#else
+#define PROFILE_DISABLE_DEBLK() ;
+#endif
+
+#ifdef PROFILE_DIS_IQ_IT_RECON_INTRA_PRED
+#define PROFILE_DISABLE_IQ_IT_RECON_INTRA_PRED() return;
+#else
+#define PROFILE_DISABLE_IQ_IT_RECON_INTRA_PRED() ;
+#endif
+
+#ifdef PROFILE_DIS_INTER_PRED
+#define PROFILE_DISABLE_INTER_PRED() return;
+#else
+#define PROFILE_DISABLE_INTER_PRED() ;
+#endif
+
+#ifdef PROFILE_DIS_PROCESS_CTB
+#define PROFILE_DISABLE_PROCESS_CTB() return;
+/* When processing is disabled, there is no point in format conversion either */
+#define PROFILE_DISABLE_FMT_CONV() return 0;
+#define PROFILE_DIS_PROCESS_CTB_SET_NOOUTPUT() ps_dec_op->u4_output_present = 0;
+#else
+#define PROFILE_DISABLE_PROCESS_CTB() ;
+#define PROFILE_DIS_PROCESS_CTB_SET_NOOUTPUT() ;
+#define PROFILE_DISABLE_FMT_CONV() ;
+#endif
+
+#ifdef PROFILE_DIS_BOUNDARY_STRENGTH
+#define PROFILE_DISABLE_BOUNDARY_STRENGTH() return;
+#else
+#define PROFILE_DISABLE_BOUNDARY_STRENGTH() ;
+#endif
+
+#ifdef PROFILE_DIS_MV_PREDICTION
+#define PROFILE_DISABLE_MV_PREDICTION() return;
+#else
+#define PROFILE_DISABLE_MV_PREDICTION() ;
+#endif
+
+//#define PROFILE_DISABLE_INTER_PRED_LUMA(clr_indx) {if(clr_indx == 0) continue;}
+//#define PROFILE_DISABLE_INTER_PRED_CHROMA(clr_indx) {if(clr_indx == 1) continue;}
+//#define PROFILE_DISABLE_INTER_PRED_LUMA_AVERAGING(clr_indx) {if(clr_indx == 0) continue;}
+//#define PROFILE_DISABLE_INTER_PRED_CHROMA_AVERAGING(clr_indx) {if(clr_indx == 1) continue;}
+
+#define PROFILE_DISABLE_INTER_PRED_LUMA(clr_indx) ;
+#define PROFILE_DISABLE_INTER_PRED_CHROMA(clr_indx) ;
+#define PROFILE_DISABLE_INTER_PRED_LUMA_AVERAGING(clr_indx) ;
+#define PROFILE_DISABLE_INTER_PRED_CHROMA_AVERAGING(clr_indx) ;
+
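+/* Usage sketch (illustrative; ihevcd_some_module is a hypothetical name):
+ * placing one of the macros above as the first statement of a module's entry
+ * point turns that module into an immediate return when the matching
+ * PROFILE_DIS_* symbol is defined, so its cost drops out of profiling runs:
+ *
+ *     void ihevcd_some_module(void)
+ *     {
+ *         PROFILE_DISABLE_SAO();  // "return;" under PROFILE_DIS_SAO, ";" otherwise
+ *         // ... module body ...
+ *     }
+ */
+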
+#endif /* _IHEVCD_PROFILE_H_ */
diff --git a/decoder/ihevcd_ref_list.c b/decoder/ihevcd_ref_list.c
new file mode 100644
index 0000000..e04a756
--- /dev/null
+++ b/decoder/ihevcd_ref_list.c
@@ -0,0 +1,558 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_ref_list.c
+*
+* @brief
+*  Contains function definitions for reference list generation
+*
+* @author
+*  Srinivas T
+*
+* @par List of Functions:
+* - ihevcd_mv_mgr_get_poc
+* - ihevcd_ref_list
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_structs.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_error.h"
+
+
+mv_buf_t* ihevcd_mv_mgr_get_poc(buf_mgr_t *ps_mv_buf_mgr, UWORD32 abs_poc)
+{
+    UWORD32 i;
+    mv_buf_t *ps_mv_buf = NULL;
+
+
+
+    for(i = 0; i < ps_mv_buf_mgr->u4_max_buf_cnt; i++)
+    {
+        ps_mv_buf = (mv_buf_t *)ps_mv_buf_mgr->apv_ptr[i];
+        if(ps_mv_buf && (ps_mv_buf->i4_abs_poc == (WORD32)abs_poc))
+        {
+            break;
+        }
+    }
+
+    return ps_mv_buf;
+}
+
+
+WORD32 ihevcd_ref_list(codec_t *ps_codec, pps_t *ps_pps, sps_t *ps_sps, slice_header_t *ps_slice_hdr)
+{
+    WORD32 i;
+    WORD32 st_rps_idx;
+    WORD32 num_neg_pics, num_pos_pics;
+    WORD8 *pi1_used;
+    WORD16 *pi2_delta_poc;
+    UWORD32 u4_max_poc_lsb;
+    pic_buf_t *ps_pic_buf;
+    mv_buf_t *ps_mv_buf;
+    UWORD32 r_idx;
+
+    dpb_mgr_t *ps_dpb_mgr = (dpb_mgr_t *)ps_codec->pv_dpb_mgr;
+    buf_mgr_t *ps_mv_buf_mgr = (buf_mgr_t *)ps_codec->pv_mv_buf_mgr;
+
+    WORD32 ai4_poc_st_curr_before[MAX_DPB_SIZE], ai4_poc_st_foll[MAX_DPB_SIZE], ai4_poc_st_curr_after[MAX_DPB_SIZE];
+    WORD32 ai4_poc_lt_curr[MAX_DPB_SIZE], ai4_poc_lt_foll[MAX_DPB_SIZE];
+    UWORD32 u4_num_st_curr_before, u4_num_st_foll, u4_num_st_curr_after, u4_num_lt_curr, u4_num_lt_foll;
+    UWORD32 u4_num_total_curr;
+
+    WORD8 ai1_curr_delta_poc_msb_present_flag[MAX_DPB_SIZE], ai1_foll_delta_poc_msb_present_flag[MAX_DPB_SIZE];
+
+    pic_buf_t *as_ref_pic_lt_curr[MAX_DPB_SIZE];
+    pic_buf_t *as_ref_pic_lt_foll[MAX_DPB_SIZE];
+    pic_buf_t *as_ref_pic_st_curr_after[MAX_DPB_SIZE];
+    pic_buf_t *as_ref_pic_st_curr_before[MAX_DPB_SIZE];
+    pic_buf_t *as_ref_pic_st_foll[MAX_DPB_SIZE];
+
+    pic_buf_t *as_ref_pic_list_temp0[MAX_DPB_SIZE], *as_ref_pic_list_temp1[MAX_DPB_SIZE];
+
+    UWORD32 u4_num_rps_curr_temp_list0, u4_num_rps_curr_temp_list1;
+
+    WORD32 i4_pic_order_cnt_val;
+    WORD32 i4_poc_lt;
+    UNUSED(as_ref_pic_lt_foll);
+    UNUSED(as_ref_pic_st_foll);
+    UNUSED(ps_pps);
+
+    RETURN_IF_NAL_INFO;
+
+    u4_max_poc_lsb = (1 << ps_sps->i1_log2_max_pic_order_cnt_lsb);
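+    /* e.g. i1_log2_max_pic_order_cnt_lsb = 8 gives u4_max_poc_lsb = 256, so
+     * signalled POC LSBs wrap modulo 256 and the delta_poc_msb_cycle terms
+     * below are scaled by 256. */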
+
+    i4_pic_order_cnt_val = ps_slice_hdr->i4_abs_pic_order_cnt;
+
+    if(1 == ps_slice_hdr->i1_short_term_ref_pic_set_sps_flag)
+    {
+        st_rps_idx = ps_slice_hdr->i1_short_term_ref_pic_set_idx;
+        num_neg_pics = ps_sps->as_stref_picset[st_rps_idx].i1_num_neg_pics;
+        num_pos_pics = ps_sps->as_stref_picset[st_rps_idx].i1_num_pos_pics;
+        pi1_used = ps_sps->as_stref_picset[st_rps_idx].ai1_used;
+        pi2_delta_poc = ps_sps->as_stref_picset[st_rps_idx].ai2_delta_poc;
+    }
+    else
+    {
+        st_rps_idx = ps_sps->i1_num_short_term_ref_pic_sets;
+        num_neg_pics = ps_slice_hdr->s_stref_picset.i1_num_neg_pics;
+        num_pos_pics = ps_slice_hdr->s_stref_picset.i1_num_pos_pics;
+        pi1_used = ps_slice_hdr->s_stref_picset.ai1_used;
+        pi2_delta_poc = ps_slice_hdr->s_stref_picset.ai2_delta_poc;
+    }
+
+    u4_num_st_curr_before = 0;
+    u4_num_st_foll = 0;
+    for(i = 0; i < num_neg_pics; i++)
+    {
+        if(pi1_used[i])
+        {
+            ai4_poc_st_curr_before[u4_num_st_curr_before] = i4_pic_order_cnt_val + pi2_delta_poc[i];
+            u4_num_st_curr_before++;
+        }
+        else
+        {
+            ai4_poc_st_foll[u4_num_st_foll] = i4_pic_order_cnt_val + pi2_delta_poc[i];
+            u4_num_st_foll++;
+        }
+    }
+    u4_num_st_curr_after = 0;
+    for(i = num_neg_pics; i < num_neg_pics + num_pos_pics; i++)
+    {
+        if(pi1_used[i])
+        {
+            ai4_poc_st_curr_after[u4_num_st_curr_after] = i4_pic_order_cnt_val + pi2_delta_poc[i];
+            u4_num_st_curr_after++;
+        }
+        else
+        {
+            ai4_poc_st_foll[u4_num_st_foll] = i4_pic_order_cnt_val + pi2_delta_poc[i];
+            u4_num_st_foll++;
+        }
+    }
+
+    u4_num_lt_curr = 0;
+    u4_num_lt_foll = 0;
+    for(i = 0; i < ps_slice_hdr->i1_num_long_term_sps + ps_slice_hdr->i1_num_long_term_pics; i++)
+    {
+        i4_poc_lt = ps_slice_hdr->ai4_poc_lsb_lt[i];
+        if(ps_slice_hdr->ai1_delta_poc_msb_present_flag[i])
+        {
+            i4_poc_lt += i4_pic_order_cnt_val - ps_slice_hdr->ai1_delta_poc_msb_cycle_lt[i] * u4_max_poc_lsb - ps_slice_hdr->i4_pic_order_cnt_lsb;
+        }
+
+        if(ps_slice_hdr->ai1_used_by_curr_pic_lt_flag[i])
+        {
+            ai4_poc_lt_curr[u4_num_lt_curr] = i4_poc_lt;
+            ai1_curr_delta_poc_msb_present_flag[u4_num_lt_curr] = ps_slice_hdr->ai1_delta_poc_msb_present_flag[i];
+            u4_num_lt_curr++;
+        }
+        else
+        {
+            ai4_poc_lt_foll[u4_num_lt_foll] = i4_poc_lt;
+            ai1_foll_delta_poc_msb_present_flag[u4_num_lt_foll] = ps_slice_hdr->ai1_delta_poc_msb_present_flag[i];
+            u4_num_lt_foll++;
+        }
+    }
+
+    u4_num_total_curr = u4_num_lt_curr + u4_num_st_curr_after + u4_num_st_curr_before;
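+    /* Worked example (illustrative): for a current POC of 8 and a short-term
+     * RPS with delta POCs {-1, -3} (both used) and {+1} (used), the loops
+     * above give ai4_poc_st_curr_before = {7, 5}, ai4_poc_st_curr_after = {9};
+     * with no long-term pictures signalled, u4_num_total_curr = 3. */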
+
+    /* Bit stream conformance tests */
+/*
+    for(i = 0; i < u4_num_lt_curr; i++)
+    {
+        int j;
+        if(ai1_curr_delta_poc_msb_present_flag[i])
+        {
+            for(j = 0; j < u4_num_st_curr_before; j++)
+            {
+                ASSERT(ai4_poc_st_curr_before[j] != ai4_poc_lt_curr[i]);
+            }
+            for(j = 0; j < u4_num_st_curr_after; j++)
+            {
+                ASSERT(ai4_poc_st_curr_after[j] != ai4_poc_lt_curr[i]);
+            }
+            for(j = 0; j < u4_num_st_foll; j++)
+            {
+                ASSERT(ai4_poc_st_foll[j] != ai4_poc_lt_curr[i]);
+            }
+            for(j = 0; j < u4_num_lt_curr; j++)
+            {
+                ASSERT((ai4_poc_lt_curr[j] != ai4_poc_lt_curr[i]) || (j == i));
+            }
+        }
+        else
+        {
+            for(j = 0; j < u4_num_st_curr_before; j++)
+            {
+                ASSERT((ai4_poc_st_curr_before[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_curr[i]);
+            }
+            for(j = 0; j < u4_num_st_curr_after; j++)
+            {
+                ASSERT((ai4_poc_st_curr_after[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_curr[i]);
+            }
+            for(j = 0; j < u4_num_st_foll; j++)
+            {
+                ASSERT((ai4_poc_st_foll[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_curr[i]);
+            }
+            for(j = 0; j < u4_num_lt_curr; j++)
+            {
+                ASSERT(((ai4_poc_lt_curr[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_curr[i]) || (j == i));
+            }
+        }
+    }
+
+    for(i = 0; i < u4_num_lt_foll; i++)
+    {
+        int j;
+        if(ai1_foll_delta_poc_msb_present_flag[i])
+        {
+            for(j = 0; j < u4_num_st_curr_before; j++)
+            {
+                ASSERT(ai4_poc_st_curr_before[j] != ai4_poc_lt_foll[i]);
+            }
+            for(j = 0; j < u4_num_st_curr_after; j++)
+            {
+                ASSERT(ai4_poc_st_curr_after[j] != ai4_poc_lt_foll[i]);
+            }
+            for(j = 0; j < u4_num_st_foll; j++)
+            {
+                ASSERT(ai4_poc_st_foll[j] != ai4_poc_lt_foll[i]);
+            }
+            for(j = 0; j < u4_num_lt_curr; j++)
+            {
+                ASSERT(ai4_poc_lt_curr[j] != ai4_poc_lt_foll[i]);
+            }
+            for(j = 0; j < u4_num_lt_foll; j++)
+            {
+                ASSERT((ai4_poc_lt_foll[j] != ai4_poc_lt_foll[i]) || (j == i));
+            }
+        }
+        else
+        {
+            for(j = 0; j < u4_num_st_curr_before; j++)
+            {
+                ASSERT((ai4_poc_st_curr_before[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_foll[i]);
+            }
+            for(j = 0; j < u4_num_st_curr_after; j++)
+            {
+                ASSERT((ai4_poc_st_curr_after[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_foll[i]);
+            }
+            for(j = 0; j < u4_num_st_foll; j++)
+            {
+                ASSERT((ai4_poc_st_foll[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_foll[i]);
+            }
+            for(j = 0; j < u4_num_lt_curr; j++)
+            {
+                ASSERT((ai4_poc_lt_curr[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_foll[i]);
+            }
+            for(j = 0; j < u4_num_lt_foll; j++)
+            {
+                ASSERT(((ai4_poc_lt_foll[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_foll[i]) || (j == i));
+            }
+        }
+    }
+*/
+
+
+    /* Reference Pic sets creation */
+
+    /* Set all the DPB buffers to UNUSED_FOR_REF */
+    if(0 == ps_codec->i4_pic_present)
+    {
+        for(i = 0; i < MAX_DPB_BUFS; i++)
+        {
+            if(ps_dpb_mgr->as_dpb_info[i].ps_pic_buf)
+                ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref = UNUSED_FOR_REF;
+        }
+    }
+
+    for(i = 0; i < (WORD32)u4_num_lt_curr; i++)
+    {
+        if(0 == ai1_curr_delta_poc_msb_present_flag[i])
+        {
+            ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc_lsb(ps_dpb_mgr, ai4_poc_lt_curr[i]);
+            if(NULL != ps_pic_buf)
+                ps_pic_buf->u1_used_as_ref = LONG_TERM_REF;
+
+            as_ref_pic_lt_curr[i] = ps_pic_buf;
+        }
+        else
+        {
+            ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, ai4_poc_lt_curr[i]);
+            if(NULL != ps_pic_buf)
+                ps_pic_buf->u1_used_as_ref = LONG_TERM_REF;
+
+            as_ref_pic_lt_curr[i] = ps_pic_buf;
+        }
+    }
+
+    for(i = 0; i < (WORD32)u4_num_lt_foll; i++)
+    {
+        if(0 == ai1_foll_delta_poc_msb_present_flag[i])
+        {
+            ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc_lsb(ps_dpb_mgr, ai4_poc_lt_foll[i]);
+            if(NULL != ps_pic_buf)
+                ps_pic_buf->u1_used_as_ref = LONG_TERM_REF;
+
+            as_ref_pic_lt_foll[i] = ps_pic_buf;
+        }
+        else
+        {
+            ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, ai4_poc_lt_foll[i]);
+            if(NULL != ps_pic_buf)
+                ps_pic_buf->u1_used_as_ref = LONG_TERM_REF;
+
+            as_ref_pic_lt_foll[i] = ps_pic_buf;
+        }
+    }
+
+
+    for(i = 0; i < (WORD32)u4_num_st_curr_before; i++)
+    {
+
+        ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, ai4_poc_st_curr_before[i]);
+        if(NULL != ps_pic_buf)
+            ps_pic_buf->u1_used_as_ref = SHORT_TERM_REF;
+
+        as_ref_pic_st_curr_before[i] = ps_pic_buf;
+    }
+
+    for(i = 0; i < (WORD32)u4_num_st_curr_after; i++)
+    {
+        ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, ai4_poc_st_curr_after[i]);
+        if(NULL != ps_pic_buf)
+            ps_pic_buf->u1_used_as_ref = SHORT_TERM_REF;
+
+        as_ref_pic_st_curr_after[i] = ps_pic_buf;
+    }
+
+    for(i = 0; i < (WORD32)u4_num_st_foll; i++)
+    {
+        ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, ai4_poc_st_foll[i]);
+        if(NULL != ps_pic_buf)
+            ps_pic_buf->u1_used_as_ref = SHORT_TERM_REF;
+
+        as_ref_pic_st_foll[i] = ps_pic_buf;
+    }
+
+    //TODO: Bit stream conformance tests to be included
+
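+    /* The temporary list is sized to max(u4_num_total_curr, num_ref_idx_l0_active);
+     * when more active references are requested than there are pictures in the
+     * current RPS subsets, the while loop below keeps cycling through
+     * st_curr_before, st_curr_after and lt_curr to fill the list. */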
+    u4_num_rps_curr_temp_list0 = (WORD32)u4_num_total_curr > ps_slice_hdr->i1_num_ref_idx_l0_active ? (WORD32)u4_num_total_curr : ps_slice_hdr->i1_num_ref_idx_l0_active;
+
+    r_idx = 0;
+    if((PSLICE == ps_slice_hdr->i1_slice_type) ||
+       (BSLICE == ps_slice_hdr->i1_slice_type))
+    {
+        while(r_idx < u4_num_rps_curr_temp_list0)
+        {
+            for(i = 0; (i < (WORD32)u4_num_st_curr_before) && (r_idx < u4_num_rps_curr_temp_list0); r_idx++, i++)
+            {
+                if(NULL == as_ref_pic_st_curr_before[i])
+                {
+                    as_ref_pic_st_curr_before[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_st_curr_before[i]);
+                }
+                as_ref_pic_list_temp0[r_idx] = as_ref_pic_st_curr_before[i];
+            }
+
+            for(i = 0; (i < (WORD32)u4_num_st_curr_after) && (r_idx < u4_num_rps_curr_temp_list0); r_idx++, i++)
+            {
+                if(NULL == as_ref_pic_st_curr_after[i])
+                {
+                    as_ref_pic_st_curr_after[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_st_curr_after[i]);
+                }
+                as_ref_pic_list_temp0[r_idx] = as_ref_pic_st_curr_after[i];
+            }
+
+            for(i = 0; (i < (WORD32)u4_num_lt_curr) && (r_idx < u4_num_rps_curr_temp_list0); r_idx++, i++)
+            {
+                if(NULL == as_ref_pic_lt_curr[i])
+                {
+                    as_ref_pic_lt_curr[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_lt_curr[i]);
+                }
+                as_ref_pic_list_temp0[r_idx] = as_ref_pic_lt_curr[i];
+            }
+        }
+
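+        /* Final L0 list: when ref_pic_list_modification is signalled, entries
+         * are picked from the temporary list in the order given by
+         * i1_list_entry_l0; otherwise the first num_ref_idx_l0_active entries
+         * of the temporary list are used as-is. */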
+        for(r_idx = 0; (WORD32)r_idx < ps_slice_hdr->i1_num_ref_idx_l0_active; r_idx++)
+        {
+            pic_buf_t *ps_pic_buf;
+            ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf = ps_slice_hdr->s_rplm.i1_ref_pic_list_modification_flag_l0 ?  (void *)as_ref_pic_list_temp0[ps_slice_hdr->s_rplm.i1_list_entry_l0[r_idx]] :  (void *)as_ref_pic_list_temp0[r_idx];
+            ps_pic_buf = (pic_buf_t *)ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf;
+
+            if(ps_pic_buf == NULL)
+                return IHEVCD_REF_PIC_NOT_FOUND;
+
+            ps_mv_buf = ihevcd_mv_mgr_get_poc(ps_mv_buf_mgr, ps_pic_buf->i4_abs_poc);
+            ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = ps_mv_buf;
+        }
+
+
+        if(ps_slice_hdr->i1_slice_type  == BSLICE)
+        {
+            u4_num_rps_curr_temp_list1 = (WORD32)u4_num_total_curr > ps_slice_hdr->i1_num_ref_idx_l1_active ? (WORD32)u4_num_total_curr : ps_slice_hdr->i1_num_ref_idx_l1_active;
+
+            r_idx = 0;
+            while(r_idx < u4_num_rps_curr_temp_list1)
+            {
+                for(i = 0; (i < (WORD32)u4_num_st_curr_after) && (r_idx < u4_num_rps_curr_temp_list1); r_idx++, i++)
+                {
+                    if(NULL == as_ref_pic_st_curr_after[i])
+                    {
+                        as_ref_pic_st_curr_after[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_st_curr_after[i]);
+                    }
+                    as_ref_pic_list_temp1[r_idx] = as_ref_pic_st_curr_after[i];
+                }
+
+                for(i = 0; (i < (WORD32)u4_num_st_curr_before) && (r_idx < u4_num_rps_curr_temp_list1); r_idx++, i++)
+                {
+                    if(NULL == as_ref_pic_st_curr_before[i])
+                    {
+                        as_ref_pic_st_curr_before[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_st_curr_before[i]);
+                    }
+                    as_ref_pic_list_temp1[r_idx] = as_ref_pic_st_curr_before[i];
+                }
+
+                for(i = 0; (i < (WORD32)u4_num_lt_curr) && (r_idx < u4_num_rps_curr_temp_list1); r_idx++, i++)
+                {
+                    if(NULL == as_ref_pic_lt_curr[i])
+                    {
+                        as_ref_pic_lt_curr[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_lt_curr[i]);
+                    }
+                    as_ref_pic_list_temp1[r_idx] = as_ref_pic_lt_curr[i];
+                }
+            }
+
+            for(r_idx = 0; (WORD32)r_idx < ps_slice_hdr->i1_num_ref_idx_l1_active; r_idx++)
+            {
+                pic_buf_t *ps_pic_buf;
+                ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf = ps_slice_hdr->s_rplm.i1_ref_pic_list_modification_flag_l1 ?  (void *)as_ref_pic_list_temp1[ps_slice_hdr->s_rplm.i1_list_entry_l1[r_idx]] :  (void *)as_ref_pic_list_temp1[r_idx];
+                ps_pic_buf = (pic_buf_t *)ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf;
+
+                if(ps_pic_buf == NULL)
+                    return IHEVCD_REF_PIC_NOT_FOUND;
+
+                ps_mv_buf = ihevcd_mv_mgr_get_poc(ps_mv_buf_mgr, ps_pic_buf->i4_abs_poc);
+                ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = ps_mv_buf;
+            }
+        }
+    }
+
+    DEBUG_PRINT_REF_LIST_POCS(i4_pic_order_cnt_val, ps_slice_hdr, ps_dpb_mgr, u4_num_st_curr_before, u4_num_st_curr_after, u4_num_st_foll, u4_num_lt_curr, u4_num_lt_foll, ai4_poc_st_curr_before, ai4_poc_st_curr_after, ai4_poc_st_foll, ai4_poc_lt_curr, ai4_poc_lt_foll);
+#ifndef GPU_BUILD
+    /* Buffers that are still marked as UNUSED_FOR_REF are released from the dpb (internally the dpb calls release on the pic buf manager) */
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        if((ps_dpb_mgr->as_dpb_info[i].ps_pic_buf) && (UNUSED_FOR_REF == ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref))
+        {
+            pic_buf_t *ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+            mv_buf_t *ps_mv_buf;
+            WORD32 j;
+
+            /* Long term index is set to MAX_DPB_BUFS to ensure it is not added as LT */
+            ihevc_dpb_mgr_del_ref(ps_dpb_mgr, (buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_pic_buf->i4_abs_poc);
+
+            /* Find buffer id of the MV bank corresponding to the buffer being freed (buffer with POC of i4_abs_poc) */
+            /* A separate index j is used so that the outer DPB loop counter i is not clobbered */
+            ps_mv_buf = (mv_buf_t *)ps_codec->ps_mv_buf;
+            for(j = 0; j < BUF_MGR_MAX_CNT; j++)
+            {
+                if(ps_mv_buf->i4_abs_poc == ps_pic_buf->i4_abs_poc)
+                {
+                    ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, j, BUF_MGR_REF);
+                    break;
+                }
+                ps_mv_buf++;
+            }
+        }
+
+    }
+#endif
+
+    return IHEVCD_SUCCESS;
+}
+#ifdef GPU_BUILD
+void ihevcd_free_ref_mv_buffers(codec_t *ps_codec)
+{
+    WORD32 i;
+    dpb_mgr_t *ps_dpb_mgr = (dpb_mgr_t *)ps_codec->pv_dpb_mgr;
+    // TODO
+    /* Buffers that are still marked as UNUSED_FOR_REF are released from the dpb (internally the dpb calls release on the pic buf manager) */
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        if((ps_dpb_mgr->as_dpb_info[i].ps_pic_buf) && (UNUSED_FOR_REF == ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref))
+        {
+            pic_buf_t *ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+            mv_buf_t *ps_mv_buf;
+            WORD32 j;
+
+            /* Long term index is set to MAX_DPB_BUFS to ensure it is not added as LT */
+            ihevc_dpb_mgr_del_ref(ps_dpb_mgr, (buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_pic_buf->i4_abs_poc);
+
+            /* Find buffer id of the MV bank corresponding to the buffer being freed (buffer with POC of i4_abs_poc) */
+            /* A separate index j is used so that the outer DPB loop counter i is not clobbered */
+            ps_mv_buf = (mv_buf_t *)ps_codec->ps_mv_buf;
+            for(j = 0; j < BUF_MGR_MAX_CNT; j++)
+            {
+                if(ps_mv_buf->i4_abs_poc == ps_pic_buf->i4_abs_poc)
+                {
+                    ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, j, BUF_MGR_REF);
+                    break;
+                }
+                ps_mv_buf++;
+            }
+        }
+
+    }
+
+}
+#endif
diff --git a/decoder/ihevcd_ref_list.h b/decoder/ihevcd_ref_list.h
new file mode 100644
index 0000000..7bc22f7
--- /dev/null
+++ b/decoder/ihevcd_ref_list.h
@@ -0,0 +1,39 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_ref_list.h
+*
+* @brief
+*  Contains function declarations for reference list generation
+*
+* @author
+*  Srinivas T
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVCD_REF_LIST_H_
+#define _IHEVCD_REF_LIST_H_
+
+WORD32 ihevcd_ref_list(codec_t *ps_codec, pps_t *ps_pps, sps_t *ps_sps, slice_header_t *ps_slice_hdr);
+
+#endif
diff --git a/decoder/ihevcd_sao.c b/decoder/ihevcd_sao.c
new file mode 100644
index 0000000..d8e8f5c
--- /dev/null
+++ b/decoder/ihevcd_sao.c
@@ -0,0 +1,3348 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_sao.c
+ *
+ * @brief
+ *  Contains function definitions for the sample adaptive offset process
+ *
+ * @author
+ *  Srinivas T
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_sao.h"
+#include "ihevc_mem_fns.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+
+#include "ihevc_deblk.h"
+#include "ihevc_deblk_tables.h"
+#include "ihevcd_profile.h"
+#include "ihevcd_sao.h"
+#include "ihevcd_debug.h"
+
+#define SAO_SHIFT_CTB    8
+
+/**
+ * SAO at CTB level is implemented for a shifted CTB (shifted by 8 pixels in the
+ * x and y directions), so that SAO is applied only to samples whose deblocking
+ * is already complete.
+ */
+void ihevcd_sao_ctb(sao_ctxt_t *ps_sao_ctxt)
+{
+    codec_t *ps_codec = ps_sao_ctxt->ps_codec;
+    UWORD8 *pu1_src_luma;
+    UWORD8 *pu1_src_chroma;
+    WORD32 src_strd;
+    WORD32 ctb_size;
+    WORD32 log2_ctb_size;
+    sps_t *ps_sps;
+    sao_t *ps_sao;
+    WORD32 row, col;
+    UWORD8 au1_avail_luma[8];
+    UWORD8 au1_avail_chroma[8];
+    WORD32 i;
+    UWORD8 *pu1_src_top_luma;
+    UWORD8 *pu1_src_top_chroma;
+    UWORD8 *pu1_src_left_luma;
+    UWORD8 *pu1_src_left_chroma;
+    UWORD8 au1_src_top_right[2];
+    UWORD8 au1_src_bot_left[2];
+    UWORD8 *pu1_no_loop_filter_flag;
+    WORD32 loop_filter_strd;
+
+    WORD8 ai1_offset_y[5];
+    WORD8 ai1_offset_cb[5];
+    WORD8 ai1_offset_cr[5];
+
+    PROFILE_DISABLE_SAO();
+
+    ai1_offset_y[0] = 0;
+    ai1_offset_cb[0] = 0;
+    ai1_offset_cr[0] = 0;
+
+    ps_sps = ps_sao_ctxt->ps_sps;
+    log2_ctb_size = ps_sps->i1_log2_ctb_size;
+    ctb_size = (1 << log2_ctb_size);
+    src_strd = ps_sao_ctxt->ps_codec->i4_strd;
+    pu1_src_luma = ps_sao_ctxt->pu1_cur_pic_luma + ((ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sao_ctxt->ps_codec->i4_strd) << (log2_ctb_size));
+    pu1_src_chroma = ps_sao_ctxt->pu1_cur_pic_chroma + ((ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sao_ctxt->ps_codec->i4_strd / 2) << (log2_ctb_size));
+
+    ps_sao = ps_sao_ctxt->ps_pic_sao + ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+    loop_filter_strd =  (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
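+    /* The no-loop-filter map holds one flag bit per 8x8 block, so one picture
+     * row of flags occupies (pic_width + 63) / 64 bytes; this is the stride
+     * computed above. */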
+
+    /* Current CTB */
+    {
+        WORD32 sao_wd_luma;
+        WORD32 sao_wd_chroma;
+        WORD32 sao_ht_luma;
+        WORD32 sao_ht_chroma;
+
+        WORD32 remaining_rows;
+        WORD32 remaining_cols;
+
+        remaining_cols = ps_sps->i2_pic_width_in_luma_samples - (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+        sao_wd_luma = MIN(ctb_size, remaining_cols);
+        sao_wd_chroma = MIN(ctb_size, remaining_cols);
+
+        remaining_rows = ps_sps->i2_pic_height_in_luma_samples - (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+        sao_ht_luma = MIN(ctb_size, remaining_rows);
+        sao_ht_chroma = MIN(ctb_size, remaining_rows) / 2;
+
+        pu1_src_top_luma = ps_sao_ctxt->pu1_sao_src_top_luma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+        pu1_src_top_chroma = ps_sao_ctxt->pu1_sao_src_top_chroma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+        pu1_src_left_luma = ps_sao_ctxt->pu1_sao_src_left_luma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+        pu1_src_left_chroma = ps_sao_ctxt->pu1_sao_src_left_chroma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+
+        pu1_no_loop_filter_flag = ps_sao_ctxt->pu1_pic_no_loop_filter_flag +
+                        ((ps_sao_ctxt->i4_ctb_y * ctb_size) / 8) * loop_filter_strd +
+                        ((ps_sao_ctxt->i4_ctb_x * ctb_size) / 64);
+
+        ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+        ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+        ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+        ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+        ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+        ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+        ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+        ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+        ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+        ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+        ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+        ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+        for(i = 0; i < 8; i++)
+        {
+            au1_avail_luma[i] = 255;
+            au1_avail_chroma[i] = 255;
+        }
+
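+        /* Neighbour availability flags, as used by the boundary checks below:
+         * 0 - left, 1 - right, 2 - top, 3 - bottom, 4 - top-left,
+         * 5 - top-right, 6 - bottom-left, 7 - bottom-right. All start as
+         * available (255) and are cleared at the picture boundaries. */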
+
+        if(0 == ps_sao_ctxt->i4_ctb_x)
+        {
+            au1_avail_luma[0] = 0;
+            au1_avail_luma[4] = 0;
+            au1_avail_luma[6] = 0;
+
+            au1_avail_chroma[0] = 0;
+            au1_avail_chroma[4] = 0;
+            au1_avail_chroma[6] = 0;
+        }
+
+        if(ps_sps->i2_pic_wd_in_ctb - 1 == ps_sao_ctxt->i4_ctb_x)
+        {
+            au1_avail_luma[1] = 0;
+            au1_avail_luma[5] = 0;
+            au1_avail_luma[7] = 0;
+
+            au1_avail_chroma[1] = 0;
+            au1_avail_chroma[5] = 0;
+            au1_avail_chroma[7] = 0;
+        }
+
+        if(0 == ps_sao_ctxt->i4_ctb_y)
+        {
+            au1_avail_luma[2] = 0;
+            au1_avail_luma[4] = 0;
+            au1_avail_luma[5] = 0;
+
+            au1_avail_chroma[2] = 0;
+            au1_avail_chroma[4] = 0;
+            au1_avail_chroma[5] = 0;
+        }
+
+        if(ps_sps->i2_pic_ht_in_ctb - 1 == ps_sao_ctxt->i4_ctb_y)
+        {
+            au1_avail_luma[3] = 0;
+            au1_avail_luma[6] = 0;
+            au1_avail_luma[7] = 0;
+
+            au1_avail_chroma[3] = 0;
+            au1_avail_chroma[6] = 0;
+            au1_avail_chroma[7] = 0;
+        }
+
+
+        if(0 == ps_sao->b3_y_type_idx)
+        {
+            /* Update left, top and top-left */
+            for(row = 0; row < sao_ht_luma; row++)
+            {
+                pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+            }
+            ps_sao_ctxt->pu1_sao_src_top_left_luma_curr_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+            ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+
+        }
+        else
+        {
+            UWORD8 au1_src_copy[(MAX_CTB_SIZE + 2) * (MAX_CTB_SIZE + 2)];
+            UWORD8 *pu1_src_copy = au1_src_copy + (MAX_CTB_SIZE + 2) + 1;
+            WORD32 tmp_strd = MAX_CTB_SIZE + 2;
+            WORD32 no_loop_filter_enabled = 0;
+
+            /* Check the loop filter flags and copy the original values for back up */
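+            /* Each bit of the flag word fetched below marks one min_cu-wide
+             * (8-pixel) column of the current 8-row stripe as "no loop filter".
+             * CTZ() walks the word run by run: runs of zero bits (samples that
+             * SAO may modify) are skipped, while runs of one bits are copied
+             * into au1_src_copy so they can be restored after filtering. */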
+            {
+                UWORD32 u4_no_loop_filter_flag;
+                WORD32 min_cu = 8;
+                UWORD8 *pu1_src_tmp = pu1_src_luma;
+
+                for(i = 0; i < (sao_ht_luma + min_cu - 1) / min_cu; i++)
+                {
+                    u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+                                    ((((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma) / 8) % 8);
+                    u4_no_loop_filter_flag &= (1 << ((sao_wd_luma + (min_cu - 1)) / min_cu)) - 1;
+
+                    if(u4_no_loop_filter_flag)
+                    {
+                        WORD32 tmp_wd = sao_wd_luma;
+                        no_loop_filter_enabled = 1;
+                        while(tmp_wd > 0)
+                        {
+                            if(CTZ(u4_no_loop_filter_flag))
+                            {
+                                u4_no_loop_filter_flag  >>= (CTZ(u4_no_loop_filter_flag));
+                                pu1_src_tmp += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+                                pu1_src_copy += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+                                tmp_wd -= CTZ(u4_no_loop_filter_flag) * min_cu;
+                            }
+                            else
+                            {
+                                for(row = 0; row < MIN(min_cu, sao_ht_luma - (i - 1) * min_cu); row++)
+                                {
+                                    for(col = 0; col < MIN(CTZ(~u4_no_loop_filter_flag) * min_cu, tmp_wd); col++)
+                                    {
+                                        pu1_src_copy[row * src_strd + col] = pu1_src_tmp[row * tmp_strd + col];
+                                    }
+                                }
+
+                                u4_no_loop_filter_flag  >>= (CTZ(~u4_no_loop_filter_flag));
+                                pu1_src_tmp += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+                                pu1_src_copy += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+                                tmp_wd -= CTZ(~u4_no_loop_filter_flag) * min_cu;
+                            }
+                        }
+
+                        pu1_src_tmp -= sao_wd_luma;
+                    }
+
+                    pu1_src_tmp += min_cu * src_strd;
+                    pu1_src_copy += min_cu * tmp_strd;
+                }
+            }
+
+            if(1 == ps_sao->b3_y_type_idx)
+            {
+                ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr(pu1_src_luma,
+                                                                          src_strd,
+                                                                          pu1_src_left_luma,
+                                                                          pu1_src_top_luma,
+                                                                          ps_sao_ctxt->pu1_sao_src_top_left_luma_curr_ctb,
+                                                                          ps_sao->b5_y_band_pos,
+                                                                          ai1_offset_y,
+                                                                          sao_wd_luma,
+                                                                          sao_ht_luma);
+            }
+            else // if(2 <= ps_sao->b3_y_type_idx)
+            {
+                au1_src_top_right[0] = pu1_src_top_luma[sao_wd_luma];
+                au1_src_bot_left[0] = pu1_src_luma[sao_ht_luma * src_strd - 1];
+                ps_codec->apf_sao_luma[ps_sao->b3_y_type_idx - 2](pu1_src_luma,
+                                                                  src_strd,
+                                                                  pu1_src_left_luma,
+                                                                  pu1_src_top_luma,
+                                                                  ps_sao_ctxt->pu1_sao_src_top_left_luma_curr_ctb,
+                                                                  au1_src_top_right,
+                                                                  au1_src_bot_left,
+                                                                  au1_avail_luma,
+                                                                  ai1_offset_y,
+                                                                  sao_wd_luma,
+                                                                  sao_ht_luma);
+            }
+
+            /* Check the loop filter flags and copy the original values back if they are set */
+            if(no_loop_filter_enabled)
+            {
+                UWORD32 u4_no_loop_filter_flag;
+                WORD32 min_cu = 8;
+                UWORD8 *pu1_src_tmp = pu1_src_luma;
+
+                for(i = 0; i < (sao_ht_luma + min_cu - 1) / min_cu; i++)
+                {
+                    u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >> ((((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma) / 8) % 8);
+                    u4_no_loop_filter_flag &= (1 << ((sao_wd_luma + (min_cu - 1)) / min_cu)) - 1;
+
+                    if(u4_no_loop_filter_flag)
+                    {
+                        WORD32 tmp_wd = sao_wd_luma;
+                        while(tmp_wd > 0)
+                        {
+                            if(CTZ(u4_no_loop_filter_flag))
+                            {
+                                u4_no_loop_filter_flag  >>= (CTZ(u4_no_loop_filter_flag));
+                                pu1_src_tmp += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+                                pu1_src_copy += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+                                tmp_wd -= CTZ(u4_no_loop_filter_flag) * min_cu;
+                            }
+                            else
+                            {
+                                for(row = 0; row < MIN(min_cu, sao_ht_luma - (i - 1) * min_cu); row++)
+                                {
+                                    for(col = 0; col < MIN(CTZ(~u4_no_loop_filter_flag) * min_cu, tmp_wd); col++)
+                                    {
+                                        pu1_src_tmp[row * src_strd + col] = pu1_src_copy[row * tmp_strd + col];
+                                    }
+                                }
+
+                                u4_no_loop_filter_flag  >>= (CTZ(~u4_no_loop_filter_flag));
+                                pu1_src_tmp += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+                                pu1_src_copy += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+                                tmp_wd -= CTZ(~u4_no_loop_filter_flag) * min_cu;
+                            }
+                        }
+
+                        pu1_src_tmp -= sao_wd_luma;
+                    }
+
+                    pu1_src_tmp += min_cu * src_strd;
+                    pu1_src_copy += min_cu * tmp_strd;
+                }
+            }
+
+        }
+
+        if(0 == ps_sao->b3_cb_type_idx)
+        {
+            for(row = 0; row < sao_ht_chroma; row++)
+            {
+                pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+                pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+            }
+            ps_sao_ctxt->pu1_sao_src_top_left_chroma_curr_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+            ps_sao_ctxt->pu1_sao_src_top_left_chroma_curr_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+            ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+        }
+        else
+        {
+            UWORD8 au1_src_copy[(MAX_CTB_SIZE + 4) * (MAX_CTB_SIZE + 2)];
+            UWORD8 *pu1_src_copy = au1_src_copy + (MAX_CTB_SIZE + 4) + 2;
+            WORD32 tmp_strd = MAX_CTB_SIZE + 4;
+            WORD32 no_loop_filter_enabled = 0;
+
+            /* Check the loop filter flags and copy the original values for back up */
+            {
+                UWORD32 u4_no_loop_filter_flag;
+                WORD32 min_cu = 4;
+                UWORD8 *pu1_src_tmp = pu1_src_chroma;
+
+                for(i = 0; i < (sao_ht_chroma + min_cu - 1) / min_cu; i++)
+                {
+                    u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >> ((((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma) / 8) % 8);
+                    u4_no_loop_filter_flag &= (1 << ((sao_wd_chroma + (min_cu - 1)) / min_cu)) - 1;
+
+                    if(u4_no_loop_filter_flag)
+                    {
+                        WORD32 tmp_wd = sao_wd_chroma;
+                        no_loop_filter_enabled = 1;
+                        while(tmp_wd > 0)
+                        {
+                            if(CTZ(u4_no_loop_filter_flag))
+                            {
+                                u4_no_loop_filter_flag  >>= (CTZ(u4_no_loop_filter_flag));
+                                pu1_src_tmp += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+                                pu1_src_copy += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+                                tmp_wd -= CTZ(u4_no_loop_filter_flag) * min_cu;
+                            }
+                            else
+                            {
+                                for(row = 0; row < MIN(min_cu, sao_ht_chroma - (i - 1) * min_cu); row++)
+                                {
+                                    for(col = 0; col < MIN(CTZ(~u4_no_loop_filter_flag) * min_cu, tmp_wd); col++)
+                                    {
+                                        pu1_src_copy[row * src_strd + col] = pu1_src_tmp[row * tmp_strd + col];
+                                    }
+                                }
+
+                                u4_no_loop_filter_flag  >>= (CTZ(~u4_no_loop_filter_flag));
+                                pu1_src_tmp += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+                                pu1_src_copy += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+                                tmp_wd -= CTZ(~u4_no_loop_filter_flag) * min_cu;
+                            }
+                        }
+
+                        pu1_src_tmp -= sao_wd_chroma;
+                    }
+
+                    pu1_src_tmp += min_cu * src_strd;
+                    pu1_src_copy += min_cu * tmp_strd;
+                }
+            }
+
+            if(1 == ps_sao->b3_cb_type_idx)
+            {
+                ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+                                                                            src_strd,
+                                                                            pu1_src_left_chroma,
+                                                                            pu1_src_top_chroma,
+                                                                            ps_sao_ctxt->pu1_sao_src_top_left_chroma_curr_ctb,
+                                                                            ps_sao->b5_cb_band_pos,
+                                                                            ps_sao->b5_cr_band_pos,
+                                                                            ai1_offset_cb,
+                                                                            ai1_offset_cr,
+                                                                            sao_wd_chroma,
+                                                                            sao_ht_chroma
+                                                                           );
+            }
+            else // if(2 <= ps_sao->b3_cb_type_idx)
+            {
+                au1_src_top_right[0] = pu1_src_top_chroma[sao_wd_chroma];
+                au1_src_top_right[1] = pu1_src_top_chroma[sao_wd_chroma + 1];
+                au1_src_bot_left[0] = pu1_src_chroma[sao_ht_chroma * src_strd - 2];
+                au1_src_bot_left[1] = pu1_src_chroma[sao_ht_chroma * src_strd - 1];
+                ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+                                                                     src_strd,
+                                                                     pu1_src_left_chroma,
+                                                                     pu1_src_top_chroma,
+                                                                     ps_sao_ctxt->pu1_sao_src_top_left_chroma_curr_ctb,
+                                                                     au1_src_top_right,
+                                                                     au1_src_bot_left,
+                                                                     au1_avail_chroma,
+                                                                     ai1_offset_cb,
+                                                                     ai1_offset_cr,
+                                                                     sao_wd_chroma,
+                                                                     sao_ht_chroma);
+            }
+
+            /* Check the loop filter flags and copy the original values back if they are set */
+            if(no_loop_filter_enabled)
+            {
+                UWORD32 u4_no_loop_filter_flag;
+                WORD32 min_cu = 4;
+                UWORD8 *pu1_src_tmp = pu1_src_chroma;
+
+                for(i = 0; i < (sao_ht_chroma + min_cu - 1) / min_cu; i++)
+                {
+                    u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >> ((((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma) / 8) % 8);
+                    u4_no_loop_filter_flag &= (1 << ((sao_wd_chroma + (min_cu - 1)) / min_cu)) - 1;
+
+                    if(u4_no_loop_filter_flag)
+                    {
+                        WORD32 tmp_wd = sao_wd_chroma;
+                        while(tmp_wd > 0)
+                        {
+                            if(CTZ(u4_no_loop_filter_flag))
+                            {
+                                u4_no_loop_filter_flag  >>= (CTZ(u4_no_loop_filter_flag));
+                                pu1_src_tmp += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+                                pu1_src_copy += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+                                tmp_wd -= CTZ(u4_no_loop_filter_flag) * min_cu;
+                            }
+                            else
+                            {
+                                for(row = 0; row < MIN(min_cu, sao_ht_chroma - (i - 1) * min_cu); row++)
+                                {
+                                    for(col = 0; col < MIN(CTZ(~u4_no_loop_filter_flag) * min_cu, tmp_wd); col++)
+                                    {
+                                        pu1_src_tmp[row * src_strd + col] = pu1_src_copy[row * tmp_strd + col];
+                                    }
+                                }
+
+                                u4_no_loop_filter_flag  >>= (CTZ(~u4_no_loop_filter_flag));
+                                pu1_src_tmp += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+                                pu1_src_copy += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+                                tmp_wd -= CTZ(~u4_no_loop_filter_flag) * min_cu;
+                            }
+                        }
+
+                        pu1_src_tmp -= sao_wd_chroma;
+                    }
+
+                    pu1_src_tmp += min_cu * src_strd;
+                    pu1_src_copy += min_cu * tmp_strd;
+                }
+            }
+
+        }
+
+    }
+}
+
+void ihevcd_sao_shift_ctb(sao_ctxt_t *ps_sao_ctxt)
+{
+    codec_t *ps_codec = ps_sao_ctxt->ps_codec;
+    UWORD8 *pu1_src_luma;
+    UWORD8 *pu1_src_chroma;
+    WORD32 src_strd;
+    WORD32 ctb_size;
+    WORD32 log2_ctb_size;
+    sps_t *ps_sps;
+    sao_t *ps_sao;
+    pps_t *ps_pps;
+    slice_header_t *ps_slice_hdr, *ps_slice_hdr_base;
+    tile_t *ps_tile;
+    UWORD16 *pu1_slice_idx;
+    UWORD16 *pu1_tile_idx;
+    WORD32 row, col;
+    UWORD8 au1_avail_luma[8];
+    UWORD8 au1_avail_chroma[8];
+    UWORD8 au1_tile_slice_boundary[8];
+    UWORD8 au4_ilf_across_tile_slice_enable[8];
+    WORD32 i;
+    UWORD8 *pu1_src_top_luma;
+    UWORD8 *pu1_src_top_chroma;
+    UWORD8 *pu1_src_left_luma;
+    UWORD8 *pu1_src_left_chroma;
+    UWORD8 au1_src_top_right[2];
+    UWORD8 au1_src_bot_left[2];
+    UWORD8 *pu1_no_loop_filter_flag;
+    UWORD8 *pu1_src_backup_luma;
+    UWORD8 *pu1_src_backup_chroma;
+    WORD32 backup_strd;
+    WORD32 loop_filter_strd;
+
+    WORD32 no_loop_filter_enabled_luma = 0;
+    WORD32 no_loop_filter_enabled_chroma = 0;
+    UWORD8 *pu1_sao_src_top_left_chroma_curr_ctb;
+    UWORD8 *pu1_sao_src_top_left_luma_curr_ctb;
+    UWORD8 *pu1_sao_src_luma_top_left_ctb;
+    UWORD8 *pu1_sao_src_chroma_top_left_ctb;
+    UWORD8 *pu1_sao_src_top_left_luma_top_right;
+    UWORD8 *pu1_sao_src_top_left_chroma_top_right;
+    UWORD8  u1_sao_src_top_left_luma_bot_left;
+    UWORD8  *pu1_sao_src_top_left_luma_bot_left;
+    UWORD8 *au1_sao_src_top_left_chroma_bot_left;
+    UWORD8 *pu1_sao_src_top_left_chroma_bot_left;
+
+    WORD8 ai1_offset_y[5];
+    WORD8 ai1_offset_cb[5];
+    WORD8 ai1_offset_cr[5];
+    WORD32  chroma_yuv420sp_vu = ps_sao_ctxt->is_chroma_yuv420sp_vu;
+
+    PROFILE_DISABLE_SAO();
+
+    ai1_offset_y[0] = 0;
+    ai1_offset_cb[0] = 0;
+    ai1_offset_cr[0] = 0;
+
+    ps_sps = ps_sao_ctxt->ps_sps;
+    ps_pps = ps_sao_ctxt->ps_pps;
+    ps_tile = ps_sao_ctxt->ps_tile;
+
+    log2_ctb_size = ps_sps->i1_log2_ctb_size;
+    ctb_size = (1 << log2_ctb_size);
+    src_strd = ps_sao_ctxt->ps_codec->i4_strd;
+#ifdef GPU_BUILD
+    //TODO GPU : Later define it for ARM only version as well
+    ps_slice_hdr_base = ps_sao_ctxt->ps_slice_hdr_base;
+#else
+    ps_slice_hdr_base = ps_sao_ctxt->ps_codec->ps_slice_hdr_base;
+#endif
+    ps_slice_hdr = ps_slice_hdr_base + (ps_sao_ctxt->i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+
+    pu1_slice_idx = ps_sao_ctxt->pu1_slice_idx;
+    pu1_tile_idx = ps_sao_ctxt->pu1_tile_idx;
+    pu1_src_luma = ps_sao_ctxt->pu1_cur_pic_luma + ((ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sao_ctxt->ps_codec->i4_strd) << (log2_ctb_size));
+    pu1_src_chroma = ps_sao_ctxt->pu1_cur_pic_chroma + ((ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sao_ctxt->ps_codec->i4_strd / 2) << (log2_ctb_size));
+
+    /*Stores the left value for each row ctbs- Needed for column tiles*/
+    pu1_sao_src_top_left_luma_curr_ctb = ps_sao_ctxt->pu1_sao_src_top_left_luma_curr_ctb + ((ps_sao_ctxt->i4_ctb_y));
+    pu1_sao_src_top_left_chroma_curr_ctb = ps_sao_ctxt->pu1_sao_src_top_left_chroma_curr_ctb + (2 * (ps_sao_ctxt->i4_ctb_y));
+    pu1_sao_src_luma_top_left_ctb = ps_sao_ctxt->pu1_sao_src_luma_top_left_ctb + ((ps_sao_ctxt->i4_ctb_y));
+    pu1_sao_src_chroma_top_left_ctb = ps_sao_ctxt->pu1_sao_src_chroma_top_left_ctb + (2 * ps_sao_ctxt->i4_ctb_y);
+    u1_sao_src_top_left_luma_bot_left = ps_sao_ctxt->u1_sao_src_top_left_luma_bot_left; // + ((ps_sao_ctxt->i4_ctb_y));
+    pu1_sao_src_top_left_luma_bot_left = ps_sao_ctxt->pu1_sao_src_top_left_luma_bot_left + ((ps_sao_ctxt->i4_ctb_y));
+    au1_sao_src_top_left_chroma_bot_left = ps_sao_ctxt->au1_sao_src_top_left_chroma_bot_left; // + (2 * ps_sao_ctxt->i4_ctb_y);
+    pu1_sao_src_top_left_chroma_bot_left = ps_sao_ctxt->pu1_sao_src_top_left_chroma_bot_left + (2 * ps_sao_ctxt->i4_ctb_y);
+    pu1_sao_src_top_left_luma_top_right = ps_sao_ctxt->pu1_sao_src_top_left_luma_top_right + ((ps_sao_ctxt->i4_ctb_x));
+    pu1_sao_src_top_left_chroma_top_right = ps_sao_ctxt->pu1_sao_src_top_left_chroma_top_right + (2 * ps_sao_ctxt->i4_ctb_x);
+
+    ps_sao = ps_sao_ctxt->ps_pic_sao + ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+    loop_filter_strd =  (ps_sps->i2_pic_width_in_luma_samples + 63) >> 6;
+    backup_strd = 2 * MAX_CTB_SIZE;
+
+    DEBUG_INIT_TMP_BUF(ps_sao_ctxt->pu1_tmp_buf_luma, ps_sao_ctxt->pu1_tmp_buf_chroma);
+
+    {
+        /* Check the loop filter flags and copy the original values for back up */
+        /* Luma */
+        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+        {
+            UWORD32 u4_no_loop_filter_flag;
+            WORD32 loop_filter_bit_pos;
+            WORD32 log2_min_cu = 3;
+            WORD32 min_cu = (1 << log2_min_cu);
+            UWORD8 *pu1_src_tmp_luma = pu1_src_luma;
+            WORD32 sao_blk_ht = ctb_size - SAO_SHIFT_CTB;
+            WORD32 sao_blk_wd = ctb_size;
+            WORD32 remaining_rows;
+            WORD32 remaining_cols;
+
+            remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + ctb_size - SAO_SHIFT_CTB);
+            remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + ctb_size - SAO_SHIFT_CTB);
+            if(remaining_rows <= SAO_SHIFT_CTB)
+                sao_blk_ht += remaining_rows;
+            if(remaining_cols <= SAO_SHIFT_CTB)
+                sao_blk_wd += remaining_cols;
+
+            pu1_src_tmp_luma -= ps_sao_ctxt->i4_ctb_x ? SAO_SHIFT_CTB : 0;
+            pu1_src_tmp_luma -= ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB * src_strd : 0;
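+            /* Illustration: with a 64x64 CTB and SAO_SHIFT_CTB = 8, an interior
+             * CTB backs up the shifted region spanning (-8, -8) to (56, 56)
+             * relative to the CTB origin, i.e. the last 8 rows/columns of the
+             * top/left neighbours plus all but the last 8 rows/columns of the
+             * current CTB (extended to the picture edge at the right/bottom). */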
+
+            pu1_src_backup_luma = ps_sao_ctxt->pu1_tmp_buf_luma;
+
+            loop_filter_bit_pos = (ps_sao_ctxt->i4_ctb_x << (log2_ctb_size - 3)) +
+                            (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 3)) * (loop_filter_strd << 3);
+            if(ps_sao_ctxt->i4_ctb_x > 0)
+                loop_filter_bit_pos -= 1;
+
+            pu1_no_loop_filter_flag = ps_sao_ctxt->pu1_pic_no_loop_filter_flag +
+                            (loop_filter_bit_pos >> 3);
+
+            for(i = -(ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB : 0) >> log2_min_cu;
+                            i < (sao_blk_ht + (min_cu - 1)) >> log2_min_cu; i++)
+            {
+                WORD32 tmp_wd = sao_blk_wd;
+
+                u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+                                (loop_filter_bit_pos & 7);
+                u4_no_loop_filter_flag &= (1 << ((tmp_wd + (min_cu - 1)) >> log2_min_cu)) - 1;
+
+                if(u4_no_loop_filter_flag)
+                {
+                    no_loop_filter_enabled_luma = 1;
+                    while(tmp_wd > 0)
+                    {
+                        if(CTZ(u4_no_loop_filter_flag))
+                        {
+                            pu1_src_tmp_luma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            pu1_src_backup_luma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            tmp_wd -= CTZ(u4_no_loop_filter_flag) << log2_min_cu;
+                            u4_no_loop_filter_flag  >>= (CTZ(u4_no_loop_filter_flag));
+                        }
+                        else
+                        {
+                            for(row = 0; row < min_cu; row++)
+                            {
+                                for(col = 0; col < MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd); col++)
+                                {
+                                    pu1_src_backup_luma[row * backup_strd + col] = pu1_src_tmp_luma[row * src_strd + col];
+                                }
+                            }
+                            pu1_src_tmp_luma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            pu1_src_backup_luma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            tmp_wd -= CTZ(~u4_no_loop_filter_flag) << log2_min_cu;
+                            u4_no_loop_filter_flag  >>= (CTZ(~u4_no_loop_filter_flag));
+                        }
+                    }
+
+                    pu1_src_tmp_luma -= sao_blk_wd;
+                    pu1_src_backup_luma -= sao_blk_wd;
+                }
+
+                pu1_src_tmp_luma += (src_strd << log2_min_cu);
+                pu1_src_backup_luma += (backup_strd << log2_min_cu);
+            }
+        }
+
+        /* Chroma */
+        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+        {
+            UWORD32 u4_no_loop_filter_flag;
+            WORD32 loop_filter_bit_pos;
+            WORD32 log2_min_cu = 3;
+            WORD32 min_cu = (1 << log2_min_cu);
+            UWORD8 *pu1_src_tmp_chroma = pu1_src_chroma;
+            WORD32 sao_blk_ht = ctb_size - 2 * SAO_SHIFT_CTB;
+            WORD32 sao_blk_wd = ctb_size;
+            WORD32 remaining_rows;
+            WORD32 remaining_cols;
+
+            remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + ctb_size - 2 * SAO_SHIFT_CTB);
+            remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + ctb_size - 2 * SAO_SHIFT_CTB);
+            if(remaining_rows <= 2 * SAO_SHIFT_CTB)
+                sao_blk_ht += remaining_rows;
+            if(remaining_cols <= 2 * SAO_SHIFT_CTB)
+                sao_blk_wd += remaining_cols;
+
+            pu1_src_tmp_chroma -= ps_sao_ctxt->i4_ctb_x ? SAO_SHIFT_CTB * 2 : 0;
+            pu1_src_tmp_chroma -= ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB * src_strd : 0;
+
+            pu1_src_backup_chroma = ps_sao_ctxt->pu1_tmp_buf_chroma;
+
+            loop_filter_bit_pos = (ps_sao_ctxt->i4_ctb_x << (log2_ctb_size - 3)) +
+                            (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 3)) * (loop_filter_strd << 3);
+            if(ps_sao_ctxt->i4_ctb_x > 0)
+                loop_filter_bit_pos -= 2;
+
+            pu1_no_loop_filter_flag = ps_sao_ctxt->pu1_pic_no_loop_filter_flag +
+                            (loop_filter_bit_pos >> 3);
+
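+            /* loop_filter_bit_pos indexes a bitmap with one no-loop-filter
+             * flag per 8x8 min CU: (pos >> 3) is the byte offset and
+             * (pos & 7) the bit offset within that byte, as used when the
+             * flags are read in the loop below (e.g. pos = 13 selects byte 1,
+             * bit 5). */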
+            for(i = -(ps_sao_ctxt->i4_ctb_y ? 2 * SAO_SHIFT_CTB : 0) >> log2_min_cu;
+                            i < (sao_blk_ht + (min_cu - 1)) >> log2_min_cu; i++)
+            {
+                WORD32 tmp_wd = sao_blk_wd;
+
+                u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+                                (loop_filter_bit_pos & 7);
+                u4_no_loop_filter_flag &= (1 << ((tmp_wd + (min_cu - 1)) >> log2_min_cu)) - 1;
+
+                if(u4_no_loop_filter_flag)
+                {
+                    no_loop_filter_enabled_chroma = 1;
+                    while(tmp_wd > 0)
+                    {
+                        if(CTZ(u4_no_loop_filter_flag))
+                        {
+                            pu1_src_tmp_chroma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            pu1_src_backup_chroma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            tmp_wd -= CTZ(u4_no_loop_filter_flag) << log2_min_cu;
+                            u4_no_loop_filter_flag  >>= (CTZ(u4_no_loop_filter_flag));
+                        }
+                        else
+                        {
+                            for(row = 0; row < min_cu / 2; row++)
+                            {
+                                for(col = 0; col < MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd); col++)
+                                {
+                                    pu1_src_backup_chroma[row * backup_strd + col] = pu1_src_tmp_chroma[row * src_strd + col];
+                                }
+                            }
+
+                            pu1_src_tmp_chroma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            pu1_src_backup_chroma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            tmp_wd -= CTZ(~u4_no_loop_filter_flag) << log2_min_cu;
+                            u4_no_loop_filter_flag  >>= (CTZ(~u4_no_loop_filter_flag));
+                        }
+                    }
+
+                    pu1_src_tmp_chroma -= sao_blk_wd;
+                    pu1_src_backup_chroma -= sao_blk_wd;
+                }
+
+                pu1_src_tmp_chroma += ((src_strd / 2) << log2_min_cu);
+                pu1_src_backup_chroma += ((backup_strd / 2) << log2_min_cu);
+            }
+        }
+    }
+
+    DEBUG_PROCESS_TMP_BUF(ps_sao_ctxt->pu1_tmp_buf_luma, ps_sao_ctxt->pu1_tmp_buf_chroma);
+
+    /* Top-left CTB */
+    if(ps_sao_ctxt->i4_ctb_x > 0 && ps_sao_ctxt->i4_ctb_y > 0)
+    {
+        WORD32 sao_wd_luma = SAO_SHIFT_CTB;
+        WORD32 sao_wd_chroma = 2 * SAO_SHIFT_CTB;
+        WORD32 sao_ht_luma = SAO_SHIFT_CTB;
+        WORD32 sao_ht_chroma = SAO_SHIFT_CTB;
+
+        WORD32 ctbx_tl_t = 0, ctbx_tl_l = 0, ctbx_tl_r = 0, ctbx_tl_d = 0, ctbx_tl = 0;
+        WORD32 ctby_tl_t = 0, ctby_tl_l = 0, ctby_tl_r = 0, ctby_tl_d = 0, ctby_tl = 0;
+        WORD32 au4_idx_tl[8], idx_tl;
+
+
+        pu1_src_luma -= (sao_wd_luma + sao_ht_luma * src_strd);
+        pu1_src_chroma -= (sao_wd_chroma + sao_ht_chroma * src_strd);
+        ps_sao -= (1 + ps_sps->i2_pic_wd_in_ctb);
+        pu1_src_top_luma = ps_sao_ctxt->pu1_sao_src_top_luma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma;
+        pu1_src_top_chroma = ps_sao_ctxt->pu1_sao_src_top_chroma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma;
+        pu1_src_left_luma = ps_sao_ctxt->pu1_sao_src_left_luma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - sao_ht_luma;
+        pu1_src_left_chroma = ps_sao_ctxt->pu1_sao_src_left_chroma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - (2 * sao_ht_chroma);
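+        /* SAO for a CTB is applied with a delay of SAO_SHIFT_CTB pixels (see
+         * the pointer offsets above), presumably so that deblocked neighbors
+         * are available; this block handles the deferred corner region shared
+         * with the left, top and top-left CTBs. */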
+
+        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+        {
+            if(0 == ps_sao->b3_y_type_idx)
+            {
+                /* Update left, top and top-left */
+                for(row = 0; row < sao_ht_luma; row++)
+                {
+                    pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+                }
+                pu1_sao_src_luma_top_left_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+                ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+
+
+            }
+
+            else if(1 == ps_sao->b3_y_type_idx)
+            {
+                ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+                ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+                ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+                ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+                ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr(pu1_src_luma,
+                                                                          src_strd,
+                                                                          pu1_src_left_luma,
+                                                                          pu1_src_top_luma,
+                                                                          pu1_sao_src_luma_top_left_ctb,
+                                                                          ps_sao->b5_y_band_pos,
+                                                                          ai1_offset_y,
+                                                                          sao_wd_luma,
+                                                                          sao_ht_luma
+                                                                         );
+            }
+
+            else // if(2 <= ps_sao->b3_y_type_idx)
+            {
+                ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+                ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+                ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+                ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+                for(i = 0; i < 8; i++)
+                {
+                    au1_avail_luma[i] = 255;
+                    au1_tile_slice_boundary[i] = 0;
+                    au4_idx_tl[i] = 0;
+                    au4_ilf_across_tile_slice_enable[i] = 1;
+                }
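+                /* Neighbor index convention for au1_avail_luma / au4_idx_tl:
+                 * 0 left, 1 right, 2 top, 3 bottom, 4 top-left, 5 top-right,
+                 * 6 bottom-left, 7 bottom-right. */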
+
+                /******************************************************************
+                 * Derive the slice indices of the top-left CTB's neighbor pixels.
+                 *
+                 *          TL_T
+                 *       4  _2__5________
+                 *     0   |    |       |
+                 *    TL_L | TL | 1 TL_R|
+                 *         |____|_______|____
+                 *        6|TL_D|7      |    |
+                 *         | 3  |       |    |
+                 *         |____|_______|    |
+                 *              |            |
+                 *              |            |
+                 *              |____________|
+                 *
+                 *****************************************************************/
+
+                /* Slice/tile handling: taken only for streams with multiple slices or with tiles enabled */
+                {
+                    if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+                    {
+                        {
+                            /*Assuming that sao shift is uniform along x and y directions*/
+                            if((0 == (1 << log2_ctb_size) - sao_wd_luma) && (ps_sao_ctxt->i4_ctb_y > 1) && (ps_sao_ctxt->i4_ctb_x > 1))
+                            {
+                                ctby_tl_t = ps_sao_ctxt->i4_ctb_y - 2;
+                                ctbx_tl_l = ps_sao_ctxt->i4_ctb_x - 2;
+                            }
+                            else if(!(0 == (1 << log2_ctb_size) - sao_wd_luma))
+                            {
+                                ctby_tl_t = ps_sao_ctxt->i4_ctb_y - 1;
+                                ctbx_tl_l = ps_sao_ctxt->i4_ctb_x - 1;
+                            }
+                            ctbx_tl_t = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_tl_l = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_tl_r = ps_sao_ctxt->i4_ctb_x;
+                            ctby_tl_r = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_tl_d =  ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_tl_d =  ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_tl = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_tl = ps_sao_ctxt->i4_ctb_y - 1;
+                        }
+
+                        if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+                        {
+                            /*Calculate slice indices for neighbor pixels*/
+                            idx_tl = pu1_slice_idx[ctbx_tl + (ctby_tl * ps_sps->i2_pic_wd_in_ctb)];
+                            au4_idx_tl[2] = au4_idx_tl[4] = pu1_slice_idx[ctbx_tl_t + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+                            au4_idx_tl[0] = pu1_slice_idx[ctbx_tl_l + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+                            au4_idx_tl[1] = au4_idx_tl[5] = pu1_slice_idx[ctbx_tl_r + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+                            au4_idx_tl[3] = au4_idx_tl[6] = pu1_slice_idx[ctbx_tl_d + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+                            au4_idx_tl[7] = pu1_slice_idx[ctbx_tl_d + 1 + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                            if((0 == (1 << log2_ctb_size) - sao_wd_luma))
+                            {
+                                if(ps_sao_ctxt->i4_ctb_x == 1)
+                                {
+                                    au4_idx_tl[6] = -1;
+                                    au4_idx_tl[4] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_tl[6] = pu1_slice_idx[(ctbx_tl_d - 1) + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+                                if(ps_sao_ctxt->i4_ctb_y == 1)
+                                {
+                                    au4_idx_tl[5] = -1;
+                                    au4_idx_tl[4] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_tl[5] = pu1_slice_idx[(ctbx_tl_l + 1) + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[4] = pu1_slice_idx[(ctbx_tl_t - 1) + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+                                au4_idx_tl[7] = pu1_slice_idx[(ctbx_tl_d + 1) + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+                            }
+
+                            /* Verify that the neighbor CTBs don't cross the picture boundary.
+                             * Between each neighbor and the current CTB, the
+                             * i1_slice_loop_filter_across_slices_enabled_flag of the CTB with
+                             * the greater address is checked and the availability flags are set
+                             * accordingly: for the top and left neighbors the current CTB's
+                             * flag is used; for the right and bottom neighbors, that neighbor's
+                             * own flag is used.
+                             */
+
+                            if((0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma))
+                            {
+                                au4_ilf_across_tile_slice_enable[4] = 0;
+                                au4_ilf_across_tile_slice_enable[6] = 0;
+                            }
+                            else
+                            {
+                                au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + au4_idx_tl[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+                            }
+                            if((0 == (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - sao_ht_luma))
+                            {
+                                au4_ilf_across_tile_slice_enable[5] = 0;
+                                au4_ilf_across_tile_slice_enable[4] = 0;
+                            }
+                            else
+                            {
+                                au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[4] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+                            }
+                            au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+                            au4_ilf_across_tile_slice_enable[0] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+                            au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_tl[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+                            au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_tl[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+                            au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_tl[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+
+                            for(i = 0; i < 8; i++)
+                            {
+                                /*Sets the edges that lie on the slice/tile boundary*/
+                                if(au4_idx_tl[i] != idx_tl)
+                                {
+                                    au1_tile_slice_boundary[i] = 1;
+                                }
+                                else
+                                {
+                                    au4_ilf_across_tile_slice_enable[i] = 1;
+                                }
+                            }
+
+                            ps_codec->s_func_selector.ihevc_memset_mul_8_fptr((UWORD8 *)au4_idx_tl, 0, 8 * sizeof(WORD32));
+                        }
+
+                        if(ps_pps->i1_tiles_enabled_flag)
+                        {
+                            /* Calculate availability flags at tile boundary */
+                            if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+                            {
+                                /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+                                if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+                                {
+                                    /*Set the boundary arrays*/
+                                    /*Calculate tile indices for neighbor pixels*/
+                                    idx_tl = pu1_tile_idx[ctbx_tl + (ctby_tl * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[2] = au4_idx_tl[4] = pu1_tile_idx[ctbx_tl_t + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[0] = pu1_tile_idx[ctbx_tl_l + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[1] = au4_idx_tl[5] = pu1_tile_idx[ctbx_tl_r + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[3] = au4_idx_tl[6] = pu1_tile_idx[ctbx_tl_d + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[7] = pu1_tile_idx[ctbx_tl_d + 1 + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                    if((0 == (1 << log2_ctb_size) - sao_wd_luma))
+                                    {
+                                        if(ps_sao_ctxt->i4_ctb_x == 1)
+                                        {
+                                            au4_idx_tl[6] = -1;
+                                            au4_idx_tl[4] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_tl[6] = pu1_tile_idx[(ctbx_tl_d - 1) + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+                                        if(ps_sao_ctxt->i4_ctb_y == 1)
+                                        {
+                                            au4_idx_tl[5] = -1;
+                                            au4_idx_tl[4] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_tl[5] = pu1_tile_idx[(ctbx_tl_l + 1) + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+                                            au4_idx_tl[4] = pu1_tile_idx[(ctbx_tl_t - 1) + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+                                        au4_idx_tl[7] = pu1_tile_idx[(ctbx_tl_d + 1) + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+                                    }
+                                    for(i = 0; i < 8; i++)
+                                    {
+                                        /*Sets the edges that lie on the tile boundary*/
+                                        if(au4_idx_tl[i] != idx_tl)
+                                        {
+                                            au1_tile_slice_boundary[i] |= 1;
+                                            au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag; //=0
+                                        }
+                                    }
+                                }
+                            }
+                        }
+
+
+                        /*Set availability flags based on tile and slice boundaries*/
+                        for(i = 0; i < 8; i++)
+                        {
+                            /*Sets the edges that lie on the slice/tile boundary*/
+                            if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+                            {
+                                au1_avail_luma[i] = 0;
+                            }
+                        }
+                    }
+                }
+
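+                /* Picture-boundary overrides: edges that fall outside the
+                 * picture are marked unavailable regardless of the slice/tile
+                 * flags derived above. */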
+                if(0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma)
+                {
+                    au1_avail_luma[0] = 0;
+                    au1_avail_luma[4] = 0;
+                    au1_avail_luma[6] = 0;
+                }
+
+                if(ps_sps->i2_pic_wd_in_ctb == ps_sao_ctxt->i4_ctb_x)
+                {
+                    au1_avail_luma[1] = 0;
+                    au1_avail_luma[5] = 0;
+                    au1_avail_luma[7] = 0;
+                }
+                //y==1 case
+                if((0 == (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - sao_ht_luma))
+                {
+                    au1_avail_luma[2] = 0;
+                    au1_avail_luma[4] = 0;
+                    au1_avail_luma[5] = 0;
+                }
+                if(ps_sps->i2_pic_ht_in_ctb == ps_sao_ctxt->i4_ctb_y)
+                {
+                    au1_avail_luma[3] = 0;
+                    au1_avail_luma[6] = 0;
+                    au1_avail_luma[7] = 0;
+                }
+
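+                /* Capture the top-right and bottom-left diagonal neighbors
+                 * from the saved top row and left column before invoking the
+                 * edge-offset kernel. */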
+                {
+                    au1_src_top_right[0] = pu1_src_top_luma[sao_wd_luma];
+                    u1_sao_src_top_left_luma_bot_left = pu1_src_left_luma[sao_ht_luma];
+                    ps_codec->apf_sao_luma[ps_sao->b3_y_type_idx - 2](pu1_src_luma,
+                                                                      src_strd,
+                                                                      pu1_src_left_luma,
+                                                                      pu1_src_top_luma,
+                                                                      pu1_sao_src_luma_top_left_ctb,
+                                                                      au1_src_top_right,
+                                                                      &u1_sao_src_top_left_luma_bot_left,
+                                                                      au1_avail_luma,
+                                                                      ai1_offset_y,
+                                                                      sao_wd_luma,
+                                                                      sao_ht_luma);
+                }
+            }
+
+        }
+
+        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+        {
+            if(0 == ps_sao->b3_cb_type_idx)
+            {
+                for(row = 0; row < sao_ht_chroma; row++)
+                {
+                    pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+                    pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+                }
+                pu1_sao_src_chroma_top_left_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+                pu1_sao_src_chroma_top_left_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+                ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+
+            }
+
+            else if(1 == ps_sao->b3_cb_type_idx)
+            {
+                ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+                ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+                ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+                ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+                ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+                ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+                ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+                ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
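+                /* For VU-interleaved (NV21-style) chroma, the Cr and Cb band
+                 * positions and offset tables are passed in swapped order. */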
+                if(chroma_yuv420sp_vu)
+                {
+                    ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+                                                                                src_strd,
+                                                                                pu1_src_left_chroma,
+                                                                                pu1_src_top_chroma,
+                                                                                pu1_sao_src_chroma_top_left_ctb,
+                                                                                ps_sao->b5_cr_band_pos,
+                                                                                ps_sao->b5_cb_band_pos,
+                                                                                ai1_offset_cr,
+                                                                                ai1_offset_cb,
+                                                                                sao_wd_chroma,
+                                                                                sao_ht_chroma
+                                                                               );
+                }
+                else
+                {
+                    ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+                                                                                src_strd,
+                                                                                pu1_src_left_chroma,
+                                                                                pu1_src_top_chroma,
+                                                                                pu1_sao_src_chroma_top_left_ctb,
+                                                                                ps_sao->b5_cb_band_pos,
+                                                                                ps_sao->b5_cr_band_pos,
+                                                                                ai1_offset_cb,
+                                                                                ai1_offset_cr,
+                                                                                sao_wd_chroma,
+                                                                                sao_ht_chroma
+                                                                               );
+                }
+            }
+
+            else // if(2 <= ps_sao->b3_cb_type_idx)
+            {
+                ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+                ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+                ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+                ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+                ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+                ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+                ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+                ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+                for(i = 0; i < 8; i++)
+                {
+                    au1_avail_chroma[i] = 255;
+                    au1_tile_slice_boundary[i] = 0;
+                    au4_idx_tl[i] = 0;
+                    au4_ilf_across_tile_slice_enable[i] = 1;
+                }
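+                /* The derivation below mirrors the luma case; only the widths
+                 * and heights differ because chroma samples are interleaved
+                 * and vertically subsampled. */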
+                /* Slice/tile handling */
+                {
+                    if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+                    {
+                        if((0 == (1 << log2_ctb_size) - sao_wd_chroma) && (ps_sao_ctxt->i4_ctb_y > 1) && (ps_sao_ctxt->i4_ctb_x > 1))
+                        {
+                            ctby_tl_t = ps_sao_ctxt->i4_ctb_y - 2;
+                            ctbx_tl_l = ps_sao_ctxt->i4_ctb_x - 2;
+                        }
+                        else if(!(0 == (1 << log2_ctb_size) - sao_wd_chroma))
+                        {
+                            ctby_tl_t = ps_sao_ctxt->i4_ctb_y - 1;
+                            ctbx_tl_l = ps_sao_ctxt->i4_ctb_x - 1;
+                        }
+                        ctbx_tl_t = ps_sao_ctxt->i4_ctb_x - 1;
+                        ctby_tl_l = ps_sao_ctxt->i4_ctb_y - 1;
+
+                        ctbx_tl_r = ps_sao_ctxt->i4_ctb_x;
+                        ctby_tl_r = ps_sao_ctxt->i4_ctb_y - 1;
+
+                        ctbx_tl_d =  ps_sao_ctxt->i4_ctb_x - 1;
+                        ctby_tl_d =  ps_sao_ctxt->i4_ctb_y;
+
+                        ctbx_tl = ps_sao_ctxt->i4_ctb_x - 1;
+                        ctby_tl = ps_sao_ctxt->i4_ctb_y - 1;
+
+                        if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+                        {
+
+                            idx_tl = pu1_slice_idx[ctbx_tl + (ctby_tl * ps_sps->i2_pic_wd_in_ctb)];
+                            au4_idx_tl[2] = au4_idx_tl[4] = pu1_slice_idx[ctbx_tl_t + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+                            au4_idx_tl[0] = pu1_slice_idx[ctbx_tl_l + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+                            au4_idx_tl[1] = au4_idx_tl[5] = pu1_slice_idx[ctbx_tl_r + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+                            au4_idx_tl[3] = au4_idx_tl[6] = pu1_slice_idx[ctbx_tl_d + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+                            au4_idx_tl[7] = pu1_slice_idx[ctbx_tl_d + 1 + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                            if((0 == (1 << log2_ctb_size) - sao_wd_chroma))
+                            {
+                                if(ps_sao_ctxt->i4_ctb_x == 1)
+                                {
+                                    au4_idx_tl[6] = -1;
+                                    au4_idx_tl[4] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_tl[6] = pu1_slice_idx[(ctbx_tl_d - 1) + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+                                if(ps_sao_ctxt->i4_ctb_y == 1)
+                                {
+                                    au4_idx_tl[5] = -1;
+                                    au4_idx_tl[4] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_tl[5] = pu1_slice_idx[(ctbx_tl_l + 1) + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[4] = pu1_slice_idx[(ctbx_tl_t - 1) + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+                                au4_idx_tl[7] = pu1_slice_idx[(ctbx_tl_d + 1) + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+                            }
+
+                            /* Verify that the neighbor CTBs don't cross the picture boundary.
+                             * As above, the ILF flag of the CTB with the greater address is the
+                             * one that is checked. */
+                            if((0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma))
+                            {
+                                au4_ilf_across_tile_slice_enable[4] = 0;
+                                au4_ilf_across_tile_slice_enable[6] = 0;
+                            }
+                            else
+                            {
+                                au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + au4_idx_tl[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+                            }
+                            if((0 == (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) - sao_ht_chroma))
+                            {
+                                au4_ilf_across_tile_slice_enable[5] = 0;
+                                au4_ilf_across_tile_slice_enable[4] = 0;
+                            }
+                            else
+                            {
+                                au4_ilf_across_tile_slice_enable[4] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+                            }
+                            au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+                            au4_ilf_across_tile_slice_enable[0] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+                            au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_tl[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+                            au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_tl[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+                            au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_tl[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+                            /*
+                             * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
+                             * of the pixel having a greater address is checked. Accordingly, set the availability flags
+                             */
+                            for(i = 0; i < 8; i++)
+                            {
+                                /*Sets the edges that lie on the slice/tile boundary*/
+                                if(au4_idx_tl[i] != idx_tl)
+                                {
+                                    au1_tile_slice_boundary[i] = 1;
+                                }
+                                else
+                                {
+                                    au4_ilf_across_tile_slice_enable[i] = 1;
+                                }
+                            }
+
+                            /*Reset indices*/
+                            for(i = 0; i < 8; i++)
+                            {
+                                au4_idx_tl[i] = 0;
+                            }
+                        }
+                        if(ps_pps->i1_tiles_enabled_flag)
+                        {
+                            /* Calculate availability flags at tile boundary */
+                            if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+                            {
+                                /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+                                if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+                                {
+                                    /*Set the boundary arrays*/
+                                    /*Calculate tile indices for neighbor pixels*/
+                                    idx_tl = pu1_tile_idx[ctbx_tl + (ctby_tl * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[2] = au4_idx_tl[4] = pu1_tile_idx[ctbx_tl_t + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[0] = pu1_tile_idx[ctbx_tl_l + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[1] = au4_idx_tl[5] = pu1_tile_idx[ctbx_tl_r + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[3] = au4_idx_tl[6] = pu1_tile_idx[ctbx_tl_d + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_tl[7] = pu1_tile_idx[ctbx_tl_d + 1 + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                    if((0 == (1 << log2_ctb_size) - sao_wd_chroma))
+                                    {
+                                        if(ps_sao_ctxt->i4_ctb_x == 1)
+                                        {
+                                            au4_idx_tl[6] = -1;
+                                            au4_idx_tl[4] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_tl[6] = pu1_tile_idx[(ctbx_tl_d - 1) + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+                                        if(ps_sao_ctxt->i4_ctb_y == 1)
+                                        {
+                                            au4_idx_tl[5] = -1;
+                                            au4_idx_tl[4] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_tl[5] = pu1_tile_idx[(ctbx_tl_l + 1) + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+                                            au4_idx_tl[4] = pu1_tile_idx[(ctbx_tl_t - 1) + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+                                        au4_idx_tl[7] = pu1_tile_idx[(ctbx_tl_d + 1) + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+                                    }
+                                    for(i = 0; i < 8; i++)
+                                    {
+                                        /*Sets the edges that lie on the tile boundary*/
+                                        if(au4_idx_tl[i] != idx_tl)
+                                        {
+                                            au1_tile_slice_boundary[i] |= 1;
+                                            au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag; //=0
+                                        }
+                                    }
+                                }
+                            }
+                        }
+
+                        for(i = 0; i < 8; i++)
+                        {
+                            /*Sets the edges that lie on the slice/tile boundary*/
+                            if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+                            {
+                                au1_avail_chroma[i] = 0;
+                            }
+                        }
+                    }
+                }
+
+                if(0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma)
+                {
+                    au1_avail_chroma[0] = 0;
+                    au1_avail_chroma[4] = 0;
+                    au1_avail_chroma[6] = 0;
+                }
+                if(ps_sps->i2_pic_wd_in_ctb == ps_sao_ctxt->i4_ctb_x)
+                {
+                    au1_avail_chroma[1] = 0;
+                    au1_avail_chroma[5] = 0;
+                    au1_avail_chroma[7] = 0;
+                }
+
+                if(0 == (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) - sao_ht_chroma)
+                {
+                    au1_avail_chroma[2] = 0;
+                    au1_avail_chroma[4] = 0;
+                    au1_avail_chroma[5] = 0;
+                }
+                if(ps_sps->i2_pic_ht_in_ctb == ps_sao_ctxt->i4_ctb_y)
+                {
+                    au1_avail_chroma[3] = 0;
+                    au1_avail_chroma[6] = 0;
+                    au1_avail_chroma[7] = 0;
+                }
+
+                {
+                    au1_src_top_right[0] = pu1_src_top_chroma[sao_wd_chroma];
+                    au1_src_top_right[1] = pu1_src_top_chroma[sao_wd_chroma + 1];
+                    au1_sao_src_top_left_chroma_bot_left[0] = pu1_src_left_chroma[2 * sao_ht_chroma];
+                    au1_sao_src_top_left_chroma_bot_left[1] = pu1_src_left_chroma[2 * sao_ht_chroma + 1];
+                    if((ctb_size == 16) && (ps_sao_ctxt->i4_ctb_y != ps_sps->i2_pic_ht_in_ctb - 1))
+                    {
+                        au1_sao_src_top_left_chroma_bot_left[0] = pu1_src_chroma[sao_ht_chroma * src_strd - 2];
+                        au1_sao_src_top_left_chroma_bot_left[1] = pu1_src_chroma[sao_ht_chroma * src_strd - 1];
+                    }
+
+                    if(chroma_yuv420sp_vu)
+                    {
+                        ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+                                                                             src_strd,
+                                                                             pu1_src_left_chroma,
+                                                                             pu1_src_top_chroma,
+                                                                             pu1_sao_src_chroma_top_left_ctb,
+                                                                             au1_src_top_right,
+                                                                             au1_sao_src_top_left_chroma_bot_left,
+                                                                             au1_avail_chroma,
+                                                                             ai1_offset_cr,
+                                                                             ai1_offset_cb,
+                                                                             sao_wd_chroma,
+                                                                             sao_ht_chroma);
+                    }
+                    else
+                    {
+                        ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+                                                                             src_strd,
+                                                                             pu1_src_left_chroma,
+                                                                             pu1_src_top_chroma,
+                                                                             pu1_sao_src_chroma_top_left_ctb,
+                                                                             au1_src_top_right,
+                                                                             au1_sao_src_top_left_chroma_bot_left,
+                                                                             au1_avail_chroma,
+                                                                             ai1_offset_cb,
+                                                                             ai1_offset_cr,
+                                                                             sao_wd_chroma,
+                                                                             sao_ht_chroma);
+                    }
+                }
+            }
+        }
+
+        pu1_src_luma += sao_wd_luma + sao_ht_luma * src_strd;
+        pu1_src_chroma += sao_wd_chroma + sao_ht_chroma * src_strd;
+        ps_sao += (1 + ps_sps->i2_pic_wd_in_ctb);
+    }
+
+
+    /* Top CTB */
+    if((ps_sao_ctxt->i4_ctb_y > 0))
+    {
+        WORD32 sao_wd_luma = ctb_size - SAO_SHIFT_CTB;
+        WORD32 sao_wd_chroma = ctb_size - 2 * SAO_SHIFT_CTB;
+        WORD32 sao_ht_luma = SAO_SHIFT_CTB;
+        WORD32 sao_ht_chroma = SAO_SHIFT_CTB;
+
+        WORD32 ctbx_t_t = 0, ctbx_t_l = 0, ctbx_t_r = 0, ctbx_t_d = 0, ctbx_t = 0;
+        WORD32 ctby_t_t = 0, ctby_t_l = 0, ctby_t_r = 0, ctby_t_d = 0, ctby_t = 0;
+        WORD32 au4_idx_t[8], idx_t;
+
+        WORD32 remaining_cols;
+
+        remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + sao_wd_luma);
+        if(remaining_cols <= SAO_SHIFT_CTB)
+        {
+            sao_wd_luma += remaining_cols;
+        }
+        remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + sao_wd_chroma);
+        if(remaining_cols <= 2 * SAO_SHIFT_CTB)
+        {
+            sao_wd_chroma += remaining_cols;
+        }
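+        /* At the right picture edge there is no further CTB to complete the
+         * deferred stripe, so the remaining columns are folded into this
+         * pass. */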
+
+        pu1_src_luma -= (sao_ht_luma * src_strd);
+        pu1_src_chroma -= (sao_ht_chroma * src_strd);
+        ps_sao -= (ps_sps->i2_pic_wd_in_ctb);
+        pu1_src_top_luma = ps_sao_ctxt->pu1_sao_src_top_luma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+        pu1_src_top_chroma = ps_sao_ctxt->pu1_sao_src_top_chroma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+        pu1_src_left_luma = ps_sao_ctxt->pu1_sao_src_left_luma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - sao_ht_luma;
+        pu1_src_left_chroma = ps_sao_ctxt->pu1_sao_src_left_chroma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - (2 * sao_ht_chroma);
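+        /* Deferred top stripe: the SAO_SHIFT_CTB rows above the current CTB
+         * row, excluding the corner region handled by the top-left case
+         * above. */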
+
+        if(0 != sao_wd_luma)
+        {
+            if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+            {
+                if(0 == ps_sao->b3_y_type_idx)
+                {
+                    /* Update left, top and top-left */
+                    for(row = 0; row < sao_ht_luma; row++)
+                    {
+                        pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+                    }
+                    pu1_sao_src_luma_top_left_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+                    ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+
+                }
+
+                else if(1 == ps_sao->b3_y_type_idx)
+                {
+                    ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+                    ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+                    ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+                    ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+                    ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr(pu1_src_luma,
+                                                                              src_strd,
+                                                                              pu1_src_left_luma,
+                                                                              pu1_src_top_luma,
+                                                                              pu1_sao_src_luma_top_left_ctb,
+                                                                              ps_sao->b5_y_band_pos,
+                                                                              ai1_offset_y,
+                                                                              sao_wd_luma,
+                                                                              sao_ht_luma
+                                                                             );
+                }
+
+                else // if(2 <= ps_sao->b3_y_type_idx)
+                {
+                    ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+                    ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+                    ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+                    ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+                    ps_codec->s_func_selector.ihevc_memset_mul_8_fptr(au1_avail_luma, 255, 8);
+                    ps_codec->s_func_selector.ihevc_memset_mul_8_fptr(au1_tile_slice_boundary, 0, 8);
+                    ps_codec->s_func_selector.ihevc_memset_mul_8_fptr((UWORD8 *)au4_idx_t, 0, 8 * sizeof(WORD32));
+
+                    for(i = 0; i < 8; i++)
+                    {
+                        au4_ilf_across_tile_slice_enable[i] = 1;
+                    }
+                    /******************************************************************
+                     * Derive the slice indices of the top CTB's neighbor pixels.
+                     *
+                     *               T_T
+                     *          ____________
+                     *         |    |       |
+                     *         | T_L|  T    |T_R
+                     *         |    | ______|____
+                     *         |    |  T_D  |    |
+                     *         |    |       |    |
+                     *         |____|_______|    |
+                     *              |            |
+                     *              |            |
+                     *              |____________|
+                     *
+                     *****************************************************************/
+
+                    /* Slice/tile handling */
+                    {
+                        if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+                        {
+
+                            ctbx_t_t = ps_sao_ctxt->i4_ctb_x;
+                            ctby_t_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_t_l = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_t_l = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_t_r = ps_sao_ctxt->i4_ctb_x;
+                            ctby_t_r = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_t_d =  ps_sao_ctxt->i4_ctb_x;
+                            ctby_t_d =  ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_t = ps_sao_ctxt->i4_ctb_x;
+                            ctby_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+                            {
+                                /*Calculate neighbor ctb slice indices*/
+                                if(0 == ps_sao_ctxt->i4_ctb_x)
+                                {
+                                    au4_idx_t[0] = -1;
+                                    au4_idx_t[6] = -1;
+                                    au4_idx_t[4] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_t[0] = au4_idx_t[4] = pu1_slice_idx[ctbx_t_l + (ctby_t_l * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_t[6] = pu1_slice_idx[ctbx_t_d - 1 + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+                                idx_t   = pu1_slice_idx[ctbx_t + (ctby_t * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_t[2] = au4_idx_t[5] = pu1_slice_idx[ctbx_t_t + (ctby_t_t * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_t[1] = pu1_slice_idx[ctbx_t_r + (ctby_t_r * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_t[3] = au4_idx_t[7] = pu1_slice_idx[ctbx_t_d + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                /*Verify that the neighbor ctbs don't cross pic boundary.*/
+                                if(0 == ps_sao_ctxt->i4_ctb_x)
+                                {
+                                    au4_ilf_across_tile_slice_enable[4] = 0;
+                                    au4_ilf_across_tile_slice_enable[6] = 0;
+                                    au4_ilf_across_tile_slice_enable[0] = 0;
+                                }
+                                else
+                                {
+                                    au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[0] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                    au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + au4_idx_t[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                }
+
+
+
+                                au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_t[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_t[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_t[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                /*
+                                 * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
+                                 * of the pixel having a greater address is checked. Accordingly, set the availability flags
+                                 */
+
+                                for(i = 0; i < 8; i++)
+                                {
+                                    /*Sets the edges that lie on the slice/tile boundary*/
+                                    if(au4_idx_t[i] != idx_t)
+                                    {
+                                        au1_tile_slice_boundary[i] = 1;
+                                        /*Check for slice flag at such boundaries*/
+                                    }
+                                    else
+                                    {
+                                        au4_ilf_across_tile_slice_enable[i] = 1;
+                                    }
+                                }
+                                /*Reset indices*/
+                                for(i = 0; i < 8; i++)
+                                {
+                                    au4_idx_t[i] = 0;
+                                }
+                            }
+
+                            if(ps_pps->i1_tiles_enabled_flag)
+                            {
+                                /* Calculate availability flags at tile boundary */
+                                if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+                                {
+                                    /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+                                    if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+                                    {
+                                        /*Calculate neighbor ctb slice indices*/
+                                        if(0 == ps_sao_ctxt->i4_ctb_x)
+                                        {
+                                            au4_idx_t[0] = -1;
+                                            au4_idx_t[6] = -1;
+                                            au4_idx_t[4] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_t[0] = au4_idx_t[4] = pu1_tile_idx[ctbx_t_l + (ctby_t_l * ps_sps->i2_pic_wd_in_ctb)];
+                                            au4_idx_t[6] = pu1_tile_idx[ctbx_t_d - 1 + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+                                        idx_t   = pu1_tile_idx[ctbx_t + (ctby_t * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_t[2] = au4_idx_t[5] = pu1_tile_idx[ctbx_t_t + (ctby_t_t * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_t[1] = pu1_tile_idx[ctbx_t_r + (ctby_t_r * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_t[3] = au4_idx_t[7] = pu1_tile_idx[ctbx_t_d + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                        for(i = 0; i < 8; i++)
+                                        {
+                                            /*Sets the edges that lie on the tile boundary*/
+                                            if(au4_idx_t[i] != idx_t)
+                                            {
+                                                au1_tile_slice_boundary[i] |= 1;
+                                                au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag;
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+
+                            for(i = 0; i < 8; i++)
+                            {
+                                /*Sets the edges that lie on the slice/tile boundary*/
+                                if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+                                {
+                                    au1_avail_luma[i] = 0;
+                                }
+                            }
+                        }
+                    }
+
+
+                    if(0 == ps_sao_ctxt->i4_ctb_x)
+                    {
+                        au1_avail_luma[0] = 0;
+                        au1_avail_luma[4] = 0;
+                        au1_avail_luma[6] = 0;
+                    }
+
+                    if(ps_sps->i2_pic_width_in_luma_samples - (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) <= sao_wd_luma)
+                    {
+                        au1_avail_luma[1] = 0;
+                        au1_avail_luma[5] = 0;
+                        au1_avail_luma[7] = 0;
+                    }
+
+                    if(0 == (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - sao_ht_luma)
+                    {
+                        au1_avail_luma[2] = 0;
+                        au1_avail_luma[4] = 0;
+                        au1_avail_luma[5] = 0;
+                    }
+
+                    if(ps_sps->i2_pic_ht_in_ctb == ps_sao_ctxt->i4_ctb_y)
+                    {
+                        au1_avail_luma[3] = 0;
+                        au1_avail_luma[6] = 0;
+                        au1_avail_luma[7] = 0;
+                    }
+
+                    {
+                        au1_src_top_right[0] = pu1_sao_src_top_left_luma_top_right[0];
+                        u1_sao_src_top_left_luma_bot_left = pu1_src_luma[sao_ht_luma * src_strd - 1];
+                        ps_codec->apf_sao_luma[ps_sao->b3_y_type_idx - 2](pu1_src_luma,
+                                                                          src_strd,
+                                                                          pu1_src_left_luma,
+                                                                          pu1_src_top_luma,
+                                                                          pu1_sao_src_luma_top_left_ctb,
+                                                                          au1_src_top_right,
+                                                                          &u1_sao_src_top_left_luma_bot_left,
+                                                                          au1_avail_luma,
+                                                                          ai1_offset_y,
+                                                                          sao_wd_luma,
+                                                                          sao_ht_luma);
+                    }
+                }
+            }
+        }
+
+        if(0 != sao_wd_chroma)
+        {
+            if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+            {
+                if(0 == ps_sao->b3_cb_type_idx)
+                {
+
+                    for(row = 0; row < sao_ht_chroma; row++)
+                    {
+                        pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+                        pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+                    }
+                    pu1_sao_src_chroma_top_left_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+                    pu1_sao_src_chroma_top_left_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+                    ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+
+                }
+
+                else if(1 == ps_sao->b3_cb_type_idx)
+                {
+                    ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+                    ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+                    ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+                    ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+                    ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+                    ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+                    ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+                    ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
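+                    /* For semi-planar chroma stored VU (NV21-style) instead of UV, the Cb
+                     * and Cr band positions and offset tables are swapped so each plane
+                     * still receives its own offsets. */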
+                    if(chroma_yuv420sp_vu)
+                    {
+                        ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+                                                                                    src_strd,
+                                                                                    pu1_src_left_chroma,
+                                                                                    pu1_src_top_chroma,
+                                                                                    pu1_sao_src_chroma_top_left_ctb,
+                                                                                    ps_sao->b5_cr_band_pos,
+                                                                                    ps_sao->b5_cb_band_pos,
+                                                                                    ai1_offset_cr,
+                                                                                    ai1_offset_cb,
+                                                                                    sao_wd_chroma,
+                                                                                    sao_ht_chroma
+                                                                                   );
+                    }
+                    else
+                    {
+                        ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+                                                                                    src_strd,
+                                                                                    pu1_src_left_chroma,
+                                                                                    pu1_src_top_chroma,
+                                                                                    pu1_sao_src_chroma_top_left_ctb,
+                                                                                    ps_sao->b5_cb_band_pos,
+                                                                                    ps_sao->b5_cr_band_pos,
+                                                                                    ai1_offset_cb,
+                                                                                    ai1_offset_cr,
+                                                                                    sao_wd_chroma,
+                                                                                    sao_ht_chroma
+                                                                                   );
+                    }
+                }
+                else // if(2 <= ps_sao->b3_cb_type_idx)
+                {
+                    ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+                    ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+                    ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+                    ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+                    ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+                    ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+                    ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+                    ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+                    for(i = 0; i < 8; i++)
+                    {
+                        au1_avail_chroma[i] = 255;
+                        au1_tile_slice_boundary[i] = 0;
+                        au4_idx_t[i] = 0;
+                        au4_ilf_across_tile_slice_enable[i] = 1;
+                    }
+
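+                    /* Chroma edge offset: the slice/tile availability derivation below
+                     * mirrors the luma path above, using the same neighbour indices. */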
+                    {
+                        if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+                        {
+                            ctbx_t_t = ps_sao_ctxt->i4_ctb_x;
+                            ctby_t_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_t_l = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_t_l = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_t_r = ps_sao_ctxt->i4_ctb_x;
+                            ctby_t_r = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_t_d =  ps_sao_ctxt->i4_ctb_x;
+                            ctby_t_d =  ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_t = ps_sao_ctxt->i4_ctb_x;
+                            ctby_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+                            {
+                                if(0 == ps_sao_ctxt->i4_ctb_x)
+                                {
+                                    au4_idx_t[0] = -1;
+                                    au4_idx_t[6] = -1;
+                                    au4_idx_t[4] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_t[0] = au4_idx_t[4] = pu1_slice_idx[ctbx_t_l + (ctby_t_l * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_t[6] = pu1_slice_idx[ctbx_t_d - 1 + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+                                idx_t   = pu1_slice_idx[ctbx_t + (ctby_t * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_t[2] = au4_idx_t[5] = pu1_slice_idx[ctbx_t_t + (ctby_t_t * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_t[1] = pu1_slice_idx[ctbx_t_r + (ctby_t_r * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_t[3] = au4_idx_t[7] = pu1_slice_idx[ctbx_t_d + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                /*Verify that the neighbor ctbs don't cross pic boundary.*/
+
+                                if(0 == ps_sao_ctxt->i4_ctb_x)
+                                {
+                                    au4_ilf_across_tile_slice_enable[4] = 0;
+                                    au4_ilf_across_tile_slice_enable[6] = 0;
+                                    au4_ilf_across_tile_slice_enable[0] = 0;
+                                }
+                                else
+                                {
+                                    au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[0] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                    au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + au4_idx_t[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                }
+
+                                au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_t[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_t[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_t[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                /*
+                                 * Between each neighbour and the current CTB, the
+                                 * i1_slice_loop_filter_across_slices_enabled_flag of the slice
+                                 * containing the pixel with the greater address is checked; the
+                                 * availability flags are set accordingly.
+                                 */
+                                for(i = 0; i < 8; i++)
+                                {
+                                    /*Sets the edges that lie on the slice/tile boundary*/
+                                    if(au4_idx_t[i] != idx_t)
+                                    {
+                                        au1_tile_slice_boundary[i] = 1;
+                                    }
+                                    else
+                                    {
+                                        /*Indicates that the neighbour belongs to same/dependent slice*/
+                                        au4_ilf_across_tile_slice_enable[i] = 1;
+                                    }
+                                }
+                                /*Reset indices*/
+                                for(i = 0; i < 8; i++)
+                                {
+                                    au4_idx_t[i] = 0;
+                                }
+                            }
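+                            /* The tile pass below ORs onto the slice results: an edge already
+                             * marked as a slice boundary stays marked, and the ILF enable is
+                             * ANDed with the across-tiles flag. */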
+                            if(ps_pps->i1_tiles_enabled_flag)
+                            {
+                                /* Calculate availability flags at tile boundary */
+                                if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+                                {
+                                    /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+                                    if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+                                    {
+                                        /*Calculate neighbor ctb slice indices*/
+                                        if(0 == ps_sao_ctxt->i4_ctb_x)
+                                        {
+                                            au4_idx_t[0] = -1;
+                                            au4_idx_t[6] = -1;
+                                            au4_idx_t[4] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_t[0] = au4_idx_t[4] = pu1_tile_idx[ctbx_t_l + (ctby_t_l * ps_sps->i2_pic_wd_in_ctb)];
+                                            au4_idx_t[6] = pu1_tile_idx[ctbx_t_d - 1 + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+                                        idx_t   = pu1_tile_idx[ctbx_t + (ctby_t * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_t[2] = au4_idx_t[5] = pu1_tile_idx[ctbx_t_t + (ctby_t_t * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_t[1] = pu1_tile_idx[ctbx_t_r + (ctby_t_r * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_t[3] = au4_idx_t[7] = pu1_tile_idx[ctbx_t_d + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                        for(i = 0; i < 8; i++)
+                                        {
+                                            /*Sets the edges that lie on the tile boundary*/
+                                            if(au4_idx_t[i] != idx_t)
+                                            {
+                                                au1_tile_slice_boundary[i] |= 1;
+                                                au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag;
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                            for(i = 0; i < 8; i++)
+                            {
+                                /*Clear the availability of edges on a slice/tile boundary when ILF across it is disabled*/
+                                if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+                                {
+                                    au1_avail_chroma[i] = 0;
+                                }
+                            }
+
+                        }
+                    }
+                    if(0 == ps_sao_ctxt->i4_ctb_x)
+                    {
+                        au1_avail_chroma[0] = 0;
+                        au1_avail_chroma[4] = 0;
+                        au1_avail_chroma[6] = 0;
+                    }
+
+                    if(ps_sps->i2_pic_width_in_luma_samples - (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) <= sao_wd_chroma)
+                    {
+                        au1_avail_chroma[1] = 0;
+                        au1_avail_chroma[5] = 0;
+                        au1_avail_chroma[7] = 0;
+                    }
+
+                    if(0 == (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) - sao_ht_chroma)
+                    {
+                        au1_avail_chroma[2] = 0;
+                        au1_avail_chroma[4] = 0;
+                        au1_avail_chroma[5] = 0;
+                    }
+
+                    if(ps_sps->i2_pic_ht_in_ctb == ps_sao_ctxt->i4_ctb_y)
+                    {
+                        au1_avail_chroma[3] = 0;
+                        au1_avail_chroma[6] = 0;
+                        au1_avail_chroma[7] = 0;
+                    }
+
+                    {
+                        au1_src_top_right[0] = pu1_sao_src_top_left_chroma_top_right[0];
+                        au1_src_top_right[1] = pu1_sao_src_top_left_chroma_top_right[1];
+                        au1_sao_src_top_left_chroma_bot_left[0] = pu1_src_chroma[sao_ht_chroma * src_strd - 2];
+                        au1_sao_src_top_left_chroma_bot_left[1] = pu1_src_chroma[sao_ht_chroma * src_strd - 1];
+
+                        if(chroma_yuv420sp_vu)
+                        {
+                            ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+                                                                                 src_strd,
+                                                                                 pu1_src_left_chroma,
+                                                                                 pu1_src_top_chroma,
+                                                                                 pu1_sao_src_chroma_top_left_ctb,
+                                                                                 au1_src_top_right,
+                                                                                 au1_sao_src_top_left_chroma_bot_left,
+                                                                                 au1_avail_chroma,
+                                                                                 ai1_offset_cr,
+                                                                                 ai1_offset_cb,
+                                                                                 sao_wd_chroma,
+                                                                                 sao_ht_chroma);
+                        }
+                        else
+                        {
+                            ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+                                                                                 src_strd,
+                                                                                 pu1_src_left_chroma,
+                                                                                 pu1_src_top_chroma,
+                                                                                 pu1_sao_src_chroma_top_left_ctb,
+                                                                                 au1_src_top_right,
+                                                                                 au1_sao_src_top_left_chroma_bot_left,
+                                                                                 au1_avail_chroma,
+                                                                                 ai1_offset_cb,
+                                                                                 ai1_offset_cr,
+                                                                                 sao_wd_chroma,
+                                                                                 sao_ht_chroma);
+                        }
+                    }
+
+                }
+            }
+        }
+
+        pu1_src_luma += sao_ht_luma * src_strd;
+        pu1_src_chroma += sao_ht_chroma * src_strd;
+        ps_sao += (ps_sps->i2_pic_wd_in_ctb);
+    }
+
+    /* Left CTB */
+    if(ps_sao_ctxt->i4_ctb_x > 0)
+    {
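+        /* The left-CTB region is the SAO_SHIFT_CTB-wide column deferred from the CTB
+         * to the left, presumably because SAO there needs pixels deblocked only while
+         * processing the current CTB. Its height excludes the strip still owed to the
+         * CTB row below. */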
+        WORD32 sao_wd_luma = SAO_SHIFT_CTB;
+        WORD32 sao_wd_chroma = 2 * SAO_SHIFT_CTB;
+        WORD32 sao_ht_luma = ctb_size - SAO_SHIFT_CTB;
+        WORD32 sao_ht_chroma = ctb_size / 2 - SAO_SHIFT_CTB;
+
+        WORD32 ctbx_l_t = 0, ctbx_l_l = 0, ctbx_l_r = 0, ctbx_l_d = 0, ctbx_l = 0;
+        WORD32 ctby_l_t = 0, ctby_l_l = 0, ctby_l_r = 0, ctby_l_d = 0, ctby_l = 0;
+        WORD32 au4_idx_l[8], idx_l;
+
+        WORD32 remaining_rows;
+        remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + sao_ht_luma);
+        if(remaining_rows <= SAO_SHIFT_CTB)
+        {
+            sao_ht_luma += remaining_rows;
+        }
+        remaining_rows = ps_sps->i2_pic_height_in_luma_samples / 2 - ((ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) + sao_ht_chroma);
+        if(remaining_rows <= SAO_SHIFT_CTB)
+        {
+            sao_ht_chroma += remaining_rows;
+        }
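+        /* If fewer than SAO_SHIFT_CTB rows remain below this region, no later CTB row
+         * will cover them, so the filtered height is extended down to the picture (or
+         * half-picture, for 4:2:0 chroma) bottom edge. */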
+
+        pu1_src_luma -= sao_wd_luma;
+        pu1_src_chroma -= sao_wd_chroma;
+        ps_sao -= 1;
+        pu1_src_top_luma = ps_sao_ctxt->pu1_sao_src_top_luma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma;
+        pu1_src_top_chroma = ps_sao_ctxt->pu1_sao_src_top_chroma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma;
+        pu1_src_left_luma = ps_sao_ctxt->pu1_sao_src_left_luma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+        pu1_src_left_chroma = ps_sao_ctxt->pu1_sao_src_left_chroma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+
+
+        if(0 != sao_ht_luma)
+        {
+            if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+            {
+                if(0 == ps_sao->b3_y_type_idx)
+                {
+                    /* Update left, top and top-left */
+                    for(row = 0; row < sao_ht_luma; row++)
+                    {
+                        pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+                    }
+                    /*Save the top-left sample of this region before the top row is overwritten below*/
+                    pu1_sao_src_top_left_luma_curr_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+                    ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+
+                }
+
+                else if(1 == ps_sao->b3_y_type_idx)
+                {
+                    ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+                    ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+                    ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+                    ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+                    ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr(pu1_src_luma,
+                                                                              src_strd,
+                                                                              pu1_src_left_luma,
+                                                                              pu1_src_top_luma,
+                                                                              pu1_sao_src_top_left_luma_curr_ctb,
+                                                                              ps_sao->b5_y_band_pos,
+                                                                              ai1_offset_y,
+                                                                              sao_wd_luma,
+                                                                              sao_ht_luma
+                                                                             );
+                }
+
+                else // if(2 <= ps_sao->b3_y_type_idx)
+                {
+                    ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+                    ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+                    ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+                    ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+                    for(i = 0; i < 8; i++)
+                    {
+                        au1_avail_luma[i] = 255;
+                        au1_tile_slice_boundary[i] = 0;
+                        au4_idx_l[i] = 0;
+                        au4_ilf_across_tile_slice_enable[i] = 1;
+                    }
+                    /******************************************************************
+                     * Derive the slice indices of the left CTB's neighbouring pixels.
+                     *
+                     *
+                     *          ____________
+                     *         |    |       |
+                     *         | L_T|       |
+                     *         |____|_______|____
+                     *         |    |       |    |
+                     *     L_L |  L |  L_R  |    |
+                     *         |____|_______|    |
+                     *              |            |
+                     *          L_D |            |
+                     *              |____________|
+                     *
+                     *****************************************************************/
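+                    /* From the derivation below: [2],[4] come from the CTB above the left
+                     * CTB, [5] from the CTB above the current one, [1],[7] from the current
+                     * CTB, and [0],[3],[6] from the left CTB itself, since the strip's
+                     * left/bottom neighbour pixels still lie inside it. */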
+
+                    /*In case of slices or tiles*/
+                    {
+                        if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+                        {
+                            ctbx_l_t = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_l_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_l_l = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_l_l = ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_l_r = ps_sao_ctxt->i4_ctb_x;
+                            ctby_l_r = ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_l_d =  ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_l_d =  ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_l = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_l = ps_sao_ctxt->i4_ctb_y;
+
+                            if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+                            {
+                                if(0 == ps_sao_ctxt->i4_ctb_y)
+                                {
+                                    au4_idx_l[2] = -1;
+                                    au4_idx_l[4] = -1;
+                                    au4_idx_l[5] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_l[2] = au4_idx_l[4] = pu1_slice_idx[ctbx_l_t + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_l[5] =  pu1_slice_idx[ctbx_l_t + 1 + (ctby_l_t  * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+                                idx_l   = au4_idx_l[6] = pu1_slice_idx[ctbx_l + (ctby_l * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_l[0] = pu1_slice_idx[ctbx_l_l + (ctby_l_l * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_l[1] = au4_idx_l[7] = pu1_slice_idx[ctbx_l_r + (ctby_l_r * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_l[3] = pu1_slice_idx[ctbx_l_d + (ctby_l_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                /*Verify that the neighbor ctbs don't cross pic boundary.*/
+                                if(0 == ps_sao_ctxt->i4_ctb_y)
+                                {
+                                    au4_ilf_across_tile_slice_enable[2] = 0;
+                                    au4_ilf_across_tile_slice_enable[4] = 0;
+                                    au4_ilf_across_tile_slice_enable[5] = 0;
+                                }
+                                else
+                                {
+                                    au4_ilf_across_tile_slice_enable[2] =  (ps_slice_hdr_base + idx_l)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                    au4_ilf_across_tile_slice_enable[5] = au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[2];
+
+                                }
+                                //TODO: ILF flag checks for [0] and [6] are missing.
+                                au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_l[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_l[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_l[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                /*
+                                 * Between each neighbour and the current CTB, the
+                                 * i1_slice_loop_filter_across_slices_enabled_flag of the slice
+                                 * containing the pixel with the greater address is checked; the
+                                 * availability flags are set accordingly.
+                                 */
+                                for(i = 0; i < 8; i++)
+                                {
+                                    /*Sets the edges that lie on the slice/tile boundary*/
+                                    if(au4_idx_l[i] != idx_l)
+                                    {
+                                        au1_tile_slice_boundary[i] = 1;
+                                    }
+                                    else
+                                    {
+                                        au4_ilf_across_tile_slice_enable[i] = 1;
+                                    }
+                                }
+                                /*Reset indices*/
+                                for(i = 0; i < 8; i++)
+                                {
+                                    au4_idx_l[i] = 0;
+                                }
+                            }
+
+                            if(ps_pps->i1_tiles_enabled_flag)
+                            {
+                                /* Calculate availability flags at tile boundary */
+                                if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+                                {
+                                    /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+                                    if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+                                    {
+                                        if(0 == ps_sao_ctxt->i4_ctb_y)
+                                        {
+                                            au4_idx_l[2] = -1;
+                                            au4_idx_l[4] = -1;
+                                            au4_idx_l[5] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_l[2] = au4_idx_l[4] = pu1_tile_idx[ctbx_l_t + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+                                            au4_idx_l[5] =  pu1_tile_idx[ctbx_l_t + 1 + (ctby_l_t  * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+
+                                        idx_l   = au4_idx_l[6] = pu1_tile_idx[ctbx_l + (ctby_l * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_l[0] = pu1_tile_idx[ctbx_l_l + (ctby_l_l * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_l[1] = au4_idx_l[7] = pu1_tile_idx[ctbx_l_r + (ctby_l_r * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_l[3] = pu1_tile_idx[ctbx_l_d + (ctby_l_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                        for(i = 0; i < 8; i++)
+                                        {
+                                            /*Sets the edges that lie on the slice/tile boundary*/
+                                            if(au4_idx_l[i] != idx_l)
+                                            {
+                                                au1_tile_slice_boundary[i] |= 1;
+                                                au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag;
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+
+                            for(i = 0; i < 8; i++)
+                            {
+                                /*Clear the availability of edges on a slice/tile boundary when ILF across it is disabled*/
+                                if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+                                {
+                                    au1_avail_luma[i] = 0;
+                                }
+                            }
+                        }
+                    }
+                    if(0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma)
+                    {
+                        au1_avail_luma[0] = 0;
+                        au1_avail_luma[4] = 0;
+                        au1_avail_luma[6] = 0;
+                    }
+                    if(ps_sps->i2_pic_wd_in_ctb == ps_sao_ctxt->i4_ctb_x)
+                    {
+                        au1_avail_luma[1] = 0;
+                        au1_avail_luma[5] = 0;
+                        au1_avail_luma[7] = 0;
+                    }
+
+                    if(0 == ps_sao_ctxt->i4_ctb_y)
+                    {
+                        au1_avail_luma[2] = 0;
+                        au1_avail_luma[4] = 0;
+                        au1_avail_luma[5] = 0;
+                    }
+
+                    if(ps_sps->i2_pic_height_in_luma_samples - (ps_sao_ctxt->i4_ctb_y  << log2_ctb_size) <= sao_ht_luma)
+                    {
+                        au1_avail_luma[3] = 0;
+                        au1_avail_luma[6] = 0;
+                        au1_avail_luma[7] = 0;
+                    }
+
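+                    /* Top-right is read from the saved top row one sample past the region;
+                     * bottom-left comes from a copy stashed earlier, since both locations
+                     * may already hold SAO-filtered or overwritten data at this point. */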
+                    {
+                        au1_src_top_right[0] = pu1_src_top_luma[sao_wd_luma];
+                        u1_sao_src_top_left_luma_bot_left = pu1_sao_src_top_left_luma_bot_left[0];
+                        ps_codec->apf_sao_luma[ps_sao->b3_y_type_idx - 2](pu1_src_luma,
+                                                                          src_strd,
+                                                                          pu1_src_left_luma,
+                                                                          pu1_src_top_luma,
+                                                                          pu1_sao_src_top_left_luma_curr_ctb,
+                                                                          au1_src_top_right,
+                                                                          &u1_sao_src_top_left_luma_bot_left,
+                                                                          au1_avail_luma,
+                                                                          ai1_offset_y,
+                                                                          sao_wd_luma,
+                                                                          sao_ht_luma);
+                    }
+
+                }
+            }
+        }
+
+        if(0 != sao_ht_chroma)
+        {
+            if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+            {
+                if(0 == ps_sao->b3_cb_type_idx)
+                {
+                    for(row = 0; row < sao_ht_chroma; row++)
+                    {
+                        pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+                        pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+                    }
+                    pu1_sao_src_top_left_chroma_curr_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+                    pu1_sao_src_top_left_chroma_curr_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+                    ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+                }
+
+                else if(1 == ps_sao->b3_cb_type_idx)
+                {
+                    ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+                    ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+                    ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+                    ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+                    ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+                    ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+                    ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+                    ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+                    if(chroma_yuv420sp_vu)
+                    {
+                        ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+                                                                                    src_strd,
+                                                                                    pu1_src_left_chroma,
+                                                                                    pu1_src_top_chroma,
+                                                                                    pu1_sao_src_top_left_chroma_curr_ctb,
+                                                                                    ps_sao->b5_cr_band_pos,
+                                                                                    ps_sao->b5_cb_band_pos,
+                                                                                    ai1_offset_cr,
+                                                                                    ai1_offset_cb,
+                                                                                    sao_wd_chroma,
+                                                                                    sao_ht_chroma
+                                                                                   );
+                    }
+                    else
+                    {
+                        ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+                                                                                    src_strd,
+                                                                                    pu1_src_left_chroma,
+                                                                                    pu1_src_top_chroma,
+                                                                                    pu1_sao_src_top_left_chroma_curr_ctb,
+                                                                                    ps_sao->b5_cb_band_pos,
+                                                                                    ps_sao->b5_cr_band_pos,
+                                                                                    ai1_offset_cb,
+                                                                                    ai1_offset_cr,
+                                                                                    sao_wd_chroma,
+                                                                                    sao_ht_chroma
+                                                                                   );
+                    }
+                }
+
+                else // if(2 <= ps_sao->b3_cb_type_idx)
+                {
+                    ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+                    ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+                    ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+                    ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+                    ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+                    ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+                    ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+                    ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+                    for(i = 0; i < 8; i++)
+                    {
+                        au1_avail_chroma[i] = 255;
+                        au1_tile_slice_boundary[i] = 0;
+                        au4_idx_l[i] = 0;
+                        au4_ilf_across_tile_slice_enable[i] = 1;
+                    }
+                    /*In case of slices or tiles*/
+                    {
+                        if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+                        {
+                            ctbx_l_t = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_l_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_l_l = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_l_l = ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_l_r = ps_sao_ctxt->i4_ctb_x;
+                            ctby_l_r = ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_l_d =  ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_l_d =  ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_l = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_l = ps_sao_ctxt->i4_ctb_y;
+
+                            if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+                            {
+                                if(0 == ps_sao_ctxt->i4_ctb_y)
+                                {
+                                    au4_idx_l[2] = -1;
+                                    au4_idx_l[4] = -1;
+                                    au4_idx_l[5] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_l[2] = au4_idx_l[4] = pu1_slice_idx[ctbx_l_t + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_l[5] =  pu1_slice_idx[ctbx_l_t + 1 + (ctby_l_t  * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+                                idx_l   = au4_idx_l[6] = pu1_slice_idx[ctbx_l + (ctby_l * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_l[0] = pu1_slice_idx[ctbx_l_l + (ctby_l_l * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_l[1] = au4_idx_l[7] = pu1_slice_idx[ctbx_l_r + (ctby_l_r * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_l[3] = pu1_slice_idx[ctbx_l_d + (ctby_l_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                /*Verify that the neighbour ctbs don't cross pic boundary.*/
+                                if(0 == ps_sao_ctxt->i4_ctb_y)
+                                {
+                                    au4_ilf_across_tile_slice_enable[2] = 0;
+                                    au4_ilf_across_tile_slice_enable[4] = 0;
+                                    au4_ilf_across_tile_slice_enable[5] = 0;
+                                }
+                                else
+                                {
+                                    au4_ilf_across_tile_slice_enable[2] =  (ps_slice_hdr_base + idx_l)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                    au4_ilf_across_tile_slice_enable[5] = au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[2];
+                                }
+                                au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_l[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_l[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_l[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                /*
+                                 * Between each neighbour and the current CTB, the
+                                 * i1_slice_loop_filter_across_slices_enabled_flag of the slice
+                                 * containing the pixel with the greater address is checked; the
+                                 * availability flags are set accordingly.
+                                 */
+                                for(i = 0; i < 8; i++)
+                                {
+                                    /*Sets the edges that lie on the slice/tile boundary*/
+                                    if(au4_idx_l[i] != idx_l)
+                                    {
+                                        au1_tile_slice_boundary[i] = 1;
+                                    }
+                                    else
+                                    {
+                                        au4_ilf_across_tile_slice_enable[i] = 1;
+                                    }
+                                }
+                                /*Reset indices*/
+                                for(i = 0; i < 8; i++)
+                                {
+                                    au4_idx_l[i] = 0;
+                                }
+                            }
+                            if(ps_pps->i1_tiles_enabled_flag)
+                            {
+                                /* Calculate availability flags at tile boundary */
+                                if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+                                {
+                                    /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+                                    if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+                                    {
+                                        if(0 == ps_sao_ctxt->i4_ctb_y)
+                                        {
+                                            au4_idx_l[2] = -1;
+                                            au4_idx_l[4] = -1;
+                                            au4_idx_l[5] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_l[2] = au4_idx_l[4] = pu1_tile_idx[ctbx_l_t + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+                                            au4_idx_l[5] =  pu1_tile_idx[ctbx_l_t + 1 + (ctby_l_t  * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+
+                                        idx_l   = au4_idx_l[6] = pu1_tile_idx[ctbx_l + (ctby_l * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_l[0] = pu1_tile_idx[ctbx_l_l + (ctby_l_l * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_l[1] = au4_idx_l[7] = pu1_tile_idx[ctbx_l_r + (ctby_l_r * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_l[3] = pu1_tile_idx[ctbx_l_d + (ctby_l_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                        for(i = 0; i < 8; i++)
+                                        {
+                                            /*Sets the edges that lie on the slice/tile boundary*/
+                                            if(au4_idx_l[i] != idx_l)
+                                            {
+                                                au1_tile_slice_boundary[i] |= 1;
+                                                au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag; /* flag is 0 on this path, so the entry is cleared */
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                            for(i = 0; i < 8; i++)
+                            {
+                                /*Clear the availability of edges on a slice/tile boundary when ILF across it is disabled*/
+                                if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+                                {
+                                    au1_avail_chroma[i] = 0;
+                                }
+                            }
+                        }
+                    }
+                    if(0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma)
+                    {
+                        au1_avail_chroma[0] = 0;
+                        au1_avail_chroma[4] = 0;
+                        au1_avail_chroma[6] = 0;
+                    }
+
+                    if(ps_sps->i2_pic_wd_in_ctb == ps_sao_ctxt->i4_ctb_x)
+                    {
+                        au1_avail_chroma[1] = 0;
+                        au1_avail_chroma[5] = 0;
+                        au1_avail_chroma[7] = 0;
+                    }
+
+                    if(0 == ps_sao_ctxt->i4_ctb_y)
+                    {
+                        au1_avail_chroma[2] = 0;
+                        au1_avail_chroma[4] = 0;
+                        au1_avail_chroma[5] = 0;
+                    }
+
+                    if(ps_sps->i2_pic_height_in_luma_samples / 2 - (ps_sao_ctxt->i4_ctb_y  << (log2_ctb_size - 1)) <= sao_ht_chroma)
+                    {
+                        au1_avail_chroma[3] = 0;
+                        au1_avail_chroma[6] = 0;
+                        au1_avail_chroma[7] = 0;
+                    }
+
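+                    /* The ctb_size == 16 special case below appears to re-read the
+                     * top-right samples from the row above the region inside the current
+                     * buffer, as the saved top row does not hold a usable copy for small
+                     * CTBs away from the right picture edge (an inference, not documented). */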
+                    {
+                        au1_src_top_right[0] = pu1_src_top_chroma[sao_wd_chroma];
+                        au1_src_top_right[1] = pu1_src_top_chroma[sao_wd_chroma + 1];
+                        au1_src_bot_left[0] = pu1_sao_src_top_left_chroma_bot_left[0];
+                        au1_src_bot_left[1] = pu1_sao_src_top_left_chroma_bot_left[1];
+                        if((ctb_size == 16) && (ps_sao_ctxt->i4_ctb_x != ps_sps->i2_pic_wd_in_ctb - 1))
+                        {
+                            au1_src_top_right[0] = pu1_src_chroma[sao_wd_chroma - src_strd];
+                            au1_src_top_right[1] = pu1_src_chroma[sao_wd_chroma - src_strd + 1];
+                        }
+
+
+                        if(chroma_yuv420sp_vu)
+                        {
+                            ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+                                                                                 src_strd,
+                                                                                 pu1_src_left_chroma,
+                                                                                 pu1_src_top_chroma,
+                                                                                 pu1_sao_src_top_left_chroma_curr_ctb,
+                                                                                 au1_src_top_right,
+                                                                                 au1_src_bot_left,
+                                                                                 au1_avail_chroma,
+                                                                                 ai1_offset_cr,
+                                                                                 ai1_offset_cb,
+                                                                                 sao_wd_chroma,
+                                                                                 sao_ht_chroma);
+                        }
+                        else
+                        {
+                            ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+                                                                                 src_strd,
+                                                                                 pu1_src_left_chroma,
+                                                                                 pu1_src_top_chroma,
+                                                                                 pu1_sao_src_top_left_chroma_curr_ctb,
+                                                                                 au1_src_top_right,
+                                                                                 au1_src_bot_left,
+                                                                                 au1_avail_chroma,
+                                                                                 ai1_offset_cb,
+                                                                                 ai1_offset_cr,
+                                                                                 sao_wd_chroma,
+                                                                                 sao_ht_chroma);
+                        }
+                    }
+
+                }
+            }
+
+        }
+        pu1_src_luma += sao_wd_luma;
+        pu1_src_chroma += sao_wd_chroma;
+        ps_sao += 1;
+    }
+
+
+    /* Current CTB */
+    {
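+        /* The current-CTB region excludes the right/bottom SAO_SHIFT_CTB strips, which
+         * are deferred until the neighbouring CTBs are deblocked; near the right and
+         * bottom picture edges the region is extended to take in the leftover samples
+         * (see the remaining_cols/remaining_rows checks below). */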
+        WORD32 sao_wd_luma = ctb_size - SAO_SHIFT_CTB;
+        WORD32 sao_wd_chroma = ctb_size - SAO_SHIFT_CTB * 2;
+        WORD32 sao_ht_luma = ctb_size - SAO_SHIFT_CTB;
+        WORD32 sao_ht_chroma = ctb_size / 2 - SAO_SHIFT_CTB;
+        WORD32 ctbx_c_t = 0, ctbx_c_l = 0, ctbx_c_r = 0, ctbx_c_d = 0, ctbx_c = 0;
+        WORD32 ctby_c_t = 0, ctby_c_l = 0, ctby_c_r = 0, ctby_c_d = 0, ctby_c = 0;
+        WORD32 au4_idx_c[8], idx_c;
+
+        WORD32 remaining_rows;
+        WORD32 remaining_cols;
+
+        remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + sao_wd_luma);
+        if(remaining_cols <= SAO_SHIFT_CTB)
+        {
+            sao_wd_luma += remaining_cols;
+        }
+        remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + sao_wd_chroma);
+        if(remaining_cols <= 2 * SAO_SHIFT_CTB)
+        {
+            sao_wd_chroma += remaining_cols;
+        }
+
+        remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + sao_ht_luma);
+        if(remaining_rows <= SAO_SHIFT_CTB)
+        {
+            sao_ht_luma += remaining_rows;
+        }
+        remaining_rows = ps_sps->i2_pic_height_in_luma_samples / 2 - ((ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) + sao_ht_chroma);
+        if(remaining_rows <= SAO_SHIFT_CTB)
+        {
+            sao_ht_chroma += remaining_rows;
+        }
+
+        pu1_src_top_luma = ps_sao_ctxt->pu1_sao_src_top_luma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+        pu1_src_top_chroma = ps_sao_ctxt->pu1_sao_src_top_chroma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+        pu1_src_left_luma = ps_sao_ctxt->pu1_sao_src_left_luma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+        pu1_src_left_chroma = ps_sao_ctxt->pu1_sao_src_left_chroma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+
+        if((0 != sao_wd_luma) && (0 != sao_ht_luma))
+        {
+            if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+            {
+                if(0 == ps_sao->b3_y_type_idx)
+                {
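+                    /* SAO is off for this CTB, but the left/top/corner context arrays are
+                     * still refreshed because neighbouring CTB regions read their reference
+                     * samples from them. */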
+                    /* Update left, top and top-left */
+                    for(row = 0; row < sao_ht_luma; row++)
+                    {
+                        pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+                    }
+                    pu1_sao_src_top_left_luma_curr_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+                    ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+
+                    pu1_sao_src_top_left_luma_top_right[0] = pu1_src_luma[(sao_ht_luma - 1) * src_strd + sao_wd_luma];
+
+                }
+
+                else if(1 == ps_sao->b3_y_type_idx)
+                {
+                    ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+                    ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+                    ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+                    ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+                    ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr(pu1_src_luma,
+                                                                              src_strd,
+                                                                              pu1_src_left_luma,
+                                                                              pu1_src_top_luma,
+                                                                              pu1_sao_src_top_left_luma_curr_ctb,
+                                                                              ps_sao->b5_y_band_pos,
+                                                                              ai1_offset_y,
+                                                                              sao_wd_luma,
+                                                                              sao_ht_luma
+                                                                             );
+                }
+
+                else // if(2 <= ps_sao->b3_y_type_idx)
+                {
+                    ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+                    ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+                    ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+                    ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+                    for(i = 0; i < 8; i++)
+                    {
+                        au1_avail_luma[i] = 255;
+                        au1_tile_slice_boundary[i] = 0;
+                        au4_idx_c[i] = 0;
+                        au4_ilf_across_tile_slice_enable[i] = 1;
+                    }
+                    /******************************************************************
+                     * Derive the slice indices of the current CTB's neighbouring pixels.
+                     *
+                     *
+                     *          ____________
+                     *         |    |       |
+                     *         |    | C_T   |
+                     *         |____|_______|____
+                     *         |    |       |    |
+                     *         | C_L|   C   | C_R|
+                     *         |____|_______|    |
+                     *              |  C_D       |
+                     *              |            |
+                     *              |____________|
+                     *
+                     *****************************************************************/
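+                    /* Neighbour index layout used by au4_idx_c[], au1_avail_luma[]
+                     * and au4_ilf_across_tile_slice_enable[] (derived from the
+                     * assignments below):
+                     *   0 = left, 1 = right, 2 = top, 3 = bottom,
+                     *   4 = top-left, 5 = top-right, 6 = bottom-left, 7 = bottom-right */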
+
+                    /*In case of slices*/
+                    {
+                        if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+                        {
+                            ctbx_c_t = ps_sao_ctxt->i4_ctb_x;
+                            ctby_c_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_c_l = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_c_l = ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_c_r = ps_sao_ctxt->i4_ctb_x;
+                            ctby_c_r = ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_c_d =  ps_sao_ctxt->i4_ctb_x;
+                            ctby_c_d =  ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_c = ps_sao_ctxt->i4_ctb_x;
+                            ctby_c = ps_sao_ctxt->i4_ctb_y;
+
+                            if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+                            {
+                                if(0 == ps_sao_ctxt->i4_ctb_x)
+                                {
+                                    au4_idx_c[6] = -1;
+                                    au4_idx_c[0] = -1;
+                                    au4_idx_c[4] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_c[0] =  au4_idx_c[6] = pu1_slice_idx[ctbx_c_l + (ctby_c_l * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+
+                                if(0 == ps_sao_ctxt->i4_ctb_y)
+                                {
+                                    au4_idx_c[2] = -1;
+                                    au4_idx_c[5] = -1;
+                                    au4_idx_c[4] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_c[4] =  pu1_slice_idx[ctbx_c_t - 1 + (ctby_c_t  * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_c[2] = au4_idx_c[5] = pu1_slice_idx[ctbx_c_t + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+                                idx_c   = pu1_slice_idx[ctbx_c + (ctby_c * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_c[1] = au4_idx_c[7] = pu1_slice_idx[ctbx_c_r + (ctby_c_r * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_c[3] = pu1_slice_idx[ctbx_c_d + (ctby_c_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                if(0 == ps_sao_ctxt->i4_ctb_x)
+                                {
+                                    au4_ilf_across_tile_slice_enable[6] = 0;
+                                    au4_ilf_across_tile_slice_enable[0] = 0;
+                                    au4_ilf_across_tile_slice_enable[4] = 0;
+                                }
+                                else
+                                {
+                                    au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + au4_idx_c[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                    au4_ilf_across_tile_slice_enable[0] = (ps_slice_hdr_base + idx_c)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                }
+                                if(0 == ps_sao_ctxt->i4_ctb_y)
+                                {
+                                    au4_ilf_across_tile_slice_enable[2] = 0;
+                                    au4_ilf_across_tile_slice_enable[4] = 0;
+                                    au4_ilf_across_tile_slice_enable[5] = 0;
+                                }
+                                else
+                                {
+                                    au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_c)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                    au4_ilf_across_tile_slice_enable[5] = au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[2];
+                                }
+                                au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_c[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_c[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_c[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+
+                                /*
+                                 * Between each neighbour and the current CTB, the
+                                 * i1_slice_loop_filter_across_slices_enabled_flag of the CTB with the
+                                 * greater address is checked, and the availability flags are set accordingly.
+                                 */
+                                for(i = 0; i < 8; i++)
+                                {
+                                    /*Sets the edges that lie on the slice/tile boundary*/
+                                    if(au4_idx_c[i] != idx_c)
+                                    {
+                                        au1_tile_slice_boundary[i] = 1;
+                                    }
+                                    else
+                                    {
+                                        au4_ilf_across_tile_slice_enable[i] = 1;
+                                    }
+                                }
+                                /*Reset indices*/
+                                for(i = 0; i < 8; i++)
+                                {
+                                    au4_idx_c[i] = 0;
+                                }
+                            }
+
+                            if(ps_pps->i1_tiles_enabled_flag)
+                            {
+                                /* Calculate availability flags at tile boundary */
+                                if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+                                {
+                                    /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+                                    if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+                                    {
+                                        if(0 == ps_sao_ctxt->i4_ctb_x)
+                                        {
+                                            au4_idx_c[6] = -1;
+                                            au4_idx_c[0] = -1;
+                                            au4_idx_c[4] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_c[0] =  au4_idx_c[6] = pu1_tile_idx[ctbx_c_l + (ctby_c_l * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+
+                                        if(0 == ps_sao_ctxt->i4_ctb_y)
+                                        {
+                                            au4_idx_c[2] = -1;
+                                            au4_idx_c[5] = -1;
+                                            au4_idx_c[4] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_c[4] =  pu1_tile_idx[ctbx_c_t - 1 + (ctby_c_t  * ps_sps->i2_pic_wd_in_ctb)];
+                                            au4_idx_c[2] = au4_idx_c[5] = pu1_tile_idx[ctbx_c_t + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+                                        idx_c   = pu1_tile_idx[ctbx_c + (ctby_c * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_c[1] = au4_idx_c[7] = pu1_tile_idx[ctbx_c_r + (ctby_c_r * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_c[3] = pu1_tile_idx[ctbx_c_d + (ctby_c_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                        for(i = 0; i < 8; i++)
+                                        {
+                                            /*Sets the edges that lie on the slice/tile boundary*/
+                                            if(au4_idx_c[i] != idx_c)
+                                            {
+                                                au1_tile_slice_boundary[i] |= 1;
+                                                au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag; /* always 0 in this branch */
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+
+                            for(i = 0; i < 8; i++)
+                            {
+                                /*Sets the edges that lie on the slice/tile boundary*/
+                                if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+                                {
+                                    au1_avail_luma[i] = 0;
+                                }
+                            }
+
+                        }
+                    }
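+                    /* Independently of the slice/tile flags, neighbours outside
+                     * the picture are always unavailable. */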
+                    if(0 == ps_sao_ctxt->i4_ctb_x)
+                    {
+                        au1_avail_luma[0] = 0;
+                        au1_avail_luma[4] = 0;
+                        au1_avail_luma[6] = 0;
+                    }
+
+                    if(ps_sps->i2_pic_width_in_luma_samples - (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) <= sao_wd_luma)
+                    {
+                        au1_avail_luma[1] = 0;
+                        au1_avail_luma[5] = 0;
+                        au1_avail_luma[7] = 0;
+                    }
+
+                    if(0 == ps_sao_ctxt->i4_ctb_y)
+                    {
+                        au1_avail_luma[2] = 0;
+                        au1_avail_luma[4] = 0;
+                        au1_avail_luma[5] = 0;
+                    }
+
+                    if(ps_sps->i2_pic_height_in_luma_samples - (ps_sao_ctxt->i4_ctb_y  << log2_ctb_size) <= sao_ht_luma)
+                    {
+                        au1_avail_luma[3] = 0;
+                        au1_avail_luma[6] = 0;
+                        au1_avail_luma[7] = 0;
+                    }
+
+                    {
+                        au1_src_top_right[0] = pu1_src_luma[sao_wd_luma - src_strd];
+                        u1_sao_src_top_left_luma_bot_left = pu1_src_luma[sao_ht_luma * src_strd - 1];
+
+                        ps_codec->apf_sao_luma[ps_sao->b3_y_type_idx - 2](pu1_src_luma,
+                                                                          src_strd,
+                                                                          pu1_src_left_luma,
+                                                                          pu1_src_top_luma,
+                                                                          pu1_sao_src_top_left_luma_curr_ctb,
+                                                                          au1_src_top_right,
+                                                                          &u1_sao_src_top_left_luma_bot_left,
+                                                                          au1_avail_luma,
+                                                                          ai1_offset_y,
+                                                                          sao_wd_luma,
+                                                                          sao_ht_luma);
+                    }
+                    pu1_sao_src_top_left_luma_top_right[0] = pu1_src_luma[(sao_ht_luma - 1) * src_strd + sao_wd_luma];
+                    pu1_sao_src_top_left_luma_bot_left[0] = pu1_src_luma[(sao_ht_luma)*src_strd + sao_wd_luma - 1];
+                }
+            }
+        }
+
+        if((0 != sao_wd_chroma) && (0 != sao_ht_chroma))
+        {
+            if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+            {
+                if(0 == ps_sao->b3_cb_type_idx)
+                {
+                    for(row = 0; row < sao_ht_chroma; row++)
+                    {
+                        pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+                        pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+                    }
+                    pu1_sao_src_top_left_chroma_curr_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+                    pu1_sao_src_top_left_chroma_curr_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+                    ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+
+                    pu1_sao_src_top_left_chroma_top_right[0] = pu1_src_chroma[(sao_ht_chroma - 1) * src_strd + sao_wd_chroma];
+                    pu1_sao_src_top_left_chroma_top_right[1] = pu1_src_chroma[(sao_ht_chroma - 1) * src_strd + sao_wd_chroma + 1];
+                }
+
+                else if(1 == ps_sao->b3_cb_type_idx)
+                {
+                    ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+                    ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+                    ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+                    ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+                    ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+                    ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+                    ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+                    ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
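+                    /* chroma_yuv420sp_vu selects VU-interleaved (NV21-style) chroma:
+                     * the Cr band position and offsets are passed first, versus Cb
+                     * first for UV-interleaved (NV12-style) data. (NV12/NV21 naming
+                     * is used here as descriptive shorthand.) */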
+                    if(chroma_yuv420sp_vu)
+                    {
+                        ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+                                                                                    src_strd,
+                                                                                    pu1_src_left_chroma,
+                                                                                    pu1_src_top_chroma,
+                                                                                    pu1_sao_src_top_left_chroma_curr_ctb,
+                                                                                    ps_sao->b5_cr_band_pos,
+                                                                                    ps_sao->b5_cb_band_pos,
+                                                                                    ai1_offset_cr,
+                                                                                    ai1_offset_cb,
+                                                                                    sao_wd_chroma,
+                                                                                    sao_ht_chroma
+                                                                                   );
+                    }
+                    else
+                    {
+                        ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+                                                                                    src_strd,
+                                                                                    pu1_src_left_chroma,
+                                                                                    pu1_src_top_chroma,
+                                                                                    pu1_sao_src_top_left_chroma_curr_ctb,
+                                                                                    ps_sao->b5_cb_band_pos,
+                                                                                    ps_sao->b5_cr_band_pos,
+                                                                                    ai1_offset_cb,
+                                                                                    ai1_offset_cr,
+                                                                                    sao_wd_chroma,
+                                                                                    sao_ht_chroma
+                                                                                   );
+                    }
+                }
+
+                else // if(2 <= ps_sao->b3_cb_type_idx)
+                {
+                    ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+                    ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+                    ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+                    ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+                    ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+                    ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+                    ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+                    ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+                    for(i = 0; i < 8; i++)
+                    {
+                        au1_avail_chroma[i] = 255;
+                        au1_tile_slice_boundary[i] = 0;
+                        au4_idx_c[i] = 0;
+                        au4_ilf_across_tile_slice_enable[i] = 1;
+                    }
+                    {
+                        if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+                        {
+                            ctbx_c_t = ps_sao_ctxt->i4_ctb_x;
+                            ctby_c_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+                            ctbx_c_l = ps_sao_ctxt->i4_ctb_x - 1;
+                            ctby_c_l = ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_c_r = ps_sao_ctxt->i4_ctb_x;
+                            ctby_c_r = ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_c_d =  ps_sao_ctxt->i4_ctb_x;
+                            ctby_c_d =  ps_sao_ctxt->i4_ctb_y;
+
+                            ctbx_c = ps_sao_ctxt->i4_ctb_x;
+                            ctby_c = ps_sao_ctxt->i4_ctb_y;
+
+                            if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+                            {
+                                if(0 == ps_sao_ctxt->i4_ctb_x)
+                                {
+                                    au4_idx_c[0] = -1;
+                                    au4_idx_c[4] = -1;
+                                    au4_idx_c[6] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_c[0] =  au4_idx_c[6] = pu1_slice_idx[ctbx_c_l + (ctby_c_l * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+
+                                if(0 == ps_sao_ctxt->i4_ctb_y)
+                                {
+                                    au4_idx_c[2] = -1;
+                                    au4_idx_c[4] = -1;
+                                    au4_idx_c[5] = -1;
+                                }
+                                else
+                                {
+                                    au4_idx_c[2] = au4_idx_c[5] = pu1_slice_idx[ctbx_c_t + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+                                    au4_idx_c[4] =  pu1_slice_idx[ctbx_c_t - 1 + (ctby_c_t  * ps_sps->i2_pic_wd_in_ctb)];
+                                }
+                                idx_c = pu1_slice_idx[ctbx_c + (ctby_c * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_c[1] = au4_idx_c[7] = pu1_slice_idx[ctbx_c_r + (ctby_c_r * ps_sps->i2_pic_wd_in_ctb)];
+                                au4_idx_c[3] = pu1_slice_idx[ctbx_c_d + (ctby_c_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                if(0 == ps_sao_ctxt->i4_ctb_x)
+                                {
+                                    au4_ilf_across_tile_slice_enable[0] = 0;
+                                    au4_ilf_across_tile_slice_enable[4] = 0;
+                                    au4_ilf_across_tile_slice_enable[6] = 0;
+                                }
+                                else
+                                {
+                                    au4_ilf_across_tile_slice_enable[6] &= (ps_slice_hdr_base + au4_idx_c[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                    au4_ilf_across_tile_slice_enable[0] &= (ps_slice_hdr_base + idx_c)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                }
+
+                                if(0 == ps_sao_ctxt->i4_ctb_y)
+                                {
+                                    au4_ilf_across_tile_slice_enable[2] = 0;
+                                    au4_ilf_across_tile_slice_enable[4] = 0;
+                                    au4_ilf_across_tile_slice_enable[5] = 0;
+                                }
+                                else
+                                {
+                                    au4_ilf_across_tile_slice_enable[2] &= (ps_slice_hdr_base + idx_c)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                    au4_ilf_across_tile_slice_enable[5] = au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[2];
+                                }
+
+                                au4_ilf_across_tile_slice_enable[1] &= (ps_slice_hdr_base + au4_idx_c[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[3] &= (ps_slice_hdr_base + au4_idx_c[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[7] &= (ps_slice_hdr_base + au4_idx_c[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+
+                                /*
+                                 * Between each neighbour and the current CTB, the
+                                 * i1_slice_loop_filter_across_slices_enabled_flag of the CTB with the
+                                 * greater address is checked, and the availability flags are set accordingly.
+                                 */
+                                for(i = 0; i < 8; i++)
+                                {
+                                    /*Sets the edges that lie on the slice/tile boundary*/
+                                    if(au4_idx_c[i] != idx_c)
+                                    {
+                                        au1_tile_slice_boundary[i] = 1;
+                                    }
+                                    else
+                                    {
+                                        au4_ilf_across_tile_slice_enable[i] = 1;
+                                    }
+                                }
+                                /*Reset indices*/
+                                for(i = 0; i < 8; i++)
+                                {
+                                    au4_idx_c[i] = 0;
+                                }
+                            }
+
+                            if(ps_pps->i1_tiles_enabled_flag)
+                            {
+                                /* Calculate availability flags at tile boundary */
+                                if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+                                {
+                                    /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+                                    if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+                                    {
+                                        if(0 == ps_sao_ctxt->i4_ctb_x)
+                                        {
+                                            au4_idx_c[6] = -1;
+                                            au4_idx_c[0] = -1;
+                                            au4_idx_c[4] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_c[0] =  au4_idx_c[6] = pu1_tile_idx[ctbx_c_l + (ctby_c_l * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+
+                                        if(0 == ps_sao_ctxt->i4_ctb_y)
+                                        {
+                                            au4_idx_c[2] = -1;
+                                            au4_idx_c[5] = -1;
+                                            au4_idx_c[4] = -1;
+                                        }
+                                        else
+                                        {
+                                            au4_idx_c[4] =  pu1_tile_idx[ctbx_c_t - 1 + (ctby_c_t  * ps_sps->i2_pic_wd_in_ctb)];
+                                            au4_idx_c[2] = au4_idx_c[5] = pu1_tile_idx[ctbx_c_t + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+                                        }
+                                        idx_c   = pu1_tile_idx[ctbx_c + (ctby_c * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_c[1] = au4_idx_c[7] = pu1_tile_idx[ctbx_c_r + (ctby_c_r * ps_sps->i2_pic_wd_in_ctb)];
+                                        au4_idx_c[3] = pu1_tile_idx[ctbx_c_d + (ctby_c_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                        for(i = 0; i < 8; i++)
+                                        {
+                                            /*Sets the edges that lie on the slice/tile boundary*/
+                                            if(au4_idx_c[i] != idx_c)
+                                            {
+                                                au1_tile_slice_boundary[i] |= 1;
+                                                au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag; /* always 0 in this branch */
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+
+                            for(i = 0; i < 8; i++)
+                            {
+                                /*Sets the edges that lie on the slice/tile boundary*/
+                                if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+                                {
+                                    au1_avail_chroma[i] = 0;
+                                }
+                            }
+                        }
+                    }
+
+                    if(0 == ps_sao_ctxt->i4_ctb_x)
+                    {
+                        au1_avail_chroma[0] = 0;
+                        au1_avail_chroma[4] = 0;
+                        au1_avail_chroma[6] = 0;
+                    }
+
+                    if(ps_sps->i2_pic_width_in_luma_samples - (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) <= sao_wd_chroma)
+                    {
+                        au1_avail_chroma[1] = 0;
+                        au1_avail_chroma[5] = 0;
+                        au1_avail_chroma[7] = 0;
+                    }
+
+                    if(0 == ps_sao_ctxt->i4_ctb_y)
+                    {
+                        au1_avail_chroma[2] = 0;
+                        au1_avail_chroma[4] = 0;
+                        au1_avail_chroma[5] = 0;
+                    }
+
+                    if(ps_sps->i2_pic_height_in_luma_samples / 2 - (ps_sao_ctxt->i4_ctb_y  << (log2_ctb_size - 1)) <= sao_ht_chroma)
+                    {
+                        au1_avail_chroma[3] = 0;
+                        au1_avail_chroma[6] = 0;
+                        au1_avail_chroma[7] = 0;
+                    }
+
+                    {
+                        au1_src_top_right[0] = pu1_src_chroma[sao_wd_chroma - src_strd];
+                        au1_src_top_right[1] = pu1_src_chroma[sao_wd_chroma - src_strd + 1];
+
+                        au1_sao_src_top_left_chroma_bot_left[0] = pu1_src_chroma[sao_ht_chroma * src_strd - 2];
+                        au1_sao_src_top_left_chroma_bot_left[1] = pu1_src_chroma[sao_ht_chroma * src_strd - 1];
+
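+                        /* As with the band-offset case above, VU-interleaved chroma
+                         * swaps the Cb/Cr argument order. */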
+                        if(chroma_yuv420sp_vu)
+                        {
+                            ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+                                                                                 src_strd,
+                                                                                 pu1_src_left_chroma,
+                                                                                 pu1_src_top_chroma,
+                                                                                 pu1_sao_src_top_left_chroma_curr_ctb,
+                                                                                 au1_src_top_right,
+                                                                                 au1_sao_src_top_left_chroma_bot_left,
+                                                                                 au1_avail_chroma,
+                                                                                 ai1_offset_cr,
+                                                                                 ai1_offset_cb,
+                                                                                 sao_wd_chroma,
+                                                                                 sao_ht_chroma);
+                        }
+                        else
+                        {
+                            ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+                                                                                 src_strd,
+                                                                                 pu1_src_left_chroma,
+                                                                                 pu1_src_top_chroma,
+                                                                                 pu1_sao_src_top_left_chroma_curr_ctb,
+                                                                                 au1_src_top_right,
+                                                                                 au1_sao_src_top_left_chroma_bot_left,
+                                                                                 au1_avail_chroma,
+                                                                                 ai1_offset_cb,
+                                                                                 ai1_offset_cr,
+                                                                                 sao_wd_chroma,
+                                                                                 sao_ht_chroma);
+                        }
+                    }
+
+                }
+                pu1_sao_src_top_left_chroma_top_right[0] = pu1_src_chroma[(sao_ht_chroma - 1) * src_strd + sao_wd_chroma];
+                pu1_sao_src_top_left_chroma_top_right[1] = pu1_src_chroma[(sao_ht_chroma - 1) * src_strd + sao_wd_chroma + 1];
+
+                pu1_sao_src_top_left_chroma_bot_left[0] = pu1_src_chroma[(sao_ht_chroma)*src_strd + sao_wd_chroma - 2];
+                pu1_sao_src_top_left_chroma_bot_left[1] = pu1_src_chroma[(sao_ht_chroma)*src_strd + sao_wd_chroma - 1];
+            }
+
+        }
+    }
+
+    /* For regions where loop filtering is disabled, copy back the backed-up (pre-SAO) samples */
+    {
+        /* Luma */
+        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag && no_loop_filter_enabled_luma)
+        {
+            UWORD32 u4_no_loop_filter_flag;
+            WORD32 loop_filter_bit_pos;
+            WORD32 log2_min_cu = 3;
+            WORD32 min_cu = (1 << log2_min_cu);
+            UWORD8 *pu1_src_tmp_luma = pu1_src_luma;
+            WORD32 sao_blk_ht = ctb_size - SAO_SHIFT_CTB;
+            WORD32 sao_blk_wd = ctb_size;
+            WORD32 remaining_rows;
+            WORD32 remaining_cols;
+
+            remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + ctb_size - SAO_SHIFT_CTB);
+            remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + ctb_size - SAO_SHIFT_CTB);
+            if(remaining_rows <= SAO_SHIFT_CTB)
+                sao_blk_ht += remaining_rows;
+            if(remaining_cols <= SAO_SHIFT_CTB)
+                sao_blk_wd += remaining_cols;
+
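+            /* The SAO window is shifted up and to the left of the CTB origin,
+             * since the right/bottom borders depend on neighbours that are not
+             * yet filtered; the last CTB row/column absorbs the remainder
+             * (handled above). */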
+            pu1_src_tmp_luma -= ps_sao_ctxt->i4_ctb_x ? SAO_SHIFT_CTB : 0;
+            pu1_src_tmp_luma -= ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB * src_strd : 0;
+
+            pu1_src_backup_luma = ps_sao_ctxt->pu1_tmp_buf_luma;
+
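+            /* pu1_pic_no_loop_filter_flag is a bitmap with one bit per 8x8 min CU.
+             * loop_filter_bit_pos converts the CTB position into a bit index; the
+             * -1 applied for non-first CTB columns accounts for the leftward shift
+             * of the SAO window (one min CU, assuming SAO_SHIFT_CTB matches the
+             * min CU size). */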
+            loop_filter_bit_pos = (ps_sao_ctxt->i4_ctb_x << (log2_ctb_size - 3)) +
+                            (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 3)) * (loop_filter_strd << 3);
+            if(ps_sao_ctxt->i4_ctb_x > 0)
+                loop_filter_bit_pos -= 1;
+
+            pu1_no_loop_filter_flag = ps_sao_ctxt->pu1_pic_no_loop_filter_flag +
+                            (loop_filter_bit_pos >> 3);
+
+            for(i = -(ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB : 0) >> log2_min_cu;
+                            i < (sao_blk_ht + (min_cu - 1)) >> log2_min_cu; i++)
+            {
+                WORD32 tmp_wd = sao_blk_wd;
+
+                u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+                                (loop_filter_bit_pos & 7);
+                u4_no_loop_filter_flag &= (1 << ((tmp_wd + (min_cu - 1)) >> log2_min_cu)) - 1;
+
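+                /* Walk the mask using count-trailing-zeros: runs of 0-bits are min
+                 * CUs with loop filtering enabled (keep the filtered output and
+                 * just advance), runs of 1-bits are min CUs with filtering disabled
+                 * (restore the backed-up samples below). */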
+                if(u4_no_loop_filter_flag)
+                {
+                    while(tmp_wd > 0)
+                    {
+                        if(CTZ(u4_no_loop_filter_flag))
+                        {
+                            pu1_src_tmp_luma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            pu1_src_backup_luma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            tmp_wd -= CTZ(u4_no_loop_filter_flag) << log2_min_cu;
+                            u4_no_loop_filter_flag  >>= (CTZ(u4_no_loop_filter_flag));
+                        }
+                        else
+                        {
+                            for(row = 0; row < min_cu; row++)
+                            {
+                                for(col = 0; col < MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd); col++)
+                                {
+                                    pu1_src_tmp_luma[row * src_strd + col] = pu1_src_backup_luma[row * backup_strd + col];
+                                }
+                            }
+                            pu1_src_tmp_luma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            pu1_src_backup_luma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            tmp_wd -= CTZ(~u4_no_loop_filter_flag) << log2_min_cu;
+                            u4_no_loop_filter_flag  >>= (CTZ(~u4_no_loop_filter_flag));
+                        }
+                    }
+
+                    pu1_src_tmp_luma -= sao_blk_wd;
+                    pu1_src_backup_luma -= sao_blk_wd;
+                }
+
+                pu1_src_tmp_luma += (src_strd << log2_min_cu);
+                pu1_src_backup_luma += (backup_strd << log2_min_cu);
+            }
+        }
+
+        /* Chroma */
+        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag && no_loop_filter_enabled_chroma)
+        {
+            UWORD32 u4_no_loop_filter_flag;
+            WORD32 loop_filter_bit_pos;
+            WORD32 log2_min_cu = 3;
+            WORD32 min_cu = (1 << log2_min_cu);
+            UWORD8 *pu1_src_tmp_chroma = pu1_src_chroma;
+            WORD32 sao_blk_ht = ctb_size - 2 * SAO_SHIFT_CTB;
+            WORD32 sao_blk_wd = ctb_size;
+            WORD32 remaining_rows;
+            WORD32 remaining_cols;
+
+            remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + ctb_size - 2 * SAO_SHIFT_CTB);
+            remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + ctb_size - 2 * SAO_SHIFT_CTB);
+            if(remaining_rows <= 2 * SAO_SHIFT_CTB)
+                sao_blk_ht += remaining_rows;
+            if(remaining_cols <= 2 * SAO_SHIFT_CTB)
+                sao_blk_wd += remaining_cols;
+
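+            /* Chroma samples are interleaved (two bytes per sample pair), so the
+             * horizontal shift is 2 * SAO_SHIFT_CTB bytes, while the vertical
+             * shift is SAO_SHIFT_CTB interleaved chroma rows. */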
+            pu1_src_tmp_chroma -= ps_sao_ctxt->i4_ctb_x ? SAO_SHIFT_CTB * 2 : 0;
+            pu1_src_tmp_chroma -= ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB * src_strd : 0;
+
+            pu1_src_backup_chroma = ps_sao_ctxt->pu1_tmp_buf_chroma;
+
+            loop_filter_bit_pos = (ps_sao_ctxt->i4_ctb_x << (log2_ctb_size - 3)) +
+                            (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 3)) * (loop_filter_strd << 3);
+            if(ps_sao_ctxt->i4_ctb_x > 0)
+                loop_filter_bit_pos -= 2;
+
+            pu1_no_loop_filter_flag = ps_sao_ctxt->pu1_pic_no_loop_filter_flag +
+                            (loop_filter_bit_pos >> 3);
+
+            for(i = -(ps_sao_ctxt->i4_ctb_y ? 2 * SAO_SHIFT_CTB : 0) >> log2_min_cu;
+                            i < (sao_blk_ht + (min_cu - 1)) >> log2_min_cu; i++)
+            {
+                WORD32 tmp_wd = sao_blk_wd;
+
+                u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+                                (loop_filter_bit_pos & 7);
+                u4_no_loop_filter_flag &= (1 << ((tmp_wd + (min_cu - 1)) >> log2_min_cu)) - 1;
+
+                if(u4_no_loop_filter_flag)
+                {
+                    while(tmp_wd > 0)
+                    {
+                        if(CTZ(u4_no_loop_filter_flag))
+                        {
+                            pu1_src_tmp_chroma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            pu1_src_backup_chroma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            tmp_wd -= CTZ(u4_no_loop_filter_flag) << log2_min_cu;
+                            u4_no_loop_filter_flag  >>= (CTZ(u4_no_loop_filter_flag));
+                        }
+                        else
+                        {
+                            for(row = 0; row < min_cu / 2; row++)
+                            {
+                                for(col = 0; col < MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd); col++)
+                                {
+                                    pu1_src_tmp_chroma[row * src_strd + col] = pu1_src_backup_chroma[row * backup_strd + col];
+                                }
+                            }
+
+                            pu1_src_tmp_chroma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            pu1_src_backup_chroma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+                            tmp_wd -= CTZ(~u4_no_loop_filter_flag) << log2_min_cu;
+                            u4_no_loop_filter_flag  >>= (CTZ(~u4_no_loop_filter_flag));
+                        }
+                    }
+
+                    pu1_src_tmp_chroma -= sao_blk_wd;
+                    pu1_src_backup_chroma -= sao_blk_wd;
+                }
+
+                pu1_src_tmp_chroma += ((src_strd / 2) << log2_min_cu);
+                pu1_src_backup_chroma += ((backup_strd / 2) << log2_min_cu);
+            }
+        }
+    }
+
+}
+
diff --git a/decoder/ihevcd_sao.h b/decoder/ihevcd_sao.h
new file mode 100644
index 0000000..e549682
--- /dev/null
+++ b/decoder/ihevcd_sao.h
@@ -0,0 +1,40 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_sao.h
+*
+* @brief
+*  Contains function declarations for the sample adaptive offset (SAO) filter
+*
+* @author
+*  Srinivas T
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVCD_SAO_H_
+#define _IHEVCD_SAO_H_
+
+void ihevcd_sao_ctb(sao_ctxt_t *ps_sao_ctxt);
+void ihevcd_sao_shift_ctb(sao_ctxt_t *ps_sao_ctxt);
+
+#endif /* _IHEVCD_SAO_H_ */
diff --git a/decoder/ihevcd_statistics.c b/decoder/ihevcd_statistics.c
new file mode 100644
index 0000000..f4e5242
--- /dev/null
+++ b/decoder/ihevcd_statistics.c
@@ -0,0 +1,688 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_statistics.c
+*
+* @brief
+*  Contains functions for generating statistics about the HEVC decoder
+*
+* @author
+*  Naveen SR
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevcd_defs.h"
+
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_iquant_itrans_recon_ctb.h"
+#include "ihevcd_statistics.h"
+
+#if STATISTICS_ENABLE
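+/* Process-wide statistics state. Updates below are not synchronized, so counts
+ * gathered from multi-threaded decoding may be approximate. */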
+statistics_t gs_ihevcd_stat;
+
+void ihevcd_init_trans_stat(stat_trans_t *ps_stat_trans)
+{
+    ps_stat_trans->num_4x4_dst = 0;
+    ps_stat_trans->num_4x4 = 0;
+    ps_stat_trans->num_8x8 = 0;
+    ps_stat_trans->num_16x16 = 0;
+    ps_stat_trans->num_32x32 = 0;
+    ps_stat_trans->num_64x64 = 0;
+}
+
+void ihevcd_sblk_pos_init()
+{
+    gs_ihevcd_stat.last_sblk_pos_x = 0;
+    gs_ihevcd_stat.last_sblk_pos_y = 0;
+    gs_ihevcd_stat.num_coded_sblk = 0;
+    gs_ihevcd_stat.num_coded_coeffs = 0;
+}
+void ihevcd_init_sblk_histogram(stat_sblk_histogram_t *ps_last_sblk_pos_histogram_t)
+{
+    memset(ps_last_sblk_pos_histogram_t->trans_4x4_dst, 0, 1 * sizeof(UWORD32));
+    memset(ps_last_sblk_pos_histogram_t->trans_4x4, 0, 1 * sizeof(UWORD32));
+    memset(ps_last_sblk_pos_histogram_t->trans_8x8, 0, 4 * sizeof(UWORD32));
+    memset(ps_last_sblk_pos_histogram_t->trans_16x16, 0, 16 * sizeof(UWORD32));
+    memset(ps_last_sblk_pos_histogram_t->trans_32x32, 0, 64 * sizeof(UWORD32));
+}
+void ihevcd_init_coeff_histogram(stat_coeff_histogram_t *ps_coeff_histogram)
+{
+    memset(ps_coeff_histogram->trans_4x4_dst, 0, 16 * sizeof(UWORD32));
+    memset(ps_coeff_histogram->trans_4x4, 0, 16 * sizeof(UWORD32));
+    memset(ps_coeff_histogram->trans_8x8, 0, 64 * sizeof(UWORD32));
+    memset(ps_coeff_histogram->trans_16x16, 0, 256 * sizeof(UWORD32));
+    memset(ps_coeff_histogram->trans_32x32, 0, 1024 * sizeof(UWORD32));
+}
+void ihevcd_init_statistics()
+{
+
+    memset(&gs_ihevcd_stat, 0, sizeof(statistics_t));
+    /* Number of transform block init */
+    ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_all_trans_block[0]);
+    ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_all_trans_block[1]);
+    /* Number of coded transform block init */
+    ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_trans_block[0]);
+    ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_trans_block[1]);
+    /* Number of coded DC transform block init */
+    ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_dc_block[0]);
+    ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_dc_block[1]);
+    /* Number of coded one coeff transform block init */
+    ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_one_coeff_block[0]);
+    ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_one_coeff_block[1]);
+    /* Last sblk histogram init */
+    ihevcd_init_sblk_histogram(&gs_ihevcd_stat.stat_last_sblk_pos_histogram);
+    /* Num Coded sblk histogram init */
+    ihevcd_init_sblk_histogram(&gs_ihevcd_stat.stat_num_coded_sblk_histogram);
+    /* Num Coded coeffs histogram init */
+    ihevcd_init_coeff_histogram(&gs_ihevcd_stat.stat_num_coded_coeff_histogram);
+    /* Last sblk position init */
+    ihevcd_sblk_pos_init();
+
+}
+
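+/* Prints one table of per-transform-size counts: "Percentage" is the share of
+ * pixels covered by each transform size within this statistic, while
+ * "%wrt_total" is the block count relative to NUM_ALL_TRANSFORM_BLOCKS for the
+ * same size. */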
+void ihevcd_print_stat_trans(stat_trans_t *ps_stat_trans)
+{
+    WORD32 total_pixels_y, total_pixels_uv;
+    double y_ratio, y_ratio_total, uv_ratio, uv_ratio_total;
+    stat_trans_t *ps_stat_trans_all;
+    total_pixels_y = ps_stat_trans[0].num_4x4_dst * 4 * 4 +
+                    ps_stat_trans[0].num_4x4 * 4 * 4 +
+                    ps_stat_trans[0].num_8x8 * 8 * 8 +
+                    ps_stat_trans[0].num_16x16 * 16 * 16 +
+                    ps_stat_trans[0].num_32x32 * 32 * 32 +
+                    ps_stat_trans[0].num_64x64 * 64 * 64;
+
+    total_pixels_uv = ps_stat_trans[1].num_4x4_dst * 4 * 4 +
+                    ps_stat_trans[1].num_4x4  * 4 * 4 +
+                    ps_stat_trans[1].num_8x8  * 8 * 8 +
+                    ps_stat_trans[1].num_16x16 * 16 * 16 +
+                    ps_stat_trans[1].num_32x32 * 32 * 32 +
+                    ps_stat_trans[1].num_64x64 * 64 * 64;
+
+    ps_stat_trans_all = &gs_ihevcd_stat.stat_num_all_trans_block[0];
+
+    printf("\n_                   Y               Y            Y                U+V             U+V             U+V");
+    printf("\nTransform_Type      Num_Blocks      Percentage   %%wrt_total      Num_Blocks     Percentage   %%wrt_total ");
+
+    y_ratio = ps_stat_trans[0].num_4x4_dst * 4 * 4 * 100.0 / total_pixels_y;
+    y_ratio_total = ps_stat_trans[0].num_4x4_dst * 100.0 / ps_stat_trans_all[0].num_4x4_dst;
+    uv_ratio = ps_stat_trans[1].num_4x4_dst * 4 * 4 * 100.0 / total_pixels_uv;
+    uv_ratio_total = ps_stat_trans[1].num_4x4_dst * 100.0 / ps_stat_trans_all[1].num_4x4_dst;
+    printf("\nDST_4x4             %6d             %6.2f        %6.2f        %6d         %6.2f       %6.2f ", ps_stat_trans[0].num_4x4_dst, y_ratio, y_ratio_total, ps_stat_trans[1].num_4x4_dst, uv_ratio,  uv_ratio_total);
+
+    y_ratio = ps_stat_trans[0].num_4x4 * 4 * 4 * 100.0 / total_pixels_y;
+    y_ratio_total = ps_stat_trans[0].num_4x4 * 100.0 / ps_stat_trans_all[0].num_4x4;
+    uv_ratio = ps_stat_trans[1].num_4x4 * 4 * 4 * 100.0 / total_pixels_uv;
+    uv_ratio_total = ps_stat_trans[1].num_4x4 * 100.0 / ps_stat_trans_all[1].num_4x4;
+    printf("\nDCT_4x4             %6d             %6.2f        %6.2f        %6d         %6.2f       %6.2f ", ps_stat_trans[0].num_4x4, y_ratio, y_ratio_total, ps_stat_trans[1].num_4x4, uv_ratio,  uv_ratio_total);
+
+
+    y_ratio = ps_stat_trans[0].num_8x8 * 8 * 8 * 100.0 / total_pixels_y;
+    y_ratio_total = ps_stat_trans[0].num_8x8 * 100.0 / ps_stat_trans_all[0].num_8x8;
+    uv_ratio = ps_stat_trans[1].num_8x8 * 8 * 8 * 100.0 / total_pixels_uv;
+    uv_ratio_total = ps_stat_trans[1].num_8x8 * 100.0 / ps_stat_trans_all[1].num_8x8;
+    printf("\nDCT_8x8             %6d             %6.2f        %6.2f        %6d         %6.2f       %6.2f ", ps_stat_trans[0].num_8x8, y_ratio, y_ratio_total, ps_stat_trans[1].num_8x8, uv_ratio,  uv_ratio_total);
+
+    y_ratio = ps_stat_trans[0].num_16x16 * 16 * 16 * 100.0 / total_pixels_y;
+    y_ratio_total = ps_stat_trans[0].num_16x16 * 100.0 / ps_stat_trans_all[0].num_16x16;
+    uv_ratio = ps_stat_trans[1].num_16x16 * 16 * 16 * 100.0 / total_pixels_uv;
+    uv_ratio_total = ps_stat_trans[1].num_16x16 * 100.0 / ps_stat_trans_all[1].num_16x16;
+    printf("\nDCT_16x16           %6d             %6.2f        %6.2f        %6d         %6.2f       %6.2f ", ps_stat_trans[0].num_16x16, y_ratio, y_ratio_total, ps_stat_trans[1].num_16x16, uv_ratio,  uv_ratio_total);
+
+
+    y_ratio = ps_stat_trans[0].num_32x32 * 32 * 32 * 100.0 / total_pixels_y;
+    y_ratio_total = ps_stat_trans[0].num_32x32 * 100.0 / ps_stat_trans_all[0].num_32x32;
+    uv_ratio = ps_stat_trans[1].num_32x32 * 32 * 32 * 100.0 / total_pixels_uv;
+    uv_ratio_total = ps_stat_trans[1].num_32x32 * 100.0 / ps_stat_trans_all[1].num_32x32;
+    printf("\nDCT_32x32           %6d             %6.2f        %6.2f        %6d         %6.2f       %6.2f ", ps_stat_trans[0].num_32x32, y_ratio, y_ratio_total, ps_stat_trans[1].num_32x32, uv_ratio,  uv_ratio_total);
+
+
+    y_ratio = ps_stat_trans[0].num_64x64 * 64 * 64 * 100.0 / total_pixels_y;
+    y_ratio_total = ps_stat_trans[0].num_64x64 * 100.0 / ps_stat_trans_all[0].num_64x64;
+    uv_ratio = ps_stat_trans[1].num_64x64 * 64 * 64 * 100.0 / total_pixels_uv;
+    uv_ratio_total = ps_stat_trans[1].num_64x64 * 100.0 / ps_stat_trans_all[1].num_64x64;
+    printf("\nDCT_64x64           %6d             %6.2f        %6.2f        %6d         %6.2f       %6.2f ", ps_stat_trans[0].num_64x64, y_ratio, y_ratio_total, ps_stat_trans[1].num_64x64, uv_ratio,  uv_ratio_total);
+
+}
+
+void ihevcd_update_stat_num_trans(stat_trans_t *ps_stat_trans, TRANSFORM_TYPE e_trans_type)
+{
+    switch(e_trans_type)
+    {
+        case DST_4x4:
+            ps_stat_trans->num_4x4_dst++;
+            break;
+        case DCT_4x4:
+            ps_stat_trans->num_4x4++;
+            break;
+        case DCT_8x8:
+            ps_stat_trans->num_8x8++;
+            break;
+        case DCT_16x16:
+            ps_stat_trans->num_16x16++;
+            break;
+        case DCT_32x32:
+            ps_stat_trans->num_32x32++;
+            break;
+        case SKIP_64x64:
+            ps_stat_trans->num_64x64++;
+            break;
+        default:
+            break;
+    }
+}
+
+void ihevcd_update_num_all_trans_blocks(TRANSFORM_TYPE e_trans_type, WORD32 c_idx)
+{
+    stat_trans_t *ps_stat_trans;
+
+    ps_stat_trans = &gs_ihevcd_stat.stat_num_all_trans_block[0];
+
+    if(c_idx != 0)
+    {
+        ps_stat_trans++;
+    }
+    ihevcd_update_stat_num_trans(ps_stat_trans, e_trans_type);
+}
+
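+/* update_type selects the counter set: 0 = coded blocks, 1 = DC-only blocks,
+ * 2 = single-coefficient blocks. */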
+void ihevcd_update_num_trans_blocks(TRANSFORM_TYPE e_trans_type, WORD32 c_idx, WORD32 update_type)
+{
+    stat_trans_t *ps_stat_trans;
+
+    if(0 == update_type)
+        ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_trans_block[0];
+    else if(1 == update_type)
+        ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_dc_block[0];
+    else
+        ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_one_coeff_block[0];
+
+    if(c_idx != 0)
+    {
+        ps_stat_trans++;
+    }
+    ihevcd_update_stat_num_trans(ps_stat_trans, e_trans_type);
+}
+
+void ihevcd_print_sblk_histogram_per_transform(UWORD32 *pu4_stat, UWORD32 wd, UWORD32 ht, WORD32 is_2d)
+{
+    UWORD32 i, j, total = 0, val;
+
+    for(i = 0; i < ht; i++)
+    {
+        for(j = 0; j < wd; j++)
+        {
+            val = pu4_stat[j + i * wd];
+            printf("%d\t\t", val);
+            total += val;
+        }
+        if(1 == is_2d)
+            printf("\n");
+    }
+
+    {
+        printf("\n");
+        for(i = 0; i < ht; i++)
+        {
+            for(j = 0; j < wd; j++)
+            {
+                val = pu4_stat[j + i * wd];
+
+                printf("%.2f%%\t\t", val * 100.0 / total);
+            }
+            if(1 == is_2d)
+                printf("\n");
+        }
+    }
+}
+
+void ihevcd_print_sblk_histogram(stat_sblk_histogram_t *ps_stat_sblk_pos_histogram, WORD32 is_2d)
+{
+    printf("\nhistogram_4x4_DST\n");
+    ihevcd_print_sblk_histogram_per_transform(ps_stat_sblk_pos_histogram->trans_4x4_dst, 1, 1, is_2d);
+    printf("\nhistogram_4x4\n");
+    ihevcd_print_sblk_histogram_per_transform(ps_stat_sblk_pos_histogram->trans_4x4, 1, 1, is_2d);
+    printf("\nhistogram_8x8\n");
+    ihevcd_print_sblk_histogram_per_transform(ps_stat_sblk_pos_histogram->trans_8x8, 2, 2, is_2d);
+    printf("\nhistogram_16x16\n");
+    ihevcd_print_sblk_histogram_per_transform(ps_stat_sblk_pos_histogram->trans_16x16, 4, 4, is_2d);
+    printf("\nhistogram_32x32\n");
+    ihevcd_print_sblk_histogram_per_transform(ps_stat_sblk_pos_histogram->trans_32x32, 8, 8, is_2d);
+}
+
+void ihevcd_print_coeff_histogram(stat_coeff_histogram_t *ps_stat_coeff_histogram, WORD32 is_2d)
+{
+    printf("\nhistogram_4x4_DST\n");
+    ihevcd_print_sblk_histogram_per_transform(ps_stat_coeff_histogram->trans_4x4_dst, 4, 4, is_2d);
+    printf("\nhistogram_4x4\n");
+    ihevcd_print_sblk_histogram_per_transform(ps_stat_coeff_histogram->trans_4x4, 4, 4, is_2d);
+    printf("\nhistogram_8x8\n");
+    ihevcd_print_sblk_histogram_per_transform(ps_stat_coeff_histogram->trans_8x8, 8, 8, is_2d);
+    printf("\nhistogram_16x16\n");
+    ihevcd_print_sblk_histogram_per_transform(ps_stat_coeff_histogram->trans_16x16, 16, 16, is_2d);
+    printf("\nhistogram_32x32\n");
+    ihevcd_print_sblk_histogram_per_transform(ps_stat_coeff_histogram->trans_32x32, 32, 32, is_2d);
+}
+void ihevcd_print_transform_statistics()
+{
+    stat_trans_t *ps_stat_trans;
+    WORD32 total_blocks;
+
+    /* Num coded_transform blocks */
+    printf("\nNUM_ALL_TRANSFORM_BLOCKS\n");
+    ps_stat_trans = &gs_ihevcd_stat.stat_num_all_trans_block[0];
+    {
+        /* Derive chroma block counts here, since chroma blocks are not counted when the CBFs of Y, U and V are all zero */
+        ps_stat_trans[1].num_4x4 = (ps_stat_trans[0].num_4x4_dst + ps_stat_trans[0].num_4x4) / 4 + ps_stat_trans->num_8x8;
+        ps_stat_trans[1].num_8x8 = ps_stat_trans->num_16x16;
+        ps_stat_trans[1].num_16x16 = ps_stat_trans->num_32x32;
+        ps_stat_trans[1].num_32x32 = ps_stat_trans->num_64x64;
+    }
+    ihevcd_print_stat_trans(ps_stat_trans);
+
+    /* Num coded_transform blocks */
+    printf("\nNUM_CODED_TRANSFORM_BLOCKS(excluding_trans_skip_and_trans_quant_bypass)\n");
+    ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_trans_block[0];
+    ihevcd_print_stat_trans(ps_stat_trans);
+
+    /* Num DC transform blocks */
+    printf("\nNUM_DC_TRANSFORM_BLOCKS(excluding_trans_skip_and_trans_quant_bypass)\n");
+    ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_dc_block[0];
+    ihevcd_print_stat_trans(ps_stat_trans);
+
+    /* Num one coeff transform blocks */
+    printf("\nNUM_ONE_COEFF_TRANSFORM_BLOCKS(excluding_trans_skip_and_trans_quant_bypass)\n");
+    ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_one_coeff_block[0];
+    ihevcd_print_stat_trans(ps_stat_trans);
+
+    /* Last sblk histogram */
+    printf("\nLAST_CODED_SBLK_HISTOGRAM\n");
+    ihevcd_print_sblk_histogram(&gs_ihevcd_stat.stat_last_sblk_pos_histogram, 1);
+
+    /* Num Coded sblks histogram */
+    printf("\nNUM_CODED_SBLK_HISTOGRAM\n");
+    ihevcd_print_sblk_histogram(&gs_ihevcd_stat.stat_num_coded_sblk_histogram, 1);
+
+    /* Num Coded coeff histogram */
+    printf("\nNUM_CODED_COEFF_HISTOGRAM\n");
+    ihevcd_print_coeff_histogram(&gs_ihevcd_stat.stat_num_coded_coeff_histogram, 1);
+}
+
+void ihevcd_update_sblk_histogram(stat_sblk_histogram_t *ps_last_sblk_pos_histogram, TRANSFORM_TYPE e_trans_type,  WORD32 last_sblk_x, WORD32 last_sblk_y)
+{
+    switch(e_trans_type)
+    {
+        case DST_4x4:
+            /* A 4x4 transform has a single 4x4 sub-block, so the row stride is 1 and the index is always 0 */
+            ps_last_sblk_pos_histogram->trans_4x4_dst[last_sblk_x + last_sblk_y * 1]++;
+            break;
+        case DCT_4x4:
+            ps_last_sblk_pos_histogram->trans_4x4[last_sblk_x + last_sblk_y * 1]++;
+            break;
+        case DCT_8x8:
+            ps_last_sblk_pos_histogram->trans_8x8[last_sblk_x + last_sblk_y * 2]++;
+            break;
+        case DCT_16x16:
+            ps_last_sblk_pos_histogram->trans_16x16[last_sblk_x + last_sblk_y * 4]++;
+            break;
+        case DCT_32x32:
+            ps_last_sblk_pos_histogram->trans_32x32[last_sblk_x + last_sblk_y * 8]++;
+            break;
+        default:
+            break;
+    }
+}
+
+void ihevcd_update_num_coded_sblk_histogram(stat_sblk_histogram_t *ps_sblk_histogram, TRANSFORM_TYPE e_trans_type,  WORD32 num_coded_blks)
+{
+    switch(e_trans_type)
+    {
+        case DST_4x4:
+            ps_sblk_histogram->trans_4x4_dst[num_coded_blks - 1]++;
+            break;
+        case DCT_4x4:
+            ps_sblk_histogram->trans_4x4[num_coded_blks - 1]++;
+            break;
+        case DCT_8x8:
+            ps_sblk_histogram->trans_8x8[num_coded_blks - 1]++;
+            break;
+        case DCT_16x16:
+            ps_sblk_histogram->trans_16x16[num_coded_blks - 1]++;
+            break;
+        case DCT_32x32:
+            ps_sblk_histogram->trans_32x32[num_coded_blks - 1]++;
+            break;
+        default:
+            break;
+    }
+}
+
+void ihevcd_update_num_coded_coeff_histogram(stat_coeff_histogram_t *ps_coeff_histogram, TRANSFORM_TYPE e_trans_type,  WORD32 num_coded_blks)
+{
+    switch(e_trans_type)
+    {
+        case DST_4x4:
+            ps_coeff_histogram->trans_4x4_dst[num_coded_blks - 1]++;
+            break;
+        case DCT_4x4:
+            ps_coeff_histogram->trans_4x4[num_coded_blks - 1]++;
+            break;
+        case DCT_8x8:
+            ps_coeff_histogram->trans_8x8[num_coded_blks - 1]++;
+            break;
+        case DCT_16x16:
+            ps_coeff_histogram->trans_16x16[num_coded_blks - 1]++;
+            break;
+        case DCT_32x32:
+            ps_coeff_histogram->trans_32x32[num_coded_blks - 1]++;
+            break;
+        default:
+            break;
+    }
+}
+
+void ihevcd_sblk_pos_update(TRANSFORM_TYPE e_trans_type, WORD32 t_skip_or_tq_bypass, UWORD32 sblk_x, UWORD32 sblk_y)
+{
+    if(1 == t_skip_or_tq_bypass)
+        return;
+
+    gs_ihevcd_stat.num_coded_sblk++;
+
+    /* Updating the last coded sblk pos */
+#if 0
+    if(gs_ihevcd_stat.last_sblk_pos_y > sblk_y)
+        return;
+    else if(gs_ihevcd_stat.last_sblk_pos_y == sblk_y)
+    {
+        if(gs_ihevcd_stat.last_sblk_pos_x >= sblk_x)
+            return;
+        else
+            gs_ihevcd_stat.last_sblk_pos_x = sblk_x;
+    }
+    else
+    {
+        gs_ihevcd_stat.last_sblk_pos_y = sblk_y;
+        gs_ihevcd_stat.last_sblk_pos_x = sblk_x;
+    }
+#endif
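+    /* The active code below tracks the maximum x and maximum y of coded
+     * sub-blocks independently, i.e. the corner of the bounding box of all
+     * coded sub-blocks; this is not necessarily the raster-scan-last coded
+     * sub-block that the disabled code above would compute. */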
+    if(gs_ihevcd_stat.last_sblk_pos_y < sblk_y)
+        gs_ihevcd_stat.last_sblk_pos_y = sblk_y;
+
+    if(gs_ihevcd_stat.last_sblk_pos_x < sblk_x)
+        gs_ihevcd_stat.last_sblk_pos_x = sblk_x;
+}
+
+void ihevcd_update_coeff_count()
+{
+    gs_ihevcd_stat.num_coded_coeffs++;
+}
+
+void ihevcd_update_sblk_and_coeff_histogram(TRANSFORM_TYPE e_trans_type, WORD32 t_skip_or_tq_bypass)
+{
+    if(0 == t_skip_or_tq_bypass)
+    {
+        ihevcd_update_sblk_histogram(&gs_ihevcd_stat.stat_last_sblk_pos_histogram, e_trans_type, gs_ihevcd_stat.last_sblk_pos_x, gs_ihevcd_stat.last_sblk_pos_y);
+        ihevcd_update_num_coded_sblk_histogram(&gs_ihevcd_stat.stat_num_coded_sblk_histogram, e_trans_type, gs_ihevcd_stat.num_coded_sblk);
+        ihevcd_update_num_coded_coeff_histogram(&gs_ihevcd_stat.stat_num_coded_coeff_histogram, e_trans_type, gs_ihevcd_stat.num_coded_coeffs);
+    }
+}
+
+void ihevcd_update_pu_skip_size(pu_t *ps_pu)
+{
+    WORD32 wd, ht;
+
+    wd = (ps_pu->b4_wd);
+    ht = (ps_pu->b4_ht);
+
+    gs_ihevcd_stat.stat_pu_skip_size_hist[wd][ht]++;
+}
+
+void ihevcd_update_pu_size(pu_t *ps_pu)
+{
+    WORD32 wd, ht;
+
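+    /* b4_wd and b4_ht appear to hold the PU width and height in 4-pixel
+     * units minus 1 (0..15); this is consistent with the 16x16 histogram
+     * bins and the (i + 1) * (j + 1) * 16 area weighting used when printing */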
+    wd = (ps_pu->b4_wd);
+    ht = (ps_pu->b4_ht);
+    gs_ihevcd_stat.stat_pu_all_size_hist[wd][ht]++;
+    if(ps_pu->b1_intra_flag)
+    {
+        gs_ihevcd_stat.stat_pu_intra_size_hist[wd][ht]++;
+    }
+    else
+    {
+        gs_ihevcd_stat.stat_pu_inter_size_hist[wd][ht]++;
+
+        if(ps_pu->b1_merge_flag)
+            gs_ihevcd_stat.stat_pu_merge_size_hist[wd][ht]++;
+
+        if(ps_pu->b2_pred_mode == PRED_BI)
+            gs_ihevcd_stat.stat_pu_bipred_size_hist[wd][ht]++;
+
+        switch(ps_pu->b2_pred_mode)
+        {
+            case PRED_L0:
+                if((ps_pu->mv.s_l0_mv.i2_mvx == 0) &&
+                   (ps_pu->mv.s_l0_mv.i2_mvy == 0))
+                {
+                    gs_ihevcd_stat.stat_pu_zeromv_size_hist[wd][ht]++;
+                }
+
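+                /* MVs are in quarter-pel units, so |mv| < 4 means a
+                 * magnitude below one full pel in that direction */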
+                if((ABS(ps_pu->mv.s_l0_mv.i2_mvx) < 4) &&
+                   (ABS(ps_pu->mv.s_l0_mv.i2_mvy) < 4))
+                {
+                    gs_ihevcd_stat.stat_pu_zeromvfpel_size_hist[wd][ht]++;
+                }
+
+                break;
+
+            case PRED_L1:
+                if((ps_pu->mv.s_l1_mv.i2_mvx == 0) &&
+                   (ps_pu->mv.s_l1_mv.i2_mvy == 0))
+                {
+                    gs_ihevcd_stat.stat_pu_zeromv_size_hist[wd][ht]++;
+                }
+
+                if((ABS(ps_pu->mv.s_l1_mv.i2_mvx) < 4) &&
+                   (ABS(ps_pu->mv.s_l1_mv.i2_mvy) < 4))
+                {
+                    gs_ihevcd_stat.stat_pu_zeromvfpel_size_hist[wd][ht]++;
+                }
+                break;
+
+
+            case PRED_BI:
+                if((ps_pu->mv.s_l0_mv.i2_mvx == 0) &&
+                   (ps_pu->mv.s_l0_mv.i2_mvy == 0) &&
+                   (ps_pu->mv.s_l1_mv.i2_mvx == 0) &&
+                   (ps_pu->mv.s_l1_mv.i2_mvy == 0))
+                {
+                    gs_ihevcd_stat.stat_pu_zeromv_size_hist[wd][ht]++;
+                }
+                if((ABS(ps_pu->mv.s_l0_mv.i2_mvx) < 4) &&
+                   (ABS(ps_pu->mv.s_l0_mv.i2_mvy) < 4) &&
+                   (ABS(ps_pu->mv.s_l1_mv.i2_mvx) < 4) &&
+                   (ABS(ps_pu->mv.s_l1_mv.i2_mvy) < 4))
+                {
+                    gs_ihevcd_stat.stat_pu_zeromvfpel_size_hist[wd][ht]++;
+                }
+
+                break;
+
+        }
+    }
+}
+
+
+void ihevcd_print_pu_size_hist(UWORD32 *pu4_buf)
+{
+    WORD32 i, j;
+
+
+    for(i = 0; i < (MAX_CTB_SIZE / MIN_PU_SIZE); i++)
+    {
+        for(j = 0; j < (MAX_CTB_SIZE / MIN_PU_SIZE); j++)
+        {
+            printf("%12d ", pu4_buf[j]);
+        }
+        pu4_buf += (MAX_CTB_SIZE / MIN_PU_SIZE);
+        printf("\n");
+    }
+}
+
+void ihevcd_print_pu_size_hist_normalized(UWORD32 *pu4_buf)
+{
+    WORD32 i, j;
+    WORD32 sum;
+    UWORD32 *pu4_buf_orig = pu4_buf;
+    sum = 0;
+
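+    /* Weight each histogram bin by the PU area in pixels:
+     * ((i + 1) * 4) * ((j + 1) * 4) = (i + 1) * (j + 1) * 16.
+     * The per-bin percentages printed below are therefore fractions of the
+     * total PU area rather than of the PU count */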
+    for(i = 0; i < (MAX_CTB_SIZE / MIN_PU_SIZE); i++)
+    {
+        for(j = 0; j < (MAX_CTB_SIZE / MIN_PU_SIZE); j++)
+        {
+            sum += pu4_buf[j] * (i + 1) * (j + 1) * 16;
+        }
+        pu4_buf += (MAX_CTB_SIZE / MIN_PU_SIZE);
+    }
+
+    pu4_buf = pu4_buf_orig;
+    for(i = 0; i < (MAX_CTB_SIZE / MIN_PU_SIZE); i++)
+    {
+        for(j = 0; j < (MAX_CTB_SIZE / MIN_PU_SIZE); j++)
+        {
+            double num = pu4_buf[j] * (i + 1) * (j + 1) * 16 * 100.0;
+            printf("%6.2f ", (sum > 0) ? (num / sum) : 0.0);
+        }
+        pu4_buf += (MAX_CTB_SIZE / MIN_PU_SIZE);
+        printf("\n");
+    }
+}
+
+void ihevcd_print_pu_size_hist_percentage(UWORD32 *pu4_num, UWORD32 *pu4_denom)
+{
+    WORD32 i, j;
+
+
+    for(i = 0; i < (MAX_CTB_SIZE / MIN_PU_SIZE); i++)
+    {
+        for(j = 0; j < (MAX_CTB_SIZE / MIN_PU_SIZE); j++)
+        {
+            double val;
+            val = 0;
+            if(pu4_denom[j])
+            {
+                val = (pu4_num[j] * 100.0) / pu4_denom[j];
+                printf("%6.2f ", val);
+            }
+            else
+            {
+                if(0 == pu4_num[j])
+                    printf("%6.2f ", 0.0);
+                else
+                    printf("NaN   ");
+            }
+        }
+        pu4_num += (MAX_CTB_SIZE / MIN_PU_SIZE);
+        pu4_denom += (MAX_CTB_SIZE / MIN_PU_SIZE);
+        printf("\n");
+    }
+}
+
+void ihevcd_print_pu_statistics()
+{
+
+    printf("\n\nPU Sizes\n\n");
+    ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Intra\n\n");
+    ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_intra_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Inter\n\n");
+    ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_inter_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Skip\n\n");
+    ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_skip_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Merge\n\n");
+    ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_merge_size_hist[0][0]);
+
+    printf("\n\nPU Sizes BiPred\n\n");
+    ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_bipred_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Zero MV\n\n");
+    ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_zeromv_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Zero MV including subpel MV less than +/- 1 in fullpel units\n\n");
+    ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_zeromvfpel_size_hist[0][0]);
+
+    printf("\n\nPU Sizes percentage \n\n");
+    ihevcd_print_pu_size_hist_normalized(&gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Intra ratio w.r.t all PUs\n\n");
+    ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_intra_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Inter ratio w.r.t all PUs\n\n");
+    ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_inter_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Skip ratio w.r.t all PUs\n\n");
+    ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_skip_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Merge ratio w.r.t all PUs\n\n");
+    ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_merge_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+    printf("\n\nPU Sizes BiPred ratio w.r.t all PUs\n\n");
+    ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_bipred_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Zero MV ratio w.r.t all PUs\n\n");
+    ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_zeromv_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+    printf("\n\nPU Sizes Zero MV including subpel MV less than +/- 1 in fullpel units ratio w.r.t all PUs\n\n");
+    ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_zeromvfpel_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+}
+
+void ihevcd_print_statistics()
+{
+    ihevcd_print_transform_statistics();
+    ihevcd_print_pu_statistics();
+}
+#endif
diff --git a/decoder/ihevcd_statistics.h b/decoder/ihevcd_statistics.h
new file mode 100644
index 0000000..58f35d6
--- /dev/null
+++ b/decoder/ihevcd_statistics.h
@@ -0,0 +1,149 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_statistics.h
+*
+* @brief
+*  Contains macros for generating statistics about the HEVC decoder
+*
+* @author
+*  Naveen SR
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_STATISTICS_H_
+#define _IHEVCD_STATISTICS_H_
+
+#include <stdio.h>
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+
+#include "ihevc_cabac_tables.h"
+#include "ihevcd_defs.h"
+
+#include "ihevcd_structs.h"
+#include "ihevcd_iquant_itrans_recon_ctb.h"
+#include "ihevcd_statistics.h"
+
+#define STATISTICS_ENABLE 0
+
+#if STATISTICS_ENABLE
+
+typedef struct
+{
+    UWORD32 num_4x4_dst;
+    UWORD32 num_4x4;
+    UWORD32 num_8x8;
+    UWORD32 num_16x16;
+    UWORD32 num_32x32;
+    UWORD32 num_64x64;
+}stat_trans_t;
+
+typedef struct
+{
+    /* Histogram bins, one per 4x4 sub-block position of the transform */
+    UWORD32 trans_4x4_dst[1];
+    UWORD32 trans_4x4[1];
+    UWORD32 trans_8x8[4];
+    UWORD32 trans_16x16[16];
+    UWORD32 trans_32x32[64];
+}stat_sblk_histogram_t;
+
+typedef struct
+{
+    /* Histogram bins, one per coded coefficient count of the transform */
+    UWORD32 trans_4x4_dst[16];
+    UWORD32 trans_4x4[16];
+    UWORD32 trans_8x8[64];
+    UWORD32 trans_16x16[256];
+    UWORD32 trans_32x32[1024];
+}stat_coeff_histogram_t;
+
+typedef struct
+{
+    stat_trans_t stat_num_all_trans_block[2]; /* Y and UV */
+    stat_trans_t stat_num_coded_trans_block[2]; /* Y and UV */
+    stat_trans_t stat_num_coded_dc_block[2]; /* Y and UV */
+    stat_trans_t stat_num_coded_one_coeff_block[2]; /* Y and UV */
+    stat_sblk_histogram_t stat_last_sblk_pos_histogram; /* Y + UV */
+    stat_sblk_histogram_t stat_num_coded_sblk_histogram; /* Y + UV */
+    stat_coeff_histogram_t stat_num_coded_coeff_histogram; /* Y + UV */
+    UWORD32   stat_pu_all_size_hist[16][16]; /* PU Sizes [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+    UWORD32   stat_pu_skip_size_hist[16][16]; /* PU sizes for skip [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+    UWORD32   stat_pu_inter_size_hist[16][16]; /* PU sizes for inter [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+    UWORD32   stat_pu_intra_size_hist[16][16]; /* PU sizes for intra [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+    UWORD32   stat_pu_bipred_size_hist[16][16]; /* PU sizes for bipred [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+    UWORD32   stat_pu_merge_size_hist[16][16]; /* PU sizes for merge [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+    UWORD32   stat_pu_zeromv_size_hist[16][16]; /* PU sizes for Zero MV [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+    UWORD32   stat_pu_zeromvfpel_size_hist[16][16]; /* PU sizes for Zero MV (includes subpel less than +/- 1 full pel units [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+    UWORD32 last_sblk_pos_x; /* Last sblk pos of transform block in processing */
+    UWORD32 last_sblk_pos_y;
+    UWORD32 num_coded_sblk;
+    UWORD32 num_coded_coeffs;
+}statistics_t;
+
+void ihevcd_update_num_all_trans_blocks(TRANSFORM_TYPE e_trans_type, WORD32 c_idx);
+void ihevcd_update_num_trans_blocks(TRANSFORM_TYPE e_trans_type, WORD32 c_idx, WORD32 update_type);
+void ihevcd_update_sblk_and_coeff_histogram(TRANSFORM_TYPE e_trans_type, WORD32 t_skip_or_tq_bypass);
+void ihevcd_sblk_pos_init();
+void ihevcd_sblk_pos_update(TRANSFORM_TYPE e_trans_type, WORD32 t_skip_or_tq_bypass, UWORD32 sblk_x, UWORD32 sblk_y);
+void ihevcd_init_statistics();
+void ihevcd_print_transform_statistics();
+void ihevcd_print_statistics();
+void ihevcd_update_coeff_count();
+void ihevcd_update_pu_size(pu_t *ps_pu);
+void ihevcd_update_pu_skip_size(pu_t *ps_pu);
+#endif //STATISTICS_ENABLE
+
+#if STATISTICS_ENABLE
+#define STATS_INIT()  ihevcd_init_statistics();
+#define STATS_UPDATE_ALL_TRANS(e_trans_type, c_idx) ihevcd_update_num_all_trans_blocks(e_trans_type, c_idx);
+#define STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, update_type) ihevcd_update_num_trans_blocks(e_trans_type, c_idx, update_type);
+#define STATS_PRINT() ihevcd_print_statistics();
+#define STATS_INIT_SBLK_AND_COEFF_POS() ihevcd_sblk_pos_init();
+#define STATS_LAST_SBLK_POS_UPDATE(e_trans_type, t_skip_or_tq_bypass, sblk_x, sblk_y) ihevcd_sblk_pos_update(e_trans_type, t_skip_or_tq_bypass, sblk_x, sblk_y);
+#define STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, t_skip_or_tq_bypass) ihevcd_update_sblk_and_coeff_histogram(e_trans_type, t_skip_or_tq_bypass);
+#define STATS_UPDATE_COEFF_COUNT() ihevcd_update_coeff_count();
+#define STATS_UPDATE_PU_SIZE(ps_pu) ihevcd_update_pu_size(ps_pu);
+#define STATS_UPDATE_PU_SKIP_SIZE(ps_pu) ihevcd_update_pu_skip_size(ps_pu);
+#else
+#define STATS_INIT()  ;
+#define STATS_UPDATE_ALL_TRANS(e_trans_type, c_idx) ;
+#define STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, update_type) ;
+#define STATS_PRINT() ;
+#define STATS_INIT_SBLK_AND_COEFF_POS() ;
+#define STATS_LAST_SBLK_POS_UPDATE(e_trans_type, t_skip_or_tq_bypass, sblk_x, sblk_y) ;
+#define STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, t_skip_or_tq_bypass) ;
+#define STATS_UPDATE_COEFF_COUNT() ;
+#define STATS_UPDATE_PU_SIZE(ps_pu) ;
+#define STATS_UPDATE_PU_SKIP_SIZE(ps_pu) ;
+#endif
+
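+/* Illustrative usage sketch (hypothetical call sites; the real hooks live in
+ * the decoder's parse/process loops). With STATISTICS_ENABLE set to 0 every
+ * macro expands to an empty statement, so the hooks compile away:
+ *
+ *     STATS_INIT();
+ *     // per coded sub-block, while parsing residuals:
+ *     STATS_LAST_SBLK_POS_UPDATE(e_trans_type, t_skip_or_tq_bypass, sblk_x, sblk_y);
+ *     // once per transform block, after its last sub-block:
+ *     STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, t_skip_or_tq_bypass);
+ *     // at end of stream:
+ *     STATS_PRINT();
+ */
+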
+#endif /* _IHEVCD_STATISTICS_H_ */
diff --git a/decoder/ihevcd_structs.h b/decoder/ihevcd_structs.h
new file mode 100644
index 0000000..00e9a49
--- /dev/null
+++ b/decoder/ihevcd_structs.h
@@ -0,0 +1,2286 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_structs.h
+ *
+ * @brief
+ *  Structure definitions used in the decoder
+ *
+ * @author
+ *  Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVCD_STRUCTS_H_
+#define _IHEVCD_STRUCTS_H_
+typedef enum
+{
+    INIT_DONE, HEADER_DONE, FIRST_FRAME_DONE,
+}CODEC_STATE_T;
+
+
+
+typedef struct _codec_t codec_t;
+
+/** Structure to hold format conversion context */
+typedef struct
+{
+    /** Current row for which format conversion should be done */
+    WORD32 i4_cur_row;
+
+    /** Number of rows for which format conversion should be done */
+    WORD32 i4_num_rows;
+}fmt_conv_t;
+
+/**
+ * Bitstream structure
+ */
+typedef struct
+{
+    /**
+     * Bitstream buffer base pointer
+     */
+    UWORD8 *pu1_buf_base;
+
+    /**
+     * Bitstream bit offset in current word. Value between 0 and 31
+     */
+    UWORD32 u4_bit_ofst;
+
+    /**
+     * Current bitstream buffer pointer
+     */
+    UWORD32 *pu4_buf;
+
+    /**
+     * Current word
+     */
+    UWORD32 u4_cur_word;
+
+    /**
+     * Next word
+     */
+    UWORD32 u4_nxt_word;
+
+    /**
+     * Max address for bitstream
+     */
+    UWORD8 *pu1_buf_max;
+}bitstrm_t;
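+
+/* A minimal sketch of how such a two-word bitstream cache is typically
+ * consumed, assuming hypothetical variables (the decoder's actual
+ * bit-reading routines are defined elsewhere):
+ *
+ *     // read the next n (1..31) bits, MSB first:
+ *     val = (u4_cur_word << u4_bit_ofst) >> (32 - n);
+ *     if(u4_bit_ofst + n > 32)  // the read straddles the two cached words
+ *         val |= u4_nxt_word >> (64 - u4_bit_ofst - n);
+ *     u4_bit_ofst += n;  // on crossing 32: cur <= nxt, refill nxt from pu4_buf
+ */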
+
+/**
+******************************************************************************
+ *  @brief      Cabac context for decoder
+******************************************************************************
+ */
+typedef struct cab_ctxt
+{
+    /*********************************************************************/
+    /*  CABAC ENGINE related fields                                      */
+    /*********************************************************************/
+    /** cabac interval range  R */
+    UWORD32  u4_range;
+
+    /** cabac interval offset O  */
+    UWORD32  u4_ofst;
+
+    /*********************************************************************/
+    /*  CABAC context models                                             */
+    /*********************************************************************/
+    /** All context models stored in packed form: pState[bits 6-1] | MPS[bit 0] */
+    UWORD8  au1_ctxt_models[IHEVC_CAB_CTXT_END];
+
+    /** Context models memorized after decoding 2nd CTB in a row to be used
+     * during entropy sync cases
+     */
+    UWORD8 au1_ctxt_models_sync[IHEVC_CAB_CTXT_END];
+
+}cab_ctxt_t;
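+
+/* The packed context model layout above implies the usual CABAC accessors,
+ * sketched here for clarity (hypothetical variable names):
+ *
+ *     state = au1_ctxt_models[ctxt_idx] >> 1;   // 6-bit probability state
+ *     mps   = au1_ctxt_models[ctxt_idx] & 1;    // most probable symbol
+ */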
+
+typedef enum
+{
+    CMD_PROCESS,
+    CMD_FMTCONV,
+}JOBQ_CMD_T;
+
+/**
+ * Structure to represent a processing job entry
+ */
+typedef struct
+{
+    /**
+     * Command
+     * Currently: PROCESS, FMTCONV are the only two jobs
+     */
+    WORD32 i4_cmd;
+    /**
+     * CTB x of the starting CTB
+     */
+    WORD16 i2_ctb_x;
+
+    /**
+     * CTB y of the starting CTB
+     */
+
+    WORD16 i2_ctb_y;
+
+    /**
+     * Number of CTBs that need to be processed in this job
+     */
+    WORD16 i2_ctb_cnt;
+
+    /**
+     *  Slice index for the current CTB
+     */
+    WORD16 i2_slice_idx;
+
+    /**
+     * TU coefficient data offset for the current job
+     */
+    WORD32 i4_tu_coeff_data_ofst;
+#ifdef GPU_BUILD
+    /**
+     * OpenCL Granularity
+     */
+    WORD16 i2_granularity_idx;
+
+    /**
+     * Index to the process context
+     */
+    //WORD16 i2_proc_idx;
+
+    /**
+     * GPU Wait or NOT
+     */
+    WORD16 i2_wait;
+#endif
+}proc_job_t;
+/**
+ * Structure to represent an MV Bank buffer
+ */
+typedef struct
+{
+    /**
+     *  Pointer to hold PU index for each CTB in a picture
+     */
+    UWORD32 *pu4_pic_pu_idx;
+
+    /**
+     * Pointer to hold pu_t for each PU in a picture
+     */
+    pu_t *ps_pic_pu;
+
+    /**
+     * Pointer to hold PU map for each CTB in a picture
+     */
+    UWORD8 *pu1_pic_pu_map;
+
+    /**
+     * Pointer to hold the Slice map
+     */
+    UWORD16 *pu1_pic_slice_map;
+
+    /**
+     * Absolute POC for the current MV Bank
+     */
+    WORD32 i4_abs_poc;
+
+    /**
+     * Absolute POCs of reference List 0 for all slices in the frame from which this frame is reconstructed
+     */
+    WORD32 l0_collocated_poc[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+
+    /**
+     * Flag to indicate Long Term reference for POCs of reference List 0 for all slices in the frame from which this frame is reconstructed
+     */
+    WORD8 u1_l0_collocated_poc_lt[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+
+    /**
+     * Absolute POCs of reference List 1 for all slices in the frame from which this frame is reconstructed
+     */
+    WORD32 l1_collocated_poc[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+    /**
+     * Flag to indicate Long Term reference for POCs of reference List 1 for all slices in the frame from which this frame is reconstructed
+     */
+    WORD8 u1_l1_collocated_poc_lt[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+
+}mv_buf_t;
+
+typedef struct
+{
+    /**
+     * Pointer to current PPS
+     */
+    pps_t *ps_pps;
+
+    /**
+     * Pointer to current SPS
+     */
+    sps_t *ps_sps;
+
+    /**
+     * Pointer to current slice header structure
+     */
+    slice_header_t *ps_slice_hdr;
+
+    /**
+     * CTB's x position within a picture in raster scan in CTB units
+     */
+    WORD32 i4_ctb_x;
+
+    /**
+     * CTB's y position within a picture in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_y;
+
+    /**
+     * Current PU structure - set to CTB pu_t pointer at the start of CTB processing and incremented
+     * for every PU
+     */
+    pu_t *ps_pu;
+
+    /**
+     * Pointer to frame level pu_t for the current frame being parsed
+     * where MVs and Intra pred modes will be updated
+     */
+    pu_t *ps_pic_pu;
+
+    /**
+     * Store the current tile's information. This is needed for the computation of mvs.
+     */
+    tile_t *ps_tile;
+
+    /**
+     * Points to an array of PU indices which is used to identify
+     * start index of pu_t in ps_pic_pu and also to identify number of
+     * PUs in the current CTB by subtracting current idx from next CTB's
+     * PU idx
+     */
+    UWORD32 *pu4_pic_pu_idx;
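+    /* e.g. the PU count of CTB 'ctb_idx' follows directly from this array
+     * (illustrative; the actual index computation lives in the decoder):
+     *     num_pu = pu4_pic_pu_idx[ctb_idx + 1] - pu4_pic_pu_idx[ctb_idx];
+     * with pu4_pic_pu_idx[ctb_idx] giving the start index into ps_pic_pu */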
+
+    /** PU Index map per CTB. The indices in this map are w.r.t picture pu array and not
+     * w.r.t CTB pu array.
+     * This will be used during mv prediction and since neighbours will have different CTB pu map
+     * it will be easier if they all have indices w.r.t picture level PU array rather than CTB level
+     * PU array.
+     * pu1_pic_pu_map is map w.r.t CTB's pu_t array
+     */
+    UWORD32 *pu4_pic_pu_idx_map;
+
+    /**
+      * Pointer to pu_map for the current frame being parsed
+      * where MVs and Intra pred modes will be updated
+      */
+    UWORD8 *pu1_pic_pu_map;
+
+    /**
+     *  PU count in current CTB
+     */
+    WORD32 i4_ctb_pu_cnt;
+
+    /**
+     *  Index of the first PU of the current CTB in the picture PU array
+     */
+    WORD32 i4_ctb_start_pu_idx;
+
+    /**
+     *  Top availability for current CTB level
+     */
+    UWORD8 u1_top_ctb_avail;
+
+    /**
+     *  Top right availability for current CTB level
+     */
+    UWORD8 u1_top_rt_ctb_avail;
+    /**
+     *  Top left availability for current CTB level
+     */
+    UWORD8 u1_top_lt_ctb_avail;
+    /**
+     *  left availability for current CTB level
+     */
+    UWORD8 u1_left_ctb_avail;
+
+}mv_ctxt_t;
+
+typedef struct
+{
+    /**
+     * Pointer to current PPS
+     */
+    pps_t *ps_pps;
+
+    /**
+     * Pointer to current SPS
+     */
+    sps_t *ps_sps;
+
+    /*
+     * Pointer to codec context
+     */
+    codec_t *ps_codec;
+
+    /**
+     * Pointer to the current Tile being parsed
+     */
+    tile_t *ps_tile;
+
+    /**
+     * Pointer to the current slice header
+     */
+    slice_header_t *ps_slice_hdr;
+
+    /**
+     *  TU count in current CTB
+     */
+    WORD32 i4_ctb_tu_cnt;
+
+    /**
+     * CTB's x position within a picture in raster scan in CTB units
+     */
+    WORD32 i4_ctb_x;
+
+    /**
+     * CTB's y position within a picture in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_y;
+
+    /**
+     * CTB's x position within a Tile in raster scan in CTB units
+     */
+    WORD32 i4_ctb_tile_x;
+
+    /**
+     * CTB's y position within a Tile in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_tile_y;
+
+    /**
+     * CTB's x position within a Slice in raster scan in CTB units
+     */
+    WORD32 i4_ctb_slice_x;
+
+    /**
+     * CTB's y position within a Slice in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_slice_y;
+
+    /**
+     * Vertical Boundary strength
+     */
+
+    /* Two bits per edge.
+    Stored in the format BS[15] | BS[14] | .. | BS[0] */
+    UWORD32 *pu4_pic_vert_bs;
+
+    /**
+     * Horizontal Boundary strength
+     */
+
+    /* Two bits per edge.
+    Stored in the format BS[15] | BS[14] | .. | BS[0] */
+    UWORD32 *pu4_pic_horz_bs;
+
+    /**
+     * Flags to indicate if QP is constant throughout a CTB - 1 bit for each CTB
+     * The bits are packed from LSB to MSB
+     * To get the flag corresponding to CTB with (ctb_x, ctb_y), use
+     *      pu1_pic_qp_const_in_ctb[(ctb_x + pic_wd_in_ctb * ctb_y) >> 3] & (1 << ((ctb_x + pic_wd_in_ctb * ctb_y) & 7))
+     */
+    UWORD8 *pu1_pic_qp_const_in_ctb;
+
+    /**
+     *  Qp array stored for each 8x8 pixels
+     */
+    UWORD8  *pu1_pic_qp;
+
+    /**
+     * Current TU structure - set to CTB tu_t pointer at the start of CTB processing and incremented
+     * for every TU
+     */
+    tu_t *ps_tu;
+
+    /**
+     * Points to an array of TU indices which is used to identify
+     * start index of tu_t in ps_pic_tu and also to identify number of
+     * TUs in the current CTB by subtracting current idx from next CTB's
+     * TU idx
+     */
+    UWORD32 *pu4_pic_tu_idx;
+
+    /**
+     * Points to an array of PU indices which is used to identify
+     * start index of pu_t in ps_pic_pu and also to identify number of
+     * PUs in the current CTB by subtracting current idx from next CTB's
+     * PU idx
+     */
+    UWORD32 *pu4_pic_pu_idx;
+
+    /**
+     * Current PU structure - set to CTB pu_t pointer at the start of CTB processing and incremented
+     * for every PU
+     */
+    pu_t *ps_pu;
+
+    /**
+     * Pointer to frame level pu_t for the current frame being parsed
+     * where MVs and Intra pred modes will be updated
+     */
+    pu_t *ps_pic_pu;
+
+    /** PU Index map per CTB. The indices in this map are w.r.t picture pu array and not
+     * w.r.t CTB pu array.
+     * This will be used during mv prediction and since neighbours will have different CTB pu map
+     * it will be easier if they all have indices w.r.t picture level PU array rather than CTB level
+     * PU array.
+     * pu1_pic_pu_map is map w.r.t CTB's pu_t array
+     */
+    UWORD32 *pu4_pic_pu_idx_map;
+
+    /**
+     * Variable to store the next ctb count to compute pu idx
+     */
+    WORD32 i4_next_pu_ctb_cnt;
+
+    /**
+     * Variable to store the next ctb count to compute tu idx
+     */
+    WORD32 i4_next_tu_ctb_cnt;
+    /**
+     * Points to the array of slice indices which is used to identify the slice
+     *  to which each CTB in a frame belongs.
+     */
+    UWORD16 *pu1_slice_idx;
+}bs_ctxt_t;
+
+typedef struct
+{
+    /**
+     * Pointer to current PPS
+     */
+    pps_t *ps_pps;
+
+    /**
+     * Pointer to current SPS
+     */
+    sps_t *ps_sps;
+
+    /*
+     * Pointer to codec context
+     */
+    codec_t *ps_codec;
+
+    /**
+     * Pointer to current slice header structure
+     */
+    slice_header_t *ps_slice_hdr;
+
+    /**
+     * Pointer to the structure that contains BS and QP frame level arrays
+     */
+    bs_ctxt_t s_bs_ctxt;
+
+    /**
+     * CTB's x position within a picture in raster scan in CTB units
+     */
+    WORD32 i4_ctb_x;
+
+    /**
+     * CTB's y position within a picture in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_y;
+
+    /**
+     * Current pictures loop filter flag map at 8x8 level
+     */
+    UWORD8 *pu1_pic_no_loop_filter_flag;
+
+    /**
+     * Current CTB's no_loop_filter_flags
+     * each element corresponds to one row - including the left CTB's last 8x8
+     */
+    UWORD16 au2_ctb_no_loop_filter_flag[9];
+
+    /*
+     * Pointer to 0th luma pixel in current pic
+     */
+    UWORD8 *pu1_cur_pic_luma;
+
+    /*
+     * Pointer to 0th chroma pixel in current pic
+     */
+    UWORD8 *pu1_cur_pic_chroma;
+
+    /* Points to the array of slice indices which is used to identify the slice
+    *  to which each CTB in a frame belongs.
+    */
+    UWORD16 *pu1_slice_idx;
+
+    /* Specifies if the chroma format is yuv420sp_vu */
+    WORD32 is_chroma_yuv420sp_vu;
+
+#ifdef GPU_BUILD
+    //TODO GPU : Later define it for ARM only version as well
+    /**
+     * Pointer to base slice header structure
+     */
+    slice_header_t *ps_slice_hdr_base;
+#endif
+}deblk_ctxt_t;
+
+typedef struct
+{
+    /**
+     * Pointer to current PPS
+     */
+    pps_t *ps_pps;
+
+    /**
+     * Pointer to current SPS
+     */
+    sps_t *ps_sps;
+
+    /* Pointer to codec context
+     *
+     */
+    codec_t *ps_codec;
+
+    /**
+     * Pointer to base slice header structure
+     */
+    slice_header_t *ps_slice_hdr_base;
+
+    /**
+     * Pointer to current slice header structure
+     */
+    slice_header_t *ps_slice_hdr;
+
+    /**
+     * Pointer to current tile structure
+     */
+    tile_t *ps_tile;
+    /**
+     * CTB's x position within a picture in raster scan in CTB units
+     */
+    WORD32 i4_ctb_x;
+
+    /**
+     * CTB's y position within a picture in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_y;
+
+    /**
+     * Current pictures loop filter flag map at 8x8 level
+     */
+    UWORD8 *pu1_pic_no_loop_filter_flag;
+
+    /*
+     * Pointer to 0th luma pixel in current pic
+     */
+    UWORD8 *pu1_cur_pic_luma;
+
+    /*
+     * Pointer to 0th chroma pixel in current pic
+     */
+    UWORD8 *pu1_cur_pic_chroma;
+
+    /**
+     * Pointer to frame level sao_t for the current frame being parsed
+     */
+    sao_t *ps_pic_sao;
+
+    /**
+     * Temporary buffer needed during SAO processing
+     */
+    UWORD8 *pu1_tmp_buf_luma;
+
+    /**
+     * Temporary buffer needed during SAO processing
+     */
+    UWORD8 *pu1_tmp_buf_chroma;
+
+    /**
+     * Left column of luma pixels - used by SAO
+     */
+    UWORD8 *pu1_sao_src_left_luma;
+
+    /**
+     * Top row of luma pixels - used by SAO
+     */
+    UWORD8 *pu1_sao_src_top_luma;
+
+    /**
+     * Left column of chroma pixels(interleaved) - used by SAO
+     */
+    UWORD8 *pu1_sao_src_left_chroma;
+
+    /**
+     * Top row of chroma pixels(interleaved) - used by SAO
+     */
+    UWORD8 *pu1_sao_src_top_chroma;
+
+    /**
+     * Top-left luma pixel - used by SAO (for the top CTB row)
+     */
+    UWORD8 *pu1_sao_src_luma_top_left_ctb;
+
+    /**
+     * Top-left chroma pixel(interleaved) - used by SAO (for the top CTB row)
+     */
+    UWORD8 *pu1_sao_src_chroma_top_left_ctb;
+
+    /**
+     * Top-left luma pixel - used by SAO (for the current  CTB row)
+     */
+    UWORD8 *pu1_sao_src_top_left_luma_curr_ctb;
+
+    /**
+     * Top-left chroma pixel(interleaved) - used by SAO (for the current CTB row)
+     */
+    UWORD8 *pu1_sao_src_top_left_chroma_curr_ctb;
+
+    /**
+     * Top-right luma pixel - used by SAO (for the top CTB row)
+     */
+    UWORD8 *pu1_sao_src_top_left_luma_top_right;
+
+    /**
+     * Top-right chroma pixel(interleaved) - used by SAO (for the top CTB row)
+     */
+    UWORD8 *pu1_sao_src_top_left_chroma_top_right;
+
+    /**
+     * Bottom-left luma pixel - used by SAO
+     */
+    UWORD8 u1_sao_src_top_left_luma_bot_left;
+    /**
+     *  Pointer to array that stores bottom left luma pixel per row(interleaved) - used by SAO
+     */
+    UWORD8 *pu1_sao_src_top_left_luma_bot_left;
+
+    /**
+     * Bottom left chroma pixel(interleaved) - used by SAO
+     */
+    UWORD8 au1_sao_src_top_left_chroma_bot_left[2];
+    /**
+     *  Pointer to array that stores bottom left chroma pixel per row(interleaved) - used by SAO
+     */
+    UWORD8 *pu1_sao_src_top_left_chroma_bot_left;
+
+    /*
+     * Slice counter in a picture.
+     */
+    UWORD32 i4_cur_slice_idx;
+    /**
+     * Points to the array of slice indices which is used to identify the slice
+     *  to which each CTB in a frame belongs.
+     */
+    UWORD16 *pu1_slice_idx;
+    /**
+     * Points to the array of tile indices which is used to identify the tile
+     *  to which each CTB in a frame belongs.
+     */
+    UWORD16 *pu1_tile_idx;
+
+    /* Specifies if the chroma format is yuv420sp_vu */
+    WORD32 is_chroma_yuv420sp_vu;
+
+}sao_ctxt_t;
+
+typedef struct
+{
+    /** Log2 CU's size */
+    WORD32 i4_log2_cb_size;
+
+    /** CU's x position */
+    WORD32 i4_pos_x;
+
+    /** CU's y position */
+    WORD32 i4_pos_y;
+    /**
+     * Transquant Bypass enable flag at CU level - To be replicated at TU level
+     */
+    WORD32 i4_cu_transquant_bypass;
+    /**
+     * Prediction mode
+     */
+    WORD32 i4_pred_mode;
+
+    /**
+     * Partition mode
+     */
+    WORD32 i4_part_mode;
+
+    /**
+     * Intra luma pred mode for current CU. In case of PART2Nx2N
+     * the first value is replicated to avoid checks later
+     */
+    WORD32 ai4_intra_luma_pred_mode[4];
+
+    /**
+     * Previous intra luma pred flag used for intra pred mode computation
+     */
+    WORD32 ai4_prev_intra_luma_pred_flag[4];
+
+    /**
+     * mpm index used in intra prediction mode computation
+     */
+    WORD32 ai4_mpm_idx[4];
+    /**
+     * Remaining intra pred mode
+     */
+    WORD32 ai4_rem_intra_luma_pred_mode[4];
+    /**
+     * Chroma pred mode index to be used to compute intra pred mode for chroma
+     */
+    WORD32 i4_intra_chroma_pred_mode_idx;
+    /**
+     * Maximum transform depth
+     */
+    WORD32 i4_max_trafo_depth;
+
+    /**
+     *  Luma CBF for current TU
+     */
+    UWORD8 i1_cbf_luma;
+
+    /**
+     * Cb CBF
+     */
+    UWORD8 ai1_cbf_cb[MAX_TRAFO_DEPTH];
+
+    /**
+     * Cr CBF
+     */
+    UWORD8 ai1_cbf_cr[MAX_TRAFO_DEPTH];
+
+    /**
+     * Intra split flag
+     */
+    WORD32 i4_intra_split_flag;
+
+    /**
+     * Current QP
+     */
+    WORD32 i4_qp;
+
+    /**
+     * Number of TUs in CU parsed before a change in QP is signaled
+     */
+    WORD32 i4_tu_cnt;
+
+    /**
+     * Cu QP delta
+     */
+    WORD32 i4_cu_qp_delta;
+
+}parse_cu_t;
+/**
+ * Structure containing a few common state variables, such as CTB positions and the current SPS and PPS ids,
+ * which are used by the parsing thread. Keeping them in a separate structure explicitly signals that these
+ * variables belong to the parsing thread's context and must not be updated by other threads
+ */
+typedef struct
+{
+    /**
+     * CTB's x position within a picture in raster scan in CTB units
+     */
+    WORD32 i4_ctb_x;
+
+    /**
+     * CTB's y position within a picture in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_y;
+
+    /**
+     * CTB's x position within a Tile in raster scan in CTB units
+     */
+    WORD32 i4_ctb_tile_x;
+
+    /**
+     * CTB's y position within a Tile in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_tile_y;
+
+    /**
+     * CTB's x position within a Slice in raster scan in CTB units
+     */
+    WORD32 i4_ctb_slice_x;
+
+    /**
+     * CTB's y position within a Slice in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_slice_y;
+
+    /**
+     * Pointer to the current Tile being parsed
+     */
+    tile_t *ps_tile;
+
+    /**
+     * Current slice idx - Used in multi-core cases to ensure slice header is
+     * preserved till the last CB of the slice is decoded
+     */
+    WORD32 i4_cur_slice_idx;
+    /**
+     * Current independent slice idx - Used in multi-core cases to ensure slice header is
+     * preserved till the last CB of the slice is decoded
+     */
+    WORD32 i4_cur_independent_slice_idx;
+
+    /**
+     * Current tile idx - Used in multi-core cases to track the tile to
+     * which the CTBs being parsed belong
+     */
+    WORD32 i4_cur_tile_idx;
+
+    /**
+     * Pointer to current PPS
+     */
+    pps_t *ps_pps;
+
+    /**
+     * Pointer to current SPS
+     */
+    sps_t *ps_sps;
+
+    /**
+     * Signal that pic_init is called first time
+     */
+    WORD32 i4_first_pic_init;
+
+    /**
+     * Flag to indicate if CU QP delta is coded.
+     * By default it is set to 0 at the beginning of coding quad tree
+     */
+    WORD32 i4_is_cu_qp_delta_coded;
+
+    /**
+     * CU Qp delta
+     * By default it is set to 0 at the beginning of coding quad tree
+     */
+    WORD32 i4_cu_qp_delta;
+
+    /**
+     * Bitstream structure
+     */
+    bitstrm_t s_bitstrm;
+
+    /**
+     * Pointer frame level TU subblock coeff data
+     */
+    void *pv_pic_tu_coeff_data;
+
+    /**
+     * Pointer to TU subblock coeff data and number of coded subblocks and scan idx
+     * Incremented each time a coded subblock is parsed
+     *
+     */
+    void *pv_tu_coeff_data;
+
+    /**
+     * Current TU structure - set to CTB tu_t pointer at the start of CTB parsing and incremented
+     * for every TU
+     */
+    tu_t *ps_tu;
+
+    /**
+     * Current ctb's TU map
+     */
+    UWORD8 *pu1_tu_map;
+
+    /**
+     * Current PU structure - set to CTB pu_t pointer at the start of CTB parsing and incremented
+     * for every PU
+     */
+    pu_t *ps_pu;
+
+    /**
+     * Points to the array of slice indices which is used to identify the independent slice
+     *  to which each CTB in a frame belongs.
+     */
+    UWORD16 *pu1_slice_idx;
+
+    /**
+     * Current PU index in a frame
+     */
+    WORD32 i4_pic_pu_idx;
+
+    /**
+     * Current TU index in a frame
+     */
+    WORD32 i4_pic_tu_idx;
+
+    /**
+     * Current PU map - set to CTB pu_map pointer at the start of CTB parsing
+     */
+    UWORD8 *pu1_pu_map;
+
+    /**
+     * Current QP
+     */
+    WORD32 u4_qp;
+
+    /**
+     * Current Group's QP
+     */
+    WORD32 u4_qpg;
+
+    /**
+     * Number of PCM blocks in current CTB - Needed only during parsing
+     * If needed during recon then move it to ctb_t
+     */
+    WORD32 i4_ctb_num_pcm_blks;
+
+    /**
+     * PCM flag for the current CU
+     */
+    WORD32 i4_cu_pcm_flag;
+
+    /**
+     * CU related information to be used to populate tu_t and pu_t during
+     * pred unit and transform tree parsing.
+     */
+    parse_cu_t s_cu;
+
+    /**
+     * Pointer to pu_map for the current frame being parsed
+     */
+    UWORD8 *pu1_pic_pu_map;
+
+    /**
+     * Pointer to frame level pu_t for the current frame being parsed
+     * where MVs and Intra pred modes will be updated
+     */
+    pu_t *ps_pic_pu;
+
+    /**
+     * Pointer to tu_map for the current frame being parsed
+     */
+    UWORD8 *pu1_pic_tu_map;
+
+    /**
+     * Pointer to frame level tu_t for the current frame being parsed
+     * where transform unit related info will be updated
+     */
+    tu_t *ps_pic_tu;
+
+    /**
+     * Points to an array of TU indices which is used to identify
+     * start index of tu_t in ps_pic_tu and also to identify number of
+     * TUs in the current CTB by subtracting current idx from next CTB's
+     * TU idx
+     */
+    UWORD32 *pu4_pic_tu_idx;
+
+    /**
+     * Points to an array of PU indices which is used to identify
+     * start index of pu_t in ps_pic_pu and also to identify number of
+     * PUs in the current CTB by subtracting current idx from next CTB's
+     * PU idx
+     */
+    UWORD32 *pu4_pic_pu_idx;
+
+
+    /**
+     * Current pictures intra mode map at 8x8 level
+     */
+    UWORD8 *pu1_pic_intra_flag;
+
+    /**
+     * Current pictures loop filter flag map at 8x8 level
+     */
+    UWORD8 *pu1_pic_no_loop_filter_flag;
+
+    /**
+     * Array to hold one row (top) of skip_flags stored at (8x8) level
+     * 1 bit per (8x8)
+     * read and written as a UWORD32
+     * LSB gives skip_flag for 0th 8x8 and MSB gives skip_flag for 31st 8x8 and so on
+     * This is independent of CTB size or minCU size
+     * Packed format requires extra calculations in extracting required bits but makes it easier
+     * to store skip data for larger sizes such as 32 x 32 where 4 bits need to be set instead of
+     * 4 bytes or for 64 x 64 where 8 bits need to be set instead of 8 bytes.
+     */
+    UWORD32 *pu4_skip_cu_top;
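+    /* e.g. the top skip flag of the 8x8 column 'x8' (in 8x8 units) can be
+     * tested as below, consistent with the LSB-first packing described above
+     * (illustrative sketch; the decoder's extraction code lives elsewhere):
+     *     skip = (pu4_skip_cu_top[x8 >> 5] >> (x8 & 31)) & 1;
+     */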
+
+    /**
+     * Array to hold one 64 pixel column (left) of skip_flags stored at (8x8) level
+     * 1 bit per (8x8)
+     * read and written as a UWORD32
+     * LSB gives skip_flag for 0th 8x8 and MSB gives skip for 31st 8x8 and so on
+     * This is independent of CTB size and allocated to store data for 64 pixels, of
+     * this only first ctb_size number of bits (starting from MSB) will have valid data
+     * This is also independent of min CU size and data is stored at 8x8 level.
+     * Since only 8 bits are needed to represent left 64 pixels at 8x8 level, this is not an array
+     */
+    UWORD32 u4_skip_cu_left;
+
+    /**
+     * Array to hold one row (top) of coding_tree_depth stored at (8x8) level
+     * 2 bits per (8x8) pixels
+     * read and written as a WORD32
+     * 2 LSBits give coding_tree_depth for 0th 8x8 and 2 MSBits give coding_tree_depth for 15th 8x8 and so on
+     * This is independent of CTB size or minCU size
+     */
+    UWORD32 *pu4_ct_depth_top;
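+    /* e.g. the coding tree depth of the 8x8 column 'x8' can be extracted as
+     * a 2-bit field, consistent with the packing described above
+     * (illustrative sketch; the decoder's extraction code lives elsewhere):
+     *     depth = (pu4_ct_depth_top[x8 >> 4] >> ((x8 & 15) * 2)) & 3;
+     */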
+
+    /**
+     * Array to hold one 64 pixel column (left) of coding_tree_depth stored at (8x8) level
+     * 2 bits per (8x8) pixels
+     * read and written as a WORD32
+     * 2 LSBits give coding_tree_depth for 0th 8x8 and 2 MSBits give coding_tree_depth for 15th 8x8 and so on
+     * This is independent of CTB size and allocated to store data for 64 pixels, of
+     * this only first ctb_size * 2 number of bits (starting from MSB) will have valid data
+     * This is also independent of min CU size and data is stored at 8x8 level.
+     * Since only 16 bits are needed to represent left 64 pixels at 8x8 level, this is not an array
+     */
+    UWORD32 u4_ct_depth_left;
+
+    /**
+     * Array to hold top (one row) luma_intra_pred_mode stored at (4x4) level for a CTB
+     * 8 bits per (4x4) pixels
+     * read and written as a UWORD8
+     * This is independent of CTB size or minCU size and is allocated to store data for 64 pixels,
+     * i.e. 16 entries of one byte each
+     * Note this data is used only within a CTB; there are no inter-CTB dependencies for this
+     */
+    UWORD8 *pu1_luma_intra_pred_mode_top;
+
+    /**
+     * Array to hold  left (one column) luma_intra_pred_mode stored at (4x4) level for a CTB
+     * 8 bits per (4x4) pixels
+     * read and written as a UWORD8
+     * This is independent of CTB size and allocated to store data for 64 pixels i.e. 64 bits is the size
+     * This is also independent of min CU size and data is stored at 8x8 level.
+     * This is used for prediction of next CTB within a row in a slice or tile
+     */
+    UWORD8 *pu1_luma_intra_pred_mode_left;
+
+
+    /**
+     * Pointer to base of Video parameter set structure array
+     */
+    vps_t *ps_vps_base;
+
+    /**
+     * Pointer to base of Sequence parameter set structure array
+     */
+    sps_t *ps_sps_base;
+
+    /**
+     * Pointer to base of Picture parameter set structure array
+     */
+    pps_t *ps_pps_base;
+
+    /**
+     * Pointer to base of slice header structure array
+     */
+    slice_header_t *ps_slice_hdr_base;
+
+    /**
+     * Pointer to current slice header structure
+     */
+    slice_header_t *ps_slice_hdr;
+
+
+    /**
+     * Error code during parse stage
+     */
+    WORD32 i4_error_code;
+
+    /**
+     * Void pointer to process job context
+     */
+    void *pv_proc_jobq;
+
+    /* Cabac context */
+    cab_ctxt_t s_cabac;
+
+    /* Current Coding tree depth */
+    WORD32 i4_ct_depth;
+
+    /** Flag to signal end of frame */
+    WORD32 i4_end_of_frame;
+
+    /**
+     * Index of the next CTB parsed
+     */
+    WORD32 i4_next_ctb_indx;
+
+    /**
+     * Pointer to the structure that contains BS and QP frame level arrays
+     */
+    bs_ctxt_t s_bs_ctxt;
+
+    /**
+     * Pointer to the structure that contains deblock context
+     */
+    deblk_ctxt_t s_deblk_ctxt;
+
+    /**
+     * Pointer to the structure that contains sao context
+     */
+    sao_ctxt_t s_sao_ctxt;
+
+    /**
+     * QP Array for the current CTB
+     * Used in QP prediction
+     */
+    WORD8 ai1_8x8_cu_qp[MAX_CU_IN_CTB];
+
+
+    /**
+     * Pointer to frame level sao_t for the current frame being parsed
+     */
+    sao_t *ps_pic_sao;
+
+    /**
+     * Abs POC count of the frame
+     */
+    WORD32 i4_abs_pic_order_cnt;
+
+    /**
+     * Pointer points to mv_buffer of current frame
+     */
+    mv_buf_t *ps_cur_mv_buf;
+
+    /**
+     * Variable to store the next ctb count to compute pu idx
+     */
+    WORD32 i4_next_pu_ctb_cnt;
+
+    /**
+     * Variable to store the next ctb count to compute tu idx
+     */
+    WORD32 i4_next_tu_ctb_cnt;
+
+
+}parse_ctxt_t;
+
+/**
+ * Pixel processing thread context
+ */
+
+typedef struct
+{
+    /* Pointer to codec context
+     *
+     */
+    codec_t *ps_codec;
+
+    /**
+     * CTB's x position within a picture in raster scan in CTB units
+     */
+    WORD32 i4_ctb_x;
+
+    /**
+     * CTB's y position within a picture in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_y;
+
+    /**
+     * CTB's x position within a Tile in raster scan in CTB units
+     */
+    WORD32 i4_ctb_tile_x;
+
+    /**
+     * CTB's y position within a Tile in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_tile_y;
+
+    /**
+     * CTB's x position within a Slice in raster scan in CTB units
+     */
+    WORD32 i4_ctb_slice_x;
+
+    /**
+     * CTB's y position within a Slice in raster scan in CTB units
+     */
+
+    WORD32 i4_ctb_slice_y;
+
+    /**
+     * Current tile being processed
+     */
+    tile_t *ps_tile;
+
+    /**
+     * Current slice idx - Used in multi-core cases to store slice index for
+     * each ctb for sao filtering.
+     */
+    WORD32 i4_cur_slice_idx;
+
+    /**
+     * Current tile idx - Used in multi-core cases to store tile index for
+     * each ctb for sao filtering.
+     */
+    WORD32 i4_cur_tile_idx;
+    /**
+     * Pointer to current PPS
+     */
+    pps_t *ps_pps;
+
+    /**
+     * Pointer to current SPS
+     */
+    sps_t *ps_sps;
+
+    /**
+     * Pointer to current slice header structure
+     */
+    slice_header_t *ps_slice_hdr;
+
+    /**
+     * Error code during parse stage
+     */
+    WORD32 i4_error_code;
+
+    /**
+     * Signal that pic_init is called first time
+     */
+    WORD32 i4_first_pic_init;
+
+    /**
+     * Pointer frame level TU subblock coeff data
+     */
+    void *pv_pic_tu_coeff_data;
+
+    /**
+     * Pointer to TU subblock coeff data and number of subblocks and scan idx
+     * Incremented each time a coded subblock is processed
+     *
+     */
+    void *pv_tu_coeff_data;
+
+    /**
+     * Current TU structure - set to CTB tu_t pointer at the start of CTB processing and incremented
+     * for every TU
+     */
+    tu_t *ps_tu;
+
+    /**
+     * Current ctb's TU map
+     */
+    UWORD8 *pu1_tu_map;
+
+    /**
+     * Current PU structure - set to CTB pu_t pointer at the start of CTB processing and incremented
+     * for every PU
+     */
+    pu_t *ps_pu;
+
+    /**
+     * Points to an array of TU indices which is used to identify
+     * start index of tu_t in ps_pic_tu and also to identify number of
+     * TUs in the current CTB by subtracting current idx from next CTB's
+     * TU idx
+     */
+    UWORD32 *pu4_pic_tu_idx;
+
+    /**
+     * Points to an array of PU indices which is used to identify
+     * start index of pu_t in ps_pic_pu and also to identify number of
+     * PUs in the current CTB by subtracting current idx from next CTB's
+     * PU idx
+     */
+    UWORD32 *pu4_pic_pu_idx;
+
+    /**
+     * Pointer to tu_map for the current frame being parsed
+     */
+    UWORD8 *pu1_pic_tu_map;
+
+    /**
+      * Pointer to pu_map for the current frame being parsed
+      * where MVs and Intra pred modes will be updated
+      */
+    UWORD8 *pu1_pic_pu_map;
+
+    /**
+     * Pointer to frame level pu_t for the current frame being parsed
+     * where MVs and Intra pred modes will be updated
+     */
+    pu_t *ps_pic_pu;
+
+    /** PU Index map per CTB. The indices in this map are w.r.t picture pu array and not
+     * w.r.t CTB pu array.
+     * This will be used during mv prediction and since neighbours will have different CTB pu map
+     * it will be easier if they all have indices w.r.t picture level PU array rather than CTB level
+     * PU array.
+     * pu1_pic_pu_map is map w.r.t CTB's pu_t array
+     */
+    UWORD32 *pu4_pic_pu_idx_map;
+
+    /**
+     * PU Index of top 4x4 neighbors stored for an entire row
+     */
+    UWORD32 *pu4_pic_pu_idx_top;
+
+    /**
+     * PU Index of left 4x4 neighbors stored for 64 pixels
+     */
+    UWORD32 *pu4_pic_pu_idx_left;
+
+    /**
+     * Holds top left PU index at CTB level - top left gets overwritten
+     * by left CTB while updating top array. Before updating top at CTB
+     * level, the required top-left index is backed up in this variable
+     */
+    UWORD32 u4_ctb_top_left_pu_idx;
+
+    /**
+     * Pointer to frame level tu_t for the current frame being parsed
+     * where transform unit related info will be updated
+     */
+    tu_t *ps_pic_tu;
+
+
+    /**
+    * Current PU map - set to CTB pu_map pointer at the start of CTB parsing
+    */
+    UWORD8 *pu1_pu_map;
+
+    /** Current MV Bank's buffer ID */
+    WORD32 i4_cur_mv_bank_buf_id;
+
+    /**
+     * Current pictures intra mode map at 8x8 level
+     */
+    UWORD8 *pu1_pic_intra_flag;
+
+    /**
+     * Current pictures loop filter flag map at 8x8 level
+     */
+    UWORD8 *pu1_pic_no_loop_filter_flag;
+
+    /**
+     * Void pointer to process job context
+     */
+
+    void *pv_proc_jobq;
+
+    /**
+     * Number of CTBs to be processed in the current Job
+     */
+    WORD32 i4_ctb_cnt;
+    /**
+     * ID for the current context - Used for debugging
+     */
+    WORD32 i4_id;
+
+    /**
+     * Flag to indicate if parsing status has to be checked
+     * Needed when parsing and processing are done in different threads
+     */
+    WORD32 i4_check_parse_status;
+
+    /**
+     * Flag to indicate if processing status of top row CTBs has to be checked
+     * Needed when processing of different rows is done in different threads
+     */
+    WORD32 i4_check_proc_status;
+
+    /**
+     * Holds Intra dequantization matrices
+     */
+    WORD16 *api2_dequant_intra_matrix[4];
+
+    /**
+     * Holds Inter dequantization matrices
+     */
+    WORD16 *api2_dequant_inter_matrix[4];
+
+
+    /**
+     * Temporary buffer 1 - Used as a scratch in inter_pred_ctb()
+     */
+    WORD16 *pi2_inter_pred_tmp_buf1;
+
+    /**
+     * Temporary buffer 2 - Used as a scratch in inter_pred_ctb()
+     */
+    WORD16 *pi2_inter_pred_tmp_buf2;
+
+    /**
+     * Temporary buffer 3 - Used as a scratch in inter_pred_ctb()
+     */
+    WORD16 *pi2_inter_pred_tmp_buf3;
+
+    /**
+     * The above temporary buffers' stride
+     */
+    WORD32 i4_inter_pred_tmp_buf_strd;
+    /**
+     * Picture stride
+     * Used as prediction stride, destination stride while computing inverse transform
+     */
+    WORD32 i4_pic_strd;
+
+    /**
+     * Picture qp offset for U
+     */
+    WORD8 i1_pic_cb_qp_offset;
+
+    /**
+     * Slice qp offset for U
+     */
+    WORD32 i1_slice_cb_qp_offset;
+
+    /**
+     * Picture qp offset for V
+     */
+    WORD8 i1_pic_cr_qp_offset;
+
+    /**
+     * Slice qp offset for V
+     */
+    WORD32 i1_slice_cr_qp_offset;
+
+    /** Pointer to current picture buffer structure */
+    pic_buf_t *ps_cur_pic;
+
+    /** Current pic_buf's picture buffer id */
+    WORD32 i4_cur_pic_buf_id;
+
+    /** Pointer to 0th luma pixel in current pic */
+    UWORD8 *pu1_cur_pic_luma;
+
+    /** Pointer to 0th chroma pixel in current pic */
+    UWORD8 *pu1_cur_pic_chroma;
+
+    /** Intermediate buffer to be used during inverse transform */
+    WORD16 *pi2_itrans_intrmd_buf;
+
+    /** Buffer to hold output of inverse scan */
+    WORD16 *pi2_invscan_out;
+
+    /**
+     *  Top availability for current CTB level
+     */
+    UWORD8 u1_top_ctb_avail;
+
+    /**
+     *  Top right availability for current CTB level
+     */
+    UWORD8 u1_top_rt_ctb_avail;
+    /**
+     *  Top left availability for current CTB level
+     */
+    UWORD8 u1_top_lt_ctb_avail;
+    /**
+     *  left availability for current CTB level
+     */
+    UWORD8 u1_left_ctb_avail;
+    /**
+     *  TU count in current CTB
+     */
+    WORD32 i4_ctb_tu_cnt;
+
+    /**
+     *  Recon pointer to current CTB luma
+     */
+    UWORD8 *pu1_cur_ctb_luma;
+    /**
+     *  Recon pointer to current CTB chroma
+     */
+    UWORD8 *pu1_cur_ctb_chroma;
+
+    /**
+     *  PU count in current CTB
+     */
+    WORD32 i4_ctb_pu_cnt;
+
+    /**
+     *  Index of the first PU of the current CTB in the picture PU array
+     */
+    WORD32 i4_ctb_start_pu_idx;
+
+    /* Pointer to a structure describing output display buffer */
+    ivd_out_bufdesc_t *ps_out_buffer;
+
+    /** Flag to indicate if ps_proc was initialized at least once in a frame.
+     * This is needed to handle cases where a core starts to handle format conversion jobs directly
+     */
+    WORD32 i4_init_done;
+
+    /**
+     * Pointer to the structure that contains BS and QP frame level arrays
+     */
+    bs_ctxt_t s_bs_ctxt;
+
+    /**
+     * Pointer to the structure that contains deblock context
+     */
+    deblk_ctxt_t s_deblk_ctxt;
+
+    /**
+     * Pointer to the structure that contains sao context
+     */
+    sao_ctxt_t s_sao_ctxt;
+
+    /**
+     * Points to the array of slice indices which is used to identify the independent
+     * slice to which each CTB in a frame belongs.
+     */
+    UWORD16 *pu1_slice_idx;
+
+    /**
+     * Points to the array of tile indices which is used to identify the tile
+     * to which each CTB in a frame belongs.
+     */
+    UWORD16 *pu1_tile_idx;
+    /**
+     * Variable to store the next ctb count to compute pu idx
+     */
+    WORD32 i4_next_pu_ctb_cnt;
+
+    /**
+     * Variable to store the next ctb count to compute tu idx
+     */
+    WORD32 i4_next_tu_ctb_cnt;
+#ifdef GPU_BUILD
+    //TODO GPU : Later define it for ARM only version as well
+    /** Process status: one byte per CTB */
+    UWORD8 *pu1_proc_map;
+#endif
+#ifdef GPU_BUILD
+    UWORD32 u4_gpu_inter_flag;
+#endif
+#ifdef GPU_BUILD
+    //TODO GPU : Later define it for ARM only version as well
+    /**
+     * Pointer to base slice header structure
+     */
+    slice_header_t *ps_slice_hdr_base;
+#endif
+    /**
+     * Number of ctb's to process in one loop
+     */
+    WORD32 i4_nctb;
+}process_ctxt_t;
+#ifdef GPU_BUILD
+typedef struct
+{
+    /** Pointer to private GPU memory */
+    void *pv_gpu_priv;
+
+    /**
+     * Array that contains the number of CTBs in each grain of the frame.
+     * Right now the maximum number of grains in a frame is hardcoded to 16.
+     */
+    WORD32 ai4_ctbs_in_grain[16];
+
+    /**
+     * Array that contains the height of each grain of the frame in CTBs.
+     * Right now the maximum number of grains in a frame is hardcoded to 16.
+     */
+    WORD32 ai4_grain_ht_in_ctb[16];
+
+    /**
+     * Array that contains the Y position of each grain in the current
+     * frame in CTB units
+     */
+    WORD32 ai4_grain_pos_y[16];
+
+    /**
+     * Variables to store the maximum extent of motion vectors for the current grain.
+     */
+    //WORD32 i4_max_pu_y;
+    //WORD32 i4_max_pu_x;
+
+    /**
+     * Parameter that holds current grain index.
+     */
+    WORD32 i4_curr_grain_idx;
+
+    /**
+     * Array to store coefficient offsets for each CTB row.
+     * Currently allocated for a frame width of 4096 (CTB size 16 * 256).
+     */
+    WORD32 ai4_tu_coeff_data_ofst[256];
+
+    /**
+     * Array to store the slice id at the beginning of each CTB row.
+     * Currently allocated for a frame width of 4096 (CTB size 16 * 256).
+     */
+    WORD32 ai4_cur_slice_idx[256];
+
+    /**
+     * Variable to keep track of the number of CTBs parsed in the current frame grain.
+     */
+    WORD32 i4_curr_grain_ctb_cnt;
+}gpu_ctxt_t;
+#endif
+
+typedef void (*pf_inter_pred)(void *,
+                              void *,
+                              WORD32,
+                              WORD32,
+                              WORD8 *,
+                              WORD32,
+                              WORD32);
+
+
+typedef void (*pf_intra_pred)(UWORD8 *pu1_ref,
+                              WORD32 src_strd,
+                              UWORD8 *pu1_dst,
+                              WORD32 dst_strd,
+                              WORD32 nt,
+                              WORD32 mode);
+
+typedef void (*pf_itrans_recon)(WORD16 *pi2_src,
+                                WORD16 *pi2_tmp,
+                                UWORD8 *pu1_pred,
+                                UWORD8 *pu1_dst,
+                                WORD32 src_strd,
+                                WORD32 pred_strd,
+                                WORD32 dst_strd,
+                                WORD32 zero_cols,
+                                WORD32 zero_rows);
+
+typedef void (*pf_recon)(WORD16 *pi2_src,
+                         UWORD8 *pu1_pred,
+                         UWORD8 *pu1_dst,
+                         WORD32 src_strd,
+                         WORD32 pred_strd,
+                         WORD32 dst_strd,
+                         WORD32 zero_cols);
+
+typedef void (*pf_itrans_recon_dc)(UWORD8 *pu1_pred,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 pred_strd,
+                                   WORD32 dst_strd,
+                                   WORD32 log2_trans_size,
+                                   WORD16 i2_coeff_value);
+
+
+typedef void (*pf_sao_luma)(UWORD8 *,
+                            WORD32,
+                            UWORD8 *,
+                            UWORD8 *,
+                            UWORD8 *,
+                            UWORD8 *,
+                            UWORD8 *,
+                            UWORD8 *,
+                            WORD8 *,
+                            WORD32,
+                            WORD32);
+
+typedef void (*pf_sao_chroma)(UWORD8 *,
+                              WORD32,
+                              UWORD8 *,
+                              UWORD8 *,
+                              UWORD8 *,
+                              UWORD8 *,
+                              UWORD8 *,
+                              UWORD8 *,
+                              WORD8 *,
+                              WORD8 *,
+                              WORD32,
+                              WORD32);
+
+/**
+ * Codec context
+ */
+
+struct _codec_t
+{
+    /**
+     * Max width the codec can support
+     */
+    WORD32 i4_max_wd;
+
+    /**
+     * Max height the codec can support
+     */
+    WORD32 i4_max_ht;
+
+    /**
+     * Width : pic_width_in_luma_samples
+     */
+    WORD32 i4_wd;
+
+    /**
+     * Height : pic_height_in_luma_samples
+     */
+    WORD32 i4_ht;
+
+    /**
+     * Display width after cropping
+     */
+    WORD32 i4_disp_wd;
+
+    /**
+     * Display height after cropping
+     */
+    WORD32 i4_disp_ht;
+
+    /**
+     * Display stride
+     */
+    WORD32 i4_disp_strd;
+
+    /**
+     * Stride of reference buffers.
+     * For shared mode even display buffer will use the same stride
+     */
+    WORD32 i4_strd;
+
+    /**
+     * Level specified during init
+     */
+    WORD32 i4_init_level;
+
+    /**
+     * number of reference frames specified during init
+     */
+    WORD32 i4_init_num_ref;
+
+    /**
+     * number of reorder frames specified during init
+     */
+    WORD32 i4_init_num_reorder;
+
+    /**
+     * Number of extra display buffers allocated by application
+     */
+    WORD32 i4_init_num_extra_disp_buf;
+
+    /**
+     * Number of cores to be used
+     */
+    WORD32 i4_num_cores;
+
+    /**
+     * RASL output flag
+     */
+    WORD32 i4_rasl_output_flag;
+
+    /**
+     * Pictures that are degraded
+     * 0 : No degrade
+     * 1 : Only on non-reference frames
+     * 2 : Use interval specified by i4_nondegrade_interval
+     * 3 : All non-key frames
+     * 4 : All frames
+     */
+    WORD32 i4_degrade_pics;
+
+    /**
+     * Interval for pictures which are completely decoded without any degradation
+     */
+    WORD32 i4_nondegrade_interval;
+
+    /**
+     * Bit position (lsb is zero): Type of degradation
+     * 0 : Disable SAO
+     * 1 : Disable deblocking
+     * 2 : Faster inter prediction filters
+     * 3 : Fastest inter prediction filters
+     */
+    WORD32 i4_degrade_type;
+
+    /** Degrade pic count, used to maintain the interval between non-degraded pics */
+    WORD32 i4_degrade_pic_cnt;
+
+    /**
+     * Total number of display buffers to be used
+     * In case of shared mode, this will be number of reference frames
+     */
+    WORD32 i4_num_disp_bufs;
+
+    /**
+     * Flag to enable shared display buffer mode
+     */
+    WORD32 i4_share_disp_buf;
+
+    /**
+     * Chroma format of display buffers.
+     * In shared mode only 420SP_UV and 420SP_VU are supported
+     */
+    IV_COLOR_FORMAT_T e_chroma_fmt;
+
+    /**
+     * Chroma format of reference buffers.
+     * In non-shared mode it will be 420SP_UV
+     * In shared mode only 420SP_UV and 420SP_VU are supported
+     */
+    IV_COLOR_FORMAT_T e_ref_chroma_fmt;
+
+    /**
+     * Frame skip mode
+     */
+    IVD_FRAME_SKIP_MODE_T e_pic_skip_mode;
+
+    /**
+     * Display or decode order dump of output
+     */
+    IVD_DISPLAY_FRAME_OUT_MODE_T e_pic_out_order;
+
+    /**
+     * Coding type of the picture that is decoded
+     */
+    IV_PICTURE_CODING_TYPE_T e_dec_pic_type;
+
+    /**
+     * Flag to signal if a frame was decoded in this call
+     */
+    WORD32 i4_pic_decoded;
+
+    /**
+     * Flag to signal if picture data is present in the current input bitstream
+     */
+    WORD32 i4_pic_present;
+
+    /**
+     * Flag to disable deblocking of a frame
+     */
+    WORD32 i4_disable_deblk_pic;
+
+    /**
+     * Flag to disable sao of a frame
+     */
+    WORD32 i4_disable_sao_pic;
+
+    /**
+     * Flag to use full pel MC
+     */
+    WORD32 i4_fullpel_inter_pred;
+    /**
+     * Flush mode
+     */
+    WORD32 i4_flush_mode;
+
+    /**
+     * Decode header mode
+     */
+    WORD32 i4_header_mode;
+
+    /**
+     * Header in slice mode
+     */
+    WORD32 i4_header_in_slice_mode;
+
+    /**
+     * Flag to signal sps done
+     */
+    WORD32 i4_sps_done;
+
+    /**
+     * Flag to signal pps done
+     */
+    WORD32 i4_pps_done;
+
+    /**
+     * To signal successful completion of init
+     */
+    WORD32 i4_init_done;
+
+    /**
+     * To signal that at least one picture was decoded
+     */
+    WORD32 i4_first_pic_done;
+
+    /**
+     * To signal error in slice
+     */
+    WORD32 i4_slice_error;
+
+    /**
+     * Reset flag - Codec is reset if this flag is set
+     */
+    WORD32 i4_reset_flag;
+
+    /**
+     * Number of pictures decoded till now
+     */
+    UWORD32 u4_pic_cnt;
+
+    /**
+     * Number of pictures displayed till now
+     */
+    UWORD32 u4_disp_cnt;
+
+    /**
+     * Current error code
+     */
+    WORD32 i4_error_code;
+
+    /**
+     * Pointer to input bitstream. This is incremented every time a NAL is processed
+     */
+    UWORD8 *pu1_inp_bitsbuf;
+
+    /**
+     * Offset to first byte after the start code in current NAL
+     */
+    WORD32 i4_nal_ofst;
+
+    /**
+     * Length of the NAL unit including the emulation bytes
+     */
+    WORD32 i4_nal_len;
+
+    /**
+     * Number of emulation prevention bytes present in the current NAL
+     */
+    WORD32 i4_num_emln_bytes;
+
+    /**
+     * Number of bytes remaining in the input bitstream.
+     * Decremented every time a NAL is processed
+     */
+    WORD32 i4_bytes_remaining;
+
+    /**
+     * Pointer to bitstream after emulation prevention
+     */
+    UWORD8 *pu1_bitsbuf;
+
+    /**
+     * Size of intermediate bitstream buffer
+     */
+    UWORD32 u4_bitsbuf_size;
+
+    /**
+     * Pointer to hold TU data for a set of CTBs or a picture
+     */
+#ifndef GPU_BUILD
+    void *pv_tu_data;
+#else
+    void *apv_tu_data[2];
+#endif
+    /**
+     * Holds mem records passed during init.
+     * This will be used to return the mem records during retrieve call
+     */
+    iv_mem_rec_t *ps_mem_rec_backup;
+
+    /**
+     * Process Job queue buffer base
+     */
+    void *pv_proc_jobq_buf;
+
+    /**
+     * Process Job Queue mem tab size
+     */
+    WORD32 i4_proc_jobq_buf_size;
+
+    /** Parse status: one byte per CTB */
+    UWORD8 *pu1_parse_map;
+
+    /** Process status: one byte per CTB */
+#ifndef GPU_BUILD
+    UWORD8 *pu1_proc_map;
+#else
+    UWORD8 *apu1_proc_map[2];
+#endif
+    /**
+     * Current picture's intra mode map at 8x8 level
+     */
+#ifndef GPU_BUILD
+    UWORD8 *pu1_pic_intra_flag;
+#else
+    UWORD8 *apu1_pic_intra_flag[2];
+#endif
+    /**
+     * Current picture's loop filter flag map at 8x8 level
+     */
+#ifndef GPU_BUILD
+    UWORD8 *pu1_pic_no_loop_filter_flag;
+#else
+    UWORD8 *apu1_pic_no_loop_filter_flag[2];
+#endif
+    /**
+     * MV Bank buffer manager
+     */
+    void *pv_mv_buf_mgr;
+
+    /**
+     * Pointer to MV Buf structure array
+     */
+    void *ps_mv_buf;
+
+    /**
+     * Base address for Motion Vector bank buffer
+     */
+    void *pv_mv_bank_buf_base;
+
+    /**
+     * MV Bank size allocated
+     */
+    WORD32 i4_total_mv_bank_size;
+
+    /**
+     * Picture buffer manager
+     */
+    void *pv_pic_buf_mgr;
+
+    /**
+     * Pointer to Pic Buf structure array
+     */
+    void *ps_pic_buf;
+
+    /**
+     * Base address for Picture buffer
+     */
+    void *pv_pic_buf_base;
+
+    /**
+     * Total pic buffer size allocated
+     */
+    WORD32 i4_total_pic_buf_size;
+
+
+    /**
+     * Display buffer manager
+     */
+    void *pv_disp_buf_mgr;
+
+    /**
+     * Current display buffer's buffer ID
+     */
+    WORD32 i4_disp_buf_id;
+
+    /**
+     * Current display buffer
+     */
+    pic_buf_t *ps_disp_buf;
+
+    /**
+     * Pointer to dpb manager structure
+     */
+    void *pv_dpb_mgr;
+
+    /**
+     * Scaling matrices for each PPS
+     */
+    WORD16 *pi2_scaling_mat;
+
+    /**
+     * Array containing Tile information for each PPS
+     */
+    tile_t *ps_tile;
+
+    /**
+     * Timestamp associated with the current display output
+     */
+    UWORD32 u4_ts;
+
+    /**
+     * Pointer to base of Video parameter set structure array
+     */
+    vps_t *ps_vps_base;
+
+    /**
+     * Pointer to base of Sequence parameter set structure array
+     */
+    sps_t *ps_sps_base;
+
+    /**
+     * Pointer to base of Picture parameter set structure array
+     */
+    pps_t *ps_pps_base;
+
+    /**
+     * Pointer to base of slice header structure array
+     */
+#ifndef GPU_BUILD
+    slice_header_t *ps_slice_hdr_base;
+#else
+    slice_header_t *aps_slice_hdr_base[2];
+#endif
+    /**
+     * Pointer to base of entry point offsets in a frame
+     */
+    WORD32 *pi4_entry_ofst;
+
+    /**
+     * Current offset in pi4_entry_ofst
+     */
+    WORD32 i4_cur_entry_ofst;
+
+    /**
+     *  Parsing context
+     */
+    parse_ctxt_t s_parse;
+
+    /**
+     * Processing context - One for each processing thread
+     */
+    process_ctxt_t as_process[MAX_PROCESS_THREADS];
+
+    /**
+     * Thread handle for each of the processing threads
+     */
+    void *apv_process_thread_handle[MAX_PROCESS_THREADS];
+
+    /**
+     * Thread created flag for each of the processing threads
+     */
+    WORD32 ai4_process_thread_created[MAX_PROCESS_THREADS];
+
+    /**
+     * Void pointer to process job context
+     */
+    void *pv_proc_jobq;
+
+    /* Number of CTBs processed together for better instruction cache handling */
+    WORD32 i4_proc_nctb;
+
+    /**
+     * Previous POC lsb
+     */
+    WORD32 i4_prev_poc_lsb;
+
+    /**
+     * Previous POC msb
+     */
+    WORD32 i4_prev_poc_msb;
+
+    /**
+     * Max POC lsb that has arrived till now
+     */
+    WORD32 i4_max_prev_poc_lsb;
+
+    /** Context for format conversion */
+    fmt_conv_t s_fmt_conv;
+
+    /** Pointer to a structure describing output display buffer */
+    ivd_out_bufdesc_t *ps_out_buffer;
+    /**
+     * Variable to store the next ctb count to compute pu idx
+     */
+    WORD32 i4_next_pu_ctb_cnt;
+
+    /**
+     * Variable to store the next ctb count to compute tu idx
+     */
+    WORD32 i4_next_tu_ctb_cnt;
+
+    /**  Active SPS id - mainly to be used during codec initializations in shared mode */
+    WORD32 i4_sps_id;
+
+    /**  Number of ctbs to be decoded in one process call */
+    UWORD32 u4_nctb;
+
+    /** Flag to enable scheduling of format conversion jobs ahead of processing jobs */
+    UWORD32 u4_enable_fmt_conv_ahead;
+
+    /** Mask used to change MVs to full pel when configured to run in reduced complexity mode */
+    WORD32 i4_mv_frac_mask;
+#ifdef GPU_BUILD
+    /**
+     * Vertical boundary strength
+     * Two bits per edge, stored in the format BS[15] | BS[14] | .. | BS[0]
+     */
+    UWORD32 *apu4_pic_vert_bs[2];
+
+    /**
+     * Horizontal boundary strength
+     * Two bits per edge, stored in the format BS[15] | BS[14] | .. | BS[0]
+     */
+    UWORD32 *apu4_pic_horz_bs[2];
+
+    /**
+     * Flags to indicate if QP is constant throughout a CTB - 1 bit for each CTB
+     * The bits are packed from LSB to MSB
+     * To get the flag corresponding to CTB with (ctb_x, ctb_y), use
+     *      pu4_qp_const_in_ctb[(ctb_x + pic_wd_in_ctb * ctb_y) >> 3] & (1 << ((ctb_x + pic_wd_in_ctb * ctb_y) & 7))
+     */
+    UWORD8 *apu1_pic_qp_const_in_ctb[2];
+
+    /**
+     *  Qp array stored for each 8x8 pixels
+     */
+    UWORD8  *apu1_pic_qp[2];
+
+    /**
+     * Pointer to frame level sao_t for the current frame being parsed
+     */
+    sao_t *aps_pic_sao[2];
+
+    /* GPU context structure */
+    gpu_ctxt_t s_gpu_ctxt;
+
+    /* Flag to switch between MC on GPU and CPU dynamically */
+    UWORD32 u4_gpu_enabled;
+
+    /* Variable to store the view (ping or pong) for parsing */
+    UWORD32 u4_parsing_view;
+
+    /*
+     * Flag to remember to add the last frame for flushing
+     * when the next call is a flush call.
+     */
+    UWORD32 u4_add_last_frame;
+#endif
+    /**  Function pointers for inter_pred leaf level functions */
+    pf_inter_pred apf_inter_pred[22];
+
+    /**  Function pointers for intra_pred_luma leaf level functions */
+    pf_intra_pred apf_intra_pred_luma[11];
+
+    /**  Function pointers for intra_pred_chroma leaf level functions */
+    pf_intra_pred apf_intra_pred_chroma[11];
+
+    /**  Function pointers for itrans_recon leaf level functions */
+    pf_itrans_recon apf_itrans_recon[8];
+
+    /**  Function pointers for recon leaf level functions */
+    pf_recon apf_recon[8];
+
+    /**  Function pointers for itrans_recon_dc leaf level functions */
+    pf_itrans_recon_dc apf_itrans_recon_dc[2];
+
+    /**  Function pointers for sao_luma leaf level functions */
+    pf_sao_luma apf_sao_luma[4];
+
+    /**  Function pointers for sao_chroma leaf level functions */
+    pf_sao_chroma apf_sao_chroma[4];
+
+    /**  Function selector for all the leaf level functions */
+    func_selector_t s_func_selector;
+    /**  Processor architecture */
+    IVD_ARCH_T e_processor_arch;
+    /**  Processor soc */
+    IVD_SOC_T e_processor_soc;
+};
+
+#endif /* _IHEVCD_STRUCTS_H_ */
diff --git a/decoder/ihevcd_trace.c b/decoder/ihevcd_trace.c
new file mode 100644
index 0000000..7811bc8
--- /dev/null
+++ b/decoder/ihevcd_trace.c
@@ -0,0 +1,144 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_trace.c
+*
+* @brief
+*  Contains trace related functions
+*
+* @author
+*  Ittiam
+*
+* @par List of Functions:
+*   - ihevcd_trace_init()
+*   - ihevcd_trace_deinit()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#ifdef TRACE
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_trace.h"
+
+
+
+/*****************************************************************************/
+/* Declare globals                                                           */
+/*****************************************************************************/
+/**
+ * Trace context
+ */
+trace_t g_trace;
+/**
+ * Trace file name
+ */
+CHAR ac_trace_fname[] = "trace.txt";
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Function used for initialization of trace parameters
+*
+* @par Description:
+*  Initialize trace structure elements
+*
+* @param[in] pc_fname
+*  File name for trace dumps
+*
+* @returns  none
+*
+* @remarks
+*  Uses a global, hence not thread safe
+*
+*******************************************************************************
+*/
+
+void ihevcd_trace_init(CHAR *pc_fname)
+{
+    trace_t *ps_trace = &g_trace;
+
+    if(pc_fname == NULL)
+        pc_fname = ac_trace_fname;
+
+    ps_trace->fp = fopen(pc_fname, "w");
+
+    if(NULL == ps_trace->fp)
+    {
+        exit(-1);
+    }
+    return;
+}
+/**
+*******************************************************************************
+*
+* @brief
+*  Function used for deinitialization of trace parameters
+*
+* @par Description:
+*  Close the trace file and deinitialize trace structure elements
+*
+* @param[in] ps_trace
+*  Pointer to trace context
+*
+* @returns  none
+*
+* @remarks
+*  Uses a global, hence not thread safe
+*
+*******************************************************************************
+*/
+void ihevcd_trace_deinit(trace_t *ps_trace)
+{
+    if(NULL != ps_trace->fp)
+    {
+        fclose(ps_trace->fp);
+    }
+    return;
+}
+
+#endif /* TRACE */
diff --git a/decoder/ihevcd_trace.h b/decoder/ihevcd_trace.h
new file mode 100644
index 0000000..09aa7d8
--- /dev/null
+++ b/decoder/ihevcd_trace.h
@@ -0,0 +1,175 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_trace.h
+*
+* @brief
+*  Header for codec trace messages
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVCD_TRACE_H_
+#define _IHEVCD_TRACE_H_
+
+#define FULLRANGE 1
+
+
+#define RANGE_NUMBITS 31
+#define RANGE_SHIFT  (RANGE_NUMBITS - 9)
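+
+/* Note (illustrative, describing the trace macros below): with
+ * RANGE_NUMBITS = 31, the full-range CABAC range value is left-shifted so
+ * that its MSB lands at bit 30 and then right-shifted by RANGE_SHIFT (22),
+ * so the printed range fits the conventional 9-bit representation.
+ */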
+
+#ifdef TRACE
+/**
+ * Context for trace
+ */
+typedef struct
+{
+    /**
+     * fp
+     */
+    FILE    *fp;
+
+    /**
+     * u8_cnt
+     */
+    ULWORD64  u8_cnt;
+}trace_t;
+
+/**
+ * Global context for trace info
+ */
+extern trace_t g_trace;
+
+/**
+ * Calls ihevcd_bits_get() to read from the bitstream and dumps the data to the trace file
+ */
+#define BITS_PARSE(m_str, m_value, m_ps_bitstrm, m_numbits)                 \
+{                                                                           \
+    m_value = ihevcd_bits_get(m_ps_bitstrm, m_numbits);                     \
+    fprintf( g_trace.fp, "%-40s u(%d) : %d\n", m_str, m_numbits, m_value ); \
+    fflush ( g_trace.fp);                                                   \
+}
+
+/**
+ * Calls ihevcd_uev() to read from the bitstream and dumps the data to the trace file
+ */
+
+#define UEV_PARSE(m_str, m_value, m_ps_bitstrm)                             \
+{                                                                           \
+    m_value = ihevcd_uev(m_ps_bitstrm);                                     \
+    fprintf( g_trace.fp, "%-40s ue(v) : %d\n", m_str, m_value );            \
+    fflush ( g_trace.fp);                                                   \
+}
+/**
+ * Calls ihevcd_sev() to read from the bitstream and dumps the data to the trace file
+ */
+#define SEV_PARSE(m_str, m_value, m_ps_bitstrm)                             \
+{                                                                           \
+    m_value = ihevcd_sev(m_ps_bitstrm);                                     \
+    fprintf( g_trace.fp, "%-40s se(v) : %d\n", m_str, m_value );            \
+    fflush ( g_trace.fp);                                                   \
+}
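+
+/*
+ * Illustrative usage of the parse macros above (the syntax element names and
+ * the variable below are hypothetical, not part of this file):
+ *
+ *     WORD32 value;
+ *     BITS_PARSE("sps_max_sub_layers_minus1", value, ps_bitstrm, 3);
+ *     UEV_PARSE("pic_width_in_luma_samples", value, ps_bitstrm);
+ *
+ * With TRACE defined, each call reads the element from the bitstream and
+ * appends a line such as "sps_max_sub_layers_minus1 u(3) : 0" to the trace
+ * file; without TRACE the same macros reduce to plain bitstream reads.
+ */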
+
+
+#if FULLRANGE
+#define TRACE_CABAC_CTXT(m_string, m_range, m_ctxt_idx)                  \
+{                                                                        \
+    UWORD32 m_clz, m_range_shift, m_state_mps;                           \
+    m_state_mps = ps_cabac->au1_ctxt_models[m_ctxt_idx];                  \
+    m_clz = CLZ(m_range);                                                \
+    m_clz -= (32 - RANGE_NUMBITS);                                       \
+    m_range_shift = m_range << m_clz;                                    \
+    m_range_shift = m_range_shift >> RANGE_SHIFT;                        \
+    fprintf( g_trace.fp, "%-40s: Range:%3d State:%3d MPS:%1d\n",         \
+        m_string, m_range_shift, m_state_mps >> 1, m_state_mps & 1);     \
+    fflush ( g_trace.fp);                                                \
+}
+#define AEV_TRACE(m_str, m_value, m_range)                                  \
+{                                                                           \
+    UWORD32 m_clz, m_range_shift;                                           \
+    m_clz = CLZ(m_range);                                                   \
+    m_clz -= (32 - RANGE_NUMBITS);                                       \
+    m_range_shift = m_range << m_clz;                                       \
+    m_range_shift = m_range_shift >> RANGE_SHIFT;                           \
+    fprintf( g_trace.fp, "%-40s:%8d R:%d\n", m_str, m_value, m_range_shift);\
+    fflush ( g_trace.fp);                                                   \
+}
+#else
+#define TRACE_CABAC_CTXT(m_string, m_range, m_ctxt_idx)                  \
+{                                                                        \
+    UWORD32 m_state_mps;                                                 \
+    m_state_mps = ps_cabac->au1_ctxt_models[m_ctxt_idx];                 \
+    fprintf( g_trace.fp, "%-40s: Range:%3d State:%3d MPS:%1d\n",         \
+        m_string, m_range, m_state_mps >> 1, m_state_mps & 1);           \
+    fflush ( g_trace.fp);                                                \
+}
+
+#define AEV_TRACE(m_str, m_value, m_range)                              \
+{                                                                       \
+    fprintf( g_trace.fp, "%-40s:%8d R:%d\n", m_str, m_value, m_range);  \
+    fflush ( g_trace.fp);                                               \
+}
+#endif
+
+#define TUV_PARSE(m_str, m_value, m_ps_bitstrm)                      \
+    m_value = ihevcd_bits_get(m_ps_bitstrm, 1);
+
+#define TRACE_INIT(a)   ihevcd_trace_init(a)
+#define TRACE_DEINIT(a) ihevcd_trace_deinit(a)
+
+#else /* TRACE */
+/**
+ * Calls ihevcd_bits_get() to read from the bitstream
+ */
+
+#define BITS_PARSE(m_str, m_value, m_ps_bitstrm, m_numbits)           \
+    m_value = ihevcd_bits_get(m_ps_bitstrm, m_numbits);
+
+/**
+ * Calls ihevcd_uev() to read from the bitstream
+ */
+
+#define UEV_PARSE(m_str, m_value, m_ps_bitstrm)                       \
+    m_value = ihevcd_uev(m_ps_bitstrm);
+
+/**
+ * Calls ihevcd_sev() to read from the bitstream
+ */
+
+#define SEV_PARSE(m_str, m_value, m_ps_bitstrm)                       \
+    m_value = ihevcd_sev(m_ps_bitstrm);
+
+#define TUV_PARSE(m_str, m_value, m_ps_bitstrm)                      \
+    m_value = ihevcd_bits_get(m_ps_bitstrm, 1);
+
+#define TRACE_CABAC_CTXT(m_string, m_range, m_state_mps)
+
+#define AEV_TRACE(m_str, m_value, m_range)
+
+
+#define TRACE_INIT(a)
+#define TRACE_DEINIT(a)
+#endif /* TRACE */
+#endif /* _IHEVCD_TRACE_H_ */
diff --git a/decoder/ihevcd_utils.c b/decoder/ihevcd_utils.c
new file mode 100644
index 0000000..36399a7
--- /dev/null
+++ b/decoder/ihevcd_utils.c
@@ -0,0 +1,1318 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_utils.c
+*
+* @brief
+*  Contains miscellaneous utility functions such as init() etc
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_error.h"
+#include "ihevc_structs.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_common_tables.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_defs.h"
+
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_trace.h"
+#include "ihevcd_process_slice.h"
+#include "ihevcd_job_queue.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+#define MAX_DPB_PIC_BUF 6
+
+/* Function declarations */
+mv_buf_t* ihevcd_mv_mgr_get_poc(buf_mgr_t *ps_mv_buf_mgr, UWORD32 abs_poc);
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Used to get level index for a given level
+*
+* @par Description:
+*  Converts from level_idc (which is multiplied by 30) to an index that can be
+*  used as a lookup. Also used to ignore invalid levels like 2.2, 3.2, etc.
+*
+* @param[in] level
+*  Level of the stream
+*
+* @returns  Level index for a given level
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_lvl_idx(WORD32 level)
+{
+    WORD32 lvl_idx = 0;
+
+    if(level < IHEVC_LEVEL_20)
+    {
+        lvl_idx = 0;
+    }
+    else if(level >= IHEVC_LEVEL_20 && level < IHEVC_LEVEL_21)
+    {
+        lvl_idx = 1;
+    }
+    else if(level >= IHEVC_LEVEL_21 && level < IHEVC_LEVEL_30)
+    {
+        lvl_idx = 2;
+    }
+    else if(level >= IHEVC_LEVEL_30 && level < IHEVC_LEVEL_31)
+    {
+        lvl_idx = 3;
+    }
+    else if(level >= IHEVC_LEVEL_31 && level < IHEVC_LEVEL_40)
+    {
+        lvl_idx = 4;
+    }
+    else if(level >= IHEVC_LEVEL_40 && level < IHEVC_LEVEL_41)
+    {
+        lvl_idx = 5;
+    }
+    else if(level >= IHEVC_LEVEL_41 && level < IHEVC_LEVEL_50)
+    {
+        lvl_idx = 6;
+    }
+    else if(level >= IHEVC_LEVEL_50 && level < IHEVC_LEVEL_51)
+    {
+        lvl_idx = 7;
+    }
+    else if(level >= IHEVC_LEVEL_51 && level < IHEVC_LEVEL_52)
+    {
+        lvl_idx = 8;
+    }
+    else if(level >= IHEVC_LEVEL_52 && level < IHEVC_LEVEL_60)
+    {
+        lvl_idx = 9;
+    }
+    else if(level >= IHEVC_LEVEL_60 && level < IHEVC_LEVEL_61)
+    {
+        lvl_idx = 10;
+    }
+    else if(level >= IHEVC_LEVEL_61 && level < IHEVC_LEVEL_62)
+    {
+        lvl_idx = 11;
+    }
+    else if(level >= IHEVC_LEVEL_62)
+    {
+        lvl_idx = 12;
+    }
+
+    return (lvl_idx);
+}
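+
+/*
+ * For example (assuming the IHEVC_LEVEL_* constants follow the level_idc
+ * convention above, i.e. level number times 30): level 3.1 (level_idc 93)
+ * maps to index 4, and an invalid level such as 3.2 (96) also falls in the
+ * [IHEVC_LEVEL_31, IHEVC_LEVEL_40) range and hence maps to index 4.
+ */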
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Used to get DPB size for a given level and number of luma samples
+*
+* @par Description:
+*  For a given width, height and level, max_dpb_size is computed as per
+*  Annex A.4.1
+*
+* @param[in] level
+*  Level of the stream
+*
+* @param[in] pic_size
+*  Width * Height
+*
+* @returns  Number of buffers in DPB
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_dpb_size(WORD32 level, WORD32 pic_size)
+{
+
+    WORD32 max_luma_samples;
+
+    WORD32 max_dpb_size;
+    WORD32 lvl_idx = ihevcd_get_lvl_idx(level);
+    max_luma_samples = gai4_ihevc_max_luma_pic_size[lvl_idx];
+
+
+
+    if(pic_size <= (max_luma_samples >> 2))
+    {
+        max_dpb_size = MIN(4 * MAX_DPB_PIC_BUF, 16);
+    }
+    else if(pic_size <= (max_luma_samples >> 1))
+    {
+        max_dpb_size = MIN(2 * MAX_DPB_PIC_BUF, 16);
+    }
+    else if(pic_size <= ((3 * max_luma_samples) >> 2))
+    {
+        max_dpb_size = MIN((4 * MAX_DPB_PIC_BUF) / 3, 16);
+    }
+    else
+    {
+        max_dpb_size = MAX_DPB_PIC_BUF;
+    }
+
+    return max_dpb_size;
+}
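+
+/*
+ * A worked example (the max luma sample count below is from Annex A and is
+ * only illustrative): at level 4.0, max_luma_samples is 2228224. A 1920x1080
+ * picture has 2073600 luma samples, which exceeds (3 * max_luma_samples) >> 2
+ * (1671168), so max_dpb_size evaluates to MAX_DPB_PIC_BUF (6).
+ */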
+/**
+*******************************************************************************
+*
+* @brief
+*  Used to get reference picture buffer size for a given level and
+*  padding used
+*
+* @par Description:
+*  Used to get reference picture buffer size for a given level and padding used
+*  Each picture is padded on all four sides
+*
+* @param[in] pic_size
+*  Number of luma samples (Width * Height)
+*
+* @param[in] level
+*  Level
+*
+* @param[in] horz_pad
+*  Total padding used in horizontal direction
+*
+* @param[in] vert_pad
+*  Total padding used in vertical direction
+*
+* @returns  Total picture buffer size
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_total_pic_buf_size(WORD32 pic_size,
+                                     WORD32 level,
+                                     WORD32 horz_pad,
+                                     WORD32 vert_pad,
+                                     WORD32 num_ref_frames,
+                                     WORD32 num_reorder_frames)
+{
+    WORD32 size;
+    WORD32 num_luma_samples;
+    WORD32 lvl_idx;
+    WORD32 max_wd;
+    WORD32 max_dpb_size;
+    WORD32 num_samples;
+    WORD32 max_num_bufs;
+    WORD32 pad = MAX(horz_pad, vert_pad);
+
+
+    /* Get maximum number of buffers for the current picture size */
+    max_dpb_size = ihevcd_get_dpb_size(level, pic_size);
+
+
+    max_num_bufs = (2 * max_dpb_size + 1);
+    /* If num_ref_frames and num_reorder_frames are specified,
+     * use the minimum value
+     */
+    max_num_bufs = MIN(max_num_bufs, (num_ref_frames + num_reorder_frames + 1));
+
+    /* Get level index */
+    lvl_idx = ihevcd_get_lvl_idx(level);
+
+    /* Maximum number of luma samples in a picture at given level */
+    num_luma_samples = gai4_ihevc_max_luma_pic_size[lvl_idx];
+
+    /* Account for chroma */
+    num_samples = num_luma_samples * 3 / 2;
+
+    /* Maximum width of luma samples in a picture at given level */
+    max_wd = gai4_ihevc_max_wd_ht[lvl_idx];
+
+
+    /* Allocation is required for
+     * (Wd + horz_pad) * (Ht + vert_pad) * (2 * max_dpb_size + 1)
+     *
+     * Above expanded as
+     * ((Wd * Ht) + (horz_pad * vert_pad) + Wd * vert_pad + Ht * horz_pad) * (2 * max_dpb_size + 1)
+     * (Wd * Ht) * (2 * max_dpb_size + 1) + ((horz_pad * vert_pad) + Wd * vert_pad + Ht * horz_pad) * (2 * max_dpb_size + 1)
+     * Now max_dpb_size increases with smaller Wd and Ht, but Wd * Ht * max_dpb_size will still be less than or equal to max_wd * max_ht * dpb_size
+     *
+     * In the above equation (Wd * Ht) * (2 * max_dpb_size + 1) is accounted for by using num_samples * (2 * max_dpb_size + 1) below
+     *
+     * For the padded area use MAX(horz_pad, vert_pad) as pad
+     * ((pad * pad) + pad * (Wd + Ht)) * (2 * max_dpb_size + 1) has to be accounted from the above for padding
+     *
+     * Since width and height can change, the worst case Wd + Ht occurs when one of the dimensions is at its maximum and the other at its minimum
+     * So use max_wd and min_ht
+     */
+
+    /* Number of bytes in reference pictures */
+    size = num_samples * max_num_bufs;
+
+    /* Account for padding area */
+    size += ((pad * pad) + pad * (max_wd + max_wd)) * max_num_bufs;
+
+    return size;
+}
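+
+/*
+ * A quick sanity check of the expansion in the comment above, with purely
+ * illustrative numbers Wd = Ht = 64 and pad = 16:
+ * (64 + 16) * (64 + 16) = 6400, and indeed
+ * 64 * 64 + 16 * 16 + 16 * (64 + 64) = 4096 + 256 + 2048 = 6400.
+ */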
+/**
+*******************************************************************************
+*
+* @brief
+*  Used to get MV bank size for a given number of luma samples
+*
+* @par Description:
+*  For a given number of luma samples, one MV bank size is computed
+*  Each MV bank includes pu_map and pu_t for all the min PUs (4x4) in a picture
+*
+* @param[in] num_luma_samples
+*  Max number of luma pixels in the frame
+*
+* @returns  Total MV Bank size
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_pic_mv_bank_size(WORD32 num_luma_samples)
+{
+    WORD32 size;
+
+    WORD32 pic_size;
+
+    WORD32 mv_bank_size;
+    WORD32 num_pu;
+    WORD32 num_ctb;
+    pic_size = num_luma_samples;
+
+
+    num_pu = pic_size / (MIN_PU_SIZE * MIN_PU_SIZE);
+    num_ctb = pic_size / (MIN_CTB_SIZE * MIN_CTB_SIZE);
+
+    mv_bank_size = 0;
+
+    /* Size for storing pu_t start index for each CTB */
+    /* One extra entry is needed to compute number of PUs in the last CTB */
+    mv_bank_size += (num_ctb + 1) * sizeof(WORD32);
+
+    /* Size for pu_map */
+    mv_bank_size += num_pu;
+
+    /* Size for storing pu_t for each PU */
+    mv_bank_size += num_pu * sizeof(pu_t);
+
+
+    size =  mv_bank_size;
+    return size;
+}
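+
+/*
+ * For illustration, with a 1920x1088 picture (2088960 luma samples) and the
+ * 4x4 min PU / 16x16 min CTB sizes used above:
+ * num_pu = 2088960 / 16 = 130560 and num_ctb = 2088960 / 256 = 8160, so the
+ * MV bank holds (8160 + 1) WORD32 start indices, 130560 pu_map bytes and
+ * 130560 pu_t entries.
+ */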
+/**
+*******************************************************************************
+*
+* @brief
+*  Used to get TU data size for a given number of luma samples
+*
+* @par Description:
+*  For a given number of luma samples TU data size is computed
+*  Each TU data includes tu_map and tu_t and coeff data for all
+*  the min TUs (4x4) in a given CTB
+*
+* @param[in] num_luma_samples
+*  Number of luma samples for which TU data has to be allocated.
+*
+* @returns  Total TU data size
+*
+* @remarks The assumption is that num_luma_samples will be at least
+* 64 x 64 to handle a CTB of size 64 x 64. It can be the frame size as well
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_tu_data_size(WORD32 num_luma_samples)
+{
+
+
+    WORD32 tu_data_size;
+    WORD32 num_ctb;
+    WORD32 num_luma_tu, num_chroma_tu, num_tu;
+    num_ctb = num_luma_samples / (MIN_CTB_SIZE * MIN_CTB_SIZE);
+
+    num_luma_tu = num_luma_samples / (MIN_TU_SIZE * MIN_TU_SIZE);
+    num_chroma_tu = num_luma_tu >> 1;
+
+    num_tu = num_luma_tu + num_chroma_tu;
+    tu_data_size = 0;
+
+    /* Size for storing tu_t start index for each CTB */
+    /* One extra entry is needed to compute number of TUs in the last CTB */
+    tu_data_size += (num_ctb + 1) * sizeof(WORD32);
+
+    /* Size for storing tu map */
+    tu_data_size += num_luma_tu * sizeof(UWORD8);
+
+    /* Size for storing tu_t for each TU */
+    tu_data_size += num_tu * sizeof(tu_t);
+
+    /* Size for storing number of coded subblocks and scan_idx for each TU */
+    tu_data_size += num_tu * (sizeof(WORD8) + sizeof(WORD8));
+
+    /* Size for storing coeff data for each TU */
+    tu_data_size += num_tu * sizeof(tu_sblk_coeff_data_t);
+
+
+    return tu_data_size;
+}
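+
+/*
+ * For illustration, for one 64x64 CTB (4096 luma samples) with the 4x4 min TU
+ * and 16x16 min CTB sizes used above: num_ctb = 4096 / 256 = 16,
+ * num_luma_tu = 4096 / 16 = 256, num_chroma_tu = 128 (4:2:0), num_tu = 384;
+ * the returned size covers 17 WORD32 start indices, 256 tu map bytes, 384
+ * tu_t entries, 384 pairs of subblock-count/scan_idx bytes and 384
+ * tu_sblk_coeff_data_t entries.
+ */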
+
+
+WORD32 ihevcd_nctb_cnt(codec_t *ps_codec, sps_t *ps_sps)
+{
+    WORD32 nctb = 1;
+    UNUSED(ps_codec);
+    //TODO: Currently set to 1
+    /* If CTB size is less than 32 x 32 then set nCTB as 4 */
+    if(ps_sps->i1_log2_ctb_size < 5)
+        nctb = 1;
+
+    return nctb;
+}
+
+IHEVCD_ERROR_T ihevcd_get_tile_pos(pps_t *ps_pps,
+                                   sps_t *ps_sps,
+                                   WORD32 ctb_x,
+                                   WORD32 ctb_y,
+                                   WORD32 *pi4_ctb_tile_x,
+                                   WORD32 *pi4_ctb_tile_y,
+                                   WORD32 *pi4_tile_idx)
+{
+
+    tile_t *ps_tile_tmp;
+    WORD32 i;
+    WORD32 tile_row, tile_col;
+
+    if(ctb_x < 0 || ctb_y < 0)
+    {
+        *pi4_ctb_tile_x = 0;
+        *pi4_ctb_tile_y = 0;
+        *pi4_tile_idx = 0;
+
+        return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    }
+
+    tile_row = 0;
+    tile_col = 0;
+    ps_tile_tmp = ps_pps->ps_tile;
+    if(0 == ps_pps->i1_tiles_enabled_flag)
+    {
+        *pi4_ctb_tile_x = ctb_x;
+        *pi4_ctb_tile_y = ctb_y;
+        *pi4_tile_idx = 0;
+    }
+    else
+    {
+        for(i = 0; i < ps_pps->i1_num_tile_columns; i++)
+        {
+            WORD16 next_tile_ctb_x;
+            ps_tile_tmp = ps_pps->ps_tile + i; //* ps_pps->i1_num_tile_rows;
+            if((ps_pps->i1_num_tile_columns - 1) == i)
+            {
+                next_tile_ctb_x = ps_sps->i2_pic_wd_in_ctb;
+            }
+            else
+            {
+                tile_t *ps_tile_next_tmp;
+                ps_tile_next_tmp = ps_pps->ps_tile + i + 1;
+                next_tile_ctb_x = ps_tile_next_tmp->u1_pos_x;
+            }
+            if((ctb_x >= ps_tile_tmp->u1_pos_x) && (ctb_x < next_tile_ctb_x))
+            {
+                tile_col = i;
+                break;
+            }
+        }
+        *pi4_ctb_tile_x = ctb_x - ps_tile_tmp->u1_pos_x;
+
+        for(i = 0; i < ps_pps->i1_num_tile_rows; i++)
+        {
+            WORD16 next_tile_ctb_y;
+            ps_tile_tmp = ps_pps->ps_tile + i * ps_pps->i1_num_tile_columns;
+            if((ps_pps->i1_num_tile_rows - 1) == i)
+            {
+                next_tile_ctb_y = ps_sps->i2_pic_ht_in_ctb;
+            }
+            else
+            {
+                tile_t *ps_tile_next_tmp;
+                ps_tile_next_tmp = ps_pps->ps_tile + ((i + 1) * ps_pps->i1_num_tile_columns);
+                next_tile_ctb_y = ps_tile_next_tmp->u1_pos_y;
+            }
+            if((ctb_y >= ps_tile_tmp->u1_pos_y) && (ctb_y < next_tile_ctb_y))
+            {
+                tile_row = i;
+                break;
+            }
+
+        }
+        *pi4_ctb_tile_y = ctb_y - ps_tile_tmp->u1_pos_y;
+        *pi4_tile_idx = tile_row * ps_pps->i1_num_tile_columns
+                        + tile_col;
+    }
+    return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+}
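+
+/*
+ * For illustration: with two tile columns whose u1_pos_x values are 0 and 8,
+ * a CTB at ctb_x = 10 satisfies 10 >= 8 and 10 < i2_pic_wd_in_ctb, so
+ * tile_col becomes 1 and *pi4_ctb_tile_x becomes 10 - 8 = 2; rows are handled
+ * the same way to give *pi4_ctb_tile_y and the final tile index.
+ */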
+/**
+*******************************************************************************
+*
+* @brief
+*  Function to initialize ps_pic_buf structs and add pic buffers to
+*  the buffer manager in case of non-shared mode
+*
+* @par Description:
+*  Function to initialize ps_pic_buf structs and add pic buffers to
+*  the buffer manager in case of non-shared mode
+*  To be called once per stream or for every reset
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @returns  Error from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_pic_buf_mgr_add_bufs(codec_t *ps_codec)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 i;
+    WORD32 max_dpb_size;
+    sps_t *ps_sps;
+    UWORD8 *pu1_buf;
+    pic_buf_t *ps_pic_buf;
+    WORD32 pic_buf_size_allocated;
+
+    WORD32 max_num_bufs;
+    WORD32 pic_size;
+    WORD32 level;
+
+
+    /* Get the active SPS from the parse context */
+    ps_sps = ps_codec->s_parse.ps_sps;
+
+    pic_size = ps_sps->i2_pic_width_in_luma_samples *
+                    ps_sps->i2_pic_height_in_luma_samples;
+
+
+    /* Compute the number of picture buffers needed */
+    level = ps_codec->i4_init_level;
+    max_dpb_size = ihevcd_get_dpb_size(level, pic_size);
+    /* Allocate twice dpb size to handle worst case reorder without returning more
+     * than one output per call
+     */
+    max_dpb_size *= 2;
+    /* Allocate one extra picture to handle current frame
+     * In case of asynchronous parsing and processing, number of buffers should increase here
+     * based on when parsing and processing threads are synchronized
+     */
+    max_dpb_size++;
+
+    /* If num_ref_frames and num_reorder_frames are specified,
+     * use the minimum value
+     */
+    max_num_bufs = MIN(max_dpb_size, (ps_codec->i4_init_num_ref + ps_codec->i4_init_num_reorder + 1));
+
+
+    pu1_buf = (UWORD8 *)ps_codec->ps_pic_buf;
+
+    ps_pic_buf = (pic_buf_t *)ps_codec->ps_pic_buf;
+
+    pu1_buf += BUF_MGR_MAX_CNT  * sizeof(pic_buf_t);
+
+    /* In case of non-shared mode, add picture buffers to buffer manager
+     * In case of shared mode buffers are added in the run-time
+     */
+    if(0 == ps_codec->i4_share_disp_buf)
+    {
+        WORD32 buf_ret;
+        WORD32 luma_samples;
+        WORD32 chroma_samples;
+        pic_buf_size_allocated = ps_codec->i4_total_pic_buf_size -
+                        BUF_MGR_MAX_CNT * sizeof(pic_buf_t);
+
+        luma_samples = (ps_codec->i4_strd) *
+                        (ps_sps->i2_pic_height_in_luma_samples + PAD_HT);
+
+        chroma_samples = luma_samples / 2;
+
+        /* Try to add as many buffers as possible since memory is already allocated */
+        /* If the number of buffers that can be added is less than max_num_bufs
+         * return with an error.
+         */
+        for(i = 0; i < (2 * MAX_DPB_SIZE) + 1; i++)
+        {
+            pic_buf_size_allocated -= (luma_samples + chroma_samples);
+
+            if(pic_buf_size_allocated < 0)
+            {
+                if(i < max_num_bufs)
+                {
+                    ps_codec->s_parse.i4_error_code = IHEVCD_INSUFFICIENT_MEM_PICBUF;
+                    return IHEVCD_INSUFFICIENT_MEM_PICBUF;
+                }
+                break;
+            }
+
+            ps_pic_buf->pu1_luma = pu1_buf + ps_codec->i4_strd * PAD_TOP + PAD_LEFT;
+            pu1_buf += luma_samples;
+
+            ps_pic_buf->pu1_chroma = pu1_buf + ps_codec->i4_strd * (PAD_TOP / 2) + PAD_LEFT;
+            pu1_buf += chroma_samples;
+
+            buf_ret = ihevc_buf_mgr_add((buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_pic_buf, i);
+
+            if(0 != buf_ret)
+            {
+                ps_codec->s_parse.i4_error_code = IHEVCD_BUF_MGR_ERROR;
+                return IHEVCD_BUF_MGR_ERROR;
+            }
+            ps_pic_buf++;
+        }
+    }
+
+    return ret;
+}
+/**
+*******************************************************************************
+*
+* @brief
+*  Function to add buffers to MV Bank buffer manager
+*
+* @par Description:
+*  Function to add buffers to MV Bank buffer manager
+*  To be called once per stream or for every reset
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @returns  Error from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_mv_buf_mgr_add_bufs(codec_t *ps_codec)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    WORD32 i;
+    WORD32 max_dpb_size;
+    WORD32 mv_bank_size_allocated;
+    WORD32 pic_mv_bank_size;
+    WORD32 level;
+    sps_t *ps_sps;
+    UWORD8 *pu1_buf;
+    mv_buf_t *ps_mv_buf;
+
+
+    /* Initialize MV Bank buffer manager */
+    ps_sps = ps_codec->s_parse.ps_sps;
+
+
+    /* Compute the number of MV Bank buffers needed */
+    level = ps_codec->i4_init_level;
+    max_dpb_size = ihevcd_get_dpb_size(level,
+                                       ps_sps->i2_pic_width_in_luma_samples *
+                                       ps_sps->i2_pic_height_in_luma_samples);
+
+    /* Allocate one extra MV Bank to handle current frame
+     * In case of asynchronous parsing and processing, number of buffers should increase here
+     * based on when parsing and processing threads are synchronized
+     */
+    max_dpb_size++;
+
+    pu1_buf = (UWORD8 *)ps_codec->pv_mv_bank_buf_base;
+
+    ps_mv_buf = (mv_buf_t *)pu1_buf;
+    pu1_buf += BUF_MGR_MAX_CNT  * sizeof(mv_buf_t);
+    ps_codec->ps_mv_buf = ps_mv_buf;
+    mv_bank_size_allocated = ps_codec->i4_total_mv_bank_size - BUF_MGR_MAX_CNT * sizeof(mv_buf_t);
+
+    /* Compute MV bank size per picture */
+    pic_mv_bank_size = ihevcd_get_pic_mv_bank_size(ps_sps->i2_pic_width_in_luma_samples *
+                                                   ps_sps->i2_pic_height_in_luma_samples);
+
+    for(i = 0; i < max_dpb_size; i++)
+    {
+        WORD32 buf_ret;
+        WORD32 num_pu;
+        WORD32 num_ctb;
+        WORD32 pic_size;
+        pic_size = ALIGN64(ps_sps->i2_pic_width_in_luma_samples) *
+                        ALIGN64(ps_sps->i2_pic_height_in_luma_samples);
+
+
+        num_pu = pic_size / (MIN_PU_SIZE * MIN_PU_SIZE);
+        num_ctb = pic_size / (MIN_CTB_SIZE * MIN_CTB_SIZE);
+
+
+        mv_bank_size_allocated -= pic_mv_bank_size;
+
+        if(mv_bank_size_allocated < 0)
+        {
+            ps_codec->s_parse.i4_error_code = IHEVCD_INSUFFICIENT_MEM_MVBANK;
+            return IHEVCD_INSUFFICIENT_MEM_MVBANK;
+        }
+
+        ps_mv_buf->pu4_pic_pu_idx = (UWORD32 *)pu1_buf;
+        pu1_buf += (num_ctb + 1) * sizeof(WORD32);
+
+        ps_mv_buf->pu1_pic_pu_map = pu1_buf;
+        pu1_buf += num_pu;
+
+        ps_mv_buf->pu1_pic_slice_map = (UWORD16 *)pu1_buf;
+        pu1_buf += num_ctb * sizeof(UWORD16);
+
+        ps_mv_buf->ps_pic_pu = (pu_t *)pu1_buf;
+
+        buf_ret = ihevc_buf_mgr_add((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, ps_mv_buf, i);
+
+        if(0 != buf_ret)
+        {
+            ps_codec->s_parse.i4_error_code = IHEVCD_BUF_MGR_ERROR;
+            return IHEVCD_BUF_MGR_ERROR;
+        }
+        pu1_buf += pic_mv_bank_size;
+        ps_mv_buf++;
+
+    }
+    return ret;
+}
+/**
+*******************************************************************************
+*
+* @brief
+*  Picture level initializations required during parsing
+*
+* @par Description:
+*  Initialize picture level context variables during parsing. Initialize MV
+*  bank buffer manager in the first init call
+*
+* @param[in] ps_codec
+*  Pointer to codec context
+*
+* @returns  Error from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_parse_pic_init(codec_t *ps_codec)
+{
+    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+    mv_buf_t *ps_mv_buf;
+    sps_t *ps_sps;
+    WORD32 num_min_cu;
+    WORD32 cur_pic_buf_id;
+    WORD32 cur_mv_bank_buf_id;
+    pic_buf_t *ps_cur_pic;
+    slice_header_t *ps_slice_hdr;
+    UWORD8 *pu1_cur_pic_luma, *pu1_cur_pic_chroma;
+    WORD32 i;
+
+    ps_codec->s_parse.i4_error_code = IHEVCD_SUCCESS;
+    ps_sps = ps_codec->s_parse.ps_sps;
+#ifdef GPU_BUILD
+    //TODO GPU : Later define it for ARM only version as well
+    ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base + (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+#else
+    ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+#endif
+    /* If parse_pic_init is called, then slice data is present in the input bitstream */
+    ps_codec->i4_pic_present = 1;
+
+    /* Memset picture level intra map and transquant bypass map to zero */
+#ifdef GPU_BUILD
+    ps_codec->s_parse.pu1_pic_intra_flag = ps_codec->apu1_pic_intra_flag[ps_codec->u4_parsing_view];
+    ps_codec->s_parse.pu1_pic_no_loop_filter_flag = ps_codec->apu1_pic_no_loop_filter_flag[ps_codec->u4_parsing_view];
+#endif
+    num_min_cu = ((ps_sps->i2_pic_height_in_luma_samples + 7) / 8) * ((ps_sps->i2_pic_width_in_luma_samples + 63) / 64);
+    memset(ps_codec->s_parse.pu1_pic_intra_flag, 0, num_min_cu);
+    memset(ps_codec->s_parse.pu1_pic_no_loop_filter_flag, 0, num_min_cu);
+
+
+
+    if(0 == ps_codec->s_parse.i4_first_pic_init)
+    {
+        ret = ihevcd_mv_buf_mgr_add_bufs(ps_codec);
+        RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+        ret = ihevcd_pic_buf_mgr_add_bufs(ps_codec);
+        RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+        ps_codec->s_parse.i4_first_pic_init = 1;
+    }
+
+    /* Initialize all the slice headers' slice addresses to an invalid value (-1) */
+    {
+        WORD32 slice_idx;
+        WORD32 slice_start_idx;
+
+        slice_start_idx = ps_codec->i4_slice_error ? 2 : 1;
+
+        for(slice_idx = slice_start_idx; slice_idx < MAX_SLICE_HDR_CNT; slice_idx++)
+        {
+#ifdef GPU_BUILD
+            slice_header_t *ps_slice_hdr_tmp = ps_codec->aps_slice_hdr_base[0] + slice_idx;
+            ps_slice_hdr_tmp->i2_ctb_x = -1;
+            ps_slice_hdr_tmp->i2_ctb_y = -1;
+            ps_slice_hdr_tmp = ps_codec->aps_slice_hdr_base[1] + slice_idx;
+            ps_slice_hdr_tmp->i2_ctb_x = -1;
+            ps_slice_hdr_tmp->i2_ctb_y = -1;
+#else
+            slice_header_t *ps_slice_hdr_tmp = ps_codec->ps_slice_hdr_base + slice_idx;
+            ps_slice_hdr_tmp->i2_ctb_x = -1;
+            ps_slice_hdr_tmp->i2_ctb_y = -1;
+#endif
+
+        }
+    }
+
+    /* Get free MV Bank to hold current picture's motion vector data */
+    {
+        ps_mv_buf = (mv_buf_t *)ihevc_buf_mgr_get_next_free((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, &cur_mv_bank_buf_id);
+
+        /* If there are no free buffers then return with an error code.
+         * If the buffer is to be freed by another thread, change the
+         * following to call thread yield and wait for buffer to be freed
+         */
+        if(NULL == ps_mv_buf)
+        {
+            ps_codec->s_parse.i4_error_code = IHEVCD_NO_FREE_MVBANK;
+            ps_codec->i4_error_code = IHEVCD_NO_FREE_MVBANK;
+            return IHEVCD_NO_FREE_MVBANK;
+        }
+
+        ps_codec->s_parse.ps_cur_mv_buf = ps_mv_buf;
+        /* Set current ABS poc to ps_mv_buf, so that while freeing a reference buffer
+         * corresponding mv buffer can be found by looping through ps_codec->ps_mv_buf array
+         * and getting a buffer id to free
+         */
+        ps_mv_buf->i4_abs_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+    }
+
+    /* Get free picture buffer to hold current picture recon data */
+    /* TODO: For asynchronous api the following initializations related to picture
+     * buffer should be moved to processing side
+     */
+    {
+
+        UWORD8 *pu1_buf;
+        ps_cur_pic = (pic_buf_t *)ihevc_buf_mgr_get_next_free((buf_mgr_t *)ps_codec->pv_pic_buf_mgr, &cur_pic_buf_id);
+
+        /* If there are no free buffers then return with an error code.
+         * TODO: If the buffer is to be freed by another thread, change the
+         * following to call thread yield and wait for buffer to be freed
+         */
+        if(NULL == ps_cur_pic)
+        {
+            ps_codec->s_parse.i4_error_code = IHEVCD_NO_FREE_PICBUF;
+            ps_codec->i4_error_code = IHEVCD_NO_FREE_PICBUF;
+            return IHEVCD_NO_FREE_PICBUF;
+        }
+
+        /* Store input timestamp sent with input buffer */
+        ps_cur_pic->u4_ts = ps_codec->u4_ts;
+        ps_cur_pic->i4_abs_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+        ps_cur_pic->i4_poc_lsb = ps_slice_hdr->i4_pic_order_cnt_lsb;
+        pu1_buf = ps_cur_pic->pu1_luma;
+        pu1_cur_pic_luma = pu1_buf;
+
+        pu1_buf = ps_cur_pic->pu1_chroma;
+
+        pu1_cur_pic_chroma = pu1_buf;
+    }
+
+    if(0 == ps_codec->u4_pic_cnt)
+    {
+        memset(ps_cur_pic->pu1_luma, 128, (ps_sps->i2_pic_width_in_luma_samples + PAD_WD) * ps_sps->i2_pic_height_in_luma_samples);
+        memset(ps_cur_pic->pu1_chroma, 128, (ps_sps->i2_pic_width_in_luma_samples + PAD_WD) * ps_sps->i2_pic_height_in_luma_samples / 2);
+    }
+
+    /* Fill the remaining entries of the reference lists with the nearest POC.
+     * This is done to handle cases where there is a corruption in the reference index */
+    {
+        pic_buf_t *ps_pic_buf_ref;
+        mv_buf_t *ps_mv_buf_ref;
+        WORD32 r_idx;
+        dpb_mgr_t *ps_dpb_mgr = (dpb_mgr_t *)ps_codec->pv_dpb_mgr;
+        buf_mgr_t *ps_mv_buf_mgr = (buf_mgr_t *)ps_codec->pv_mv_buf_mgr;
+
+        ps_pic_buf_ref = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ps_slice_hdr->i4_abs_pic_order_cnt);
+        if(NULL == ps_pic_buf_ref)
+        {
+            ps_pic_buf_ref = ps_cur_pic;
+            ps_mv_buf_ref = ps_mv_buf;
+        }
+        else
+        {
+            ps_mv_buf_ref = ihevcd_mv_mgr_get_poc(ps_mv_buf_mgr, ps_pic_buf_ref->i4_abs_poc);
+        }
+
+        for(r_idx = 0; r_idx < ps_slice_hdr->i1_num_ref_idx_l0_active; r_idx++)
+        {
+            if(NULL == ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf)
+            {
+                ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+                ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+            }
+        }
+
+        for(r_idx = ps_slice_hdr->i1_num_ref_idx_l0_active; r_idx < MAX_DPB_SIZE; r_idx++)
+        {
+            ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+            ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+        }
+
+        for(r_idx = 0; r_idx < ps_slice_hdr->i1_num_ref_idx_l1_active; r_idx++)
+        {
+            if(NULL == ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf)
+            {
+                ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+                ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+            }
+        }
+
+        for(r_idx = ps_slice_hdr->i1_num_ref_idx_l1_active; r_idx < MAX_DPB_SIZE; r_idx++)
+        {
+            ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+            ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+        }
+    }
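+    /* For example, if a corrupted bitstream signals ref_idx = 5 while only entry 0
+     * of list 0 was populated, entries 1 through MAX_DPB_SIZE - 1 above still point
+     * at the nearest-POC picture, so motion compensation reads from a valid buffer
+     * instead of dereferencing NULL. */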
+
+
+    /* Reset the jobq to start of the jobq buffer */
+    ihevcd_jobq_reset((jobq_t *)ps_codec->pv_proc_jobq);
+
+    ps_codec->s_parse.i4_pic_pu_idx = 0;
+    ps_codec->s_parse.i4_pic_tu_idx = 0;
+
+    ps_codec->s_parse.pu1_pic_pu_map = ps_mv_buf->pu1_pic_pu_map;
+    ps_codec->s_parse.ps_pic_pu      = ps_mv_buf->ps_pic_pu;
+    ps_codec->s_parse.pu4_pic_pu_idx = ps_mv_buf->pu4_pic_pu_idx;
+    ps_codec->s_parse.pu1_slice_idx = (UWORD16 *)ps_mv_buf->pu1_pic_slice_map;
+#ifndef GPU_BUILD
+    for(i = 0; i < MAX_PROCESS_THREADS; i++)
+    {
+        ps_codec->as_process[i].pu1_slice_idx = (UWORD16 *)ps_mv_buf->pu1_pic_slice_map;
+    }
+#endif
+    ps_codec->s_parse.pu1_pu_map = ps_codec->s_parse.pu1_pic_pu_map;
+    ps_codec->s_parse.ps_pu = ps_codec->s_parse.ps_pic_pu;
+
+    {
+        UWORD8 *pu1_buf;
+        WORD32 ctb_luma_min_tu_cnt, ctb_chroma_min_tu_cnt, ctb_min_tu_cnt;
+        WORD32 pic_size;
+        WORD32 num_ctb;
+
+        pic_size = ps_sps->i2_pic_width_in_luma_samples *
+                        ps_sps->i2_pic_height_in_luma_samples;
+
+        ctb_luma_min_tu_cnt = pic_size / (MIN_TU_SIZE * MIN_TU_SIZE);
+
+        ctb_chroma_min_tu_cnt = ctb_luma_min_tu_cnt >> 1;
+
+        ctb_min_tu_cnt = ctb_luma_min_tu_cnt + ctb_chroma_min_tu_cnt;
+
+        num_ctb = pic_size / (MIN_CTB_SIZE * MIN_CTB_SIZE);
+#ifdef GPU_BUILD
+        pu1_buf  = (UWORD8 *)ps_codec->apv_tu_data[ps_codec->u4_parsing_view];
+#else
+        pu1_buf  = (UWORD8 *)ps_codec->pv_tu_data;
+#endif
+        ps_codec->s_parse.pu4_pic_tu_idx = (UWORD32 *)pu1_buf;
+        pu1_buf += (num_ctb + 1) * sizeof(WORD32);
+
+        ps_codec->s_parse.pu1_pic_tu_map = pu1_buf;
+        pu1_buf += ctb_min_tu_cnt;
+
+        ps_codec->s_parse.ps_pic_tu = (tu_t *)pu1_buf;
+        pu1_buf += ctb_min_tu_cnt * sizeof(tu_t);
+
+        ps_codec->s_parse.pv_pic_tu_coeff_data = pu1_buf;
+
+        ps_codec->s_parse.pu1_tu_map = ps_codec->s_parse.pu1_pic_tu_map;
+        ps_codec->s_parse.ps_tu = ps_codec->s_parse.ps_pic_tu;
+        ps_codec->s_parse.pv_tu_coeff_data = ps_codec->s_parse.pv_pic_tu_coeff_data;
+    }
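+    /* The TU data buffer is thus partitioned into four consecutive regions:
+     *   [pu4_pic_tu_idx : (num_ctb + 1) WORD32 entries]
+     *   [pu1_pic_tu_map : ctb_min_tu_cnt bytes]
+     *   [ps_pic_tu      : ctb_min_tu_cnt tu_t entries]
+     *   [coeff data     : remainder of the buffer]
+     * As a worked example, assuming MIN_TU_SIZE = 4 and MIN_CTB_SIZE = 16, a 64x64
+     * picture gives ctb_luma_min_tu_cnt = 4096 / 16 = 256, ctb_chroma_min_tu_cnt =
+     * 128, ctb_min_tu_cnt = 384 and num_ctb = 16. */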
+
+    ps_codec->s_parse.s_bs_ctxt.ps_pic_pu = ps_codec->s_parse.ps_pic_pu;
+    ps_codec->s_parse.s_bs_ctxt.pu4_pic_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx;
+    ps_codec->s_parse.s_bs_ctxt.pu4_pic_tu_idx = ps_codec->s_parse.pu4_pic_tu_idx;
+
+
+    /* Set number of CTBs to be processed simultaneously */
+    ps_codec->i4_proc_nctb = ihevcd_nctb_cnt(ps_codec, ps_sps);
+
+    /* Memset parse map and process map at the start of the frame */
+    //TODO: In case of an asynchronous API, proc_map cannot be set to zero here
+    {
+        WORD32 num_ctb;
+
+        num_ctb = ps_sps->i4_pic_size_in_ctb;
+
+        memset(ps_codec->pu1_parse_map, 0, num_ctb);
+
+#ifndef GPU_BUILD
+        memset(ps_codec->pu1_proc_map, 0, num_ctb);
+#endif
+    }
+
+
+
+    /* Initialize disp buf id to -1; this will be updated at the end of the frame
+     * if there is a buffer to be displayed.
+     */
+    ps_codec->i4_disp_buf_id = -1;
+    ps_codec->ps_disp_buf = NULL;
+
+    ps_codec->i4_disable_deblk_pic  = 0;
+    ps_codec->i4_disable_sao_pic    = 0;
+    ps_codec->i4_fullpel_inter_pred = 0;
+    ps_codec->i4_mv_frac_mask       = 0x7FFFFFFF;
+
+    /* If degrade is enabled, set the degrade flags appropriately */
+    if(ps_codec->i4_degrade_type && ps_codec->i4_degrade_pics)
+    {
+        WORD32 degrade_pic;
+        ps_codec->i4_degrade_pic_cnt++;
+        degrade_pic = 0;
+
+        /* If degrade is to be done in all frames, then do not check further */
+        switch(ps_codec->i4_degrade_pics)
+        {
+            case 4:
+            {
+                degrade_pic = 1;
+                break;
+            }
+            case 3:
+            {
+                if(ps_slice_hdr->i1_slice_type != ISLICE)
+                    degrade_pic = 1;
+
+                break;
+            }
+            case 2:
+            {
+
+                /* If the pic count hits the non-degrade interval or it is an I slice, do not degrade */
+                if((ps_slice_hdr->i1_slice_type != ISLICE) &&
+                   (ps_codec->i4_degrade_pic_cnt != ps_codec->i4_nondegrade_interval))
+                    degrade_pic = 1;
+
+                break;
+            }
+            case 1:
+            {
+                /* Check if the current picture is non-ref */
+                if((ps_slice_hdr->i1_nal_unit_type < NAL_BLA_W_LP) &&
+                   (ps_slice_hdr->i1_nal_unit_type % 2 == 0))
+                {
+                    degrade_pic = 1;
+                }
+                break;
+            }
+
+
+        }
+        if(degrade_pic)
+        {
+            if(ps_codec->i4_degrade_type & 0x1)
+                ps_codec->i4_disable_sao_pic = 1;
+
+            if(ps_codec->i4_degrade_type & 0x2)
+                ps_codec->i4_disable_deblk_pic = 1;
+
+            /* MC degrading is done only for non-ref pictures */
+            if((ps_slice_hdr->i1_nal_unit_type < NAL_BLA_W_LP) &&
+               (ps_slice_hdr->i1_nal_unit_type % 2 == 0))
+            {
+                if(ps_codec->i4_degrade_type & 0x4)
+                    ps_codec->i4_mv_frac_mask = 0;
+
+                if(ps_codec->i4_degrade_type & 0x8)
+                    ps_codec->i4_mv_frac_mask = 0;
+            }
+        }
+        else
+            ps_codec->i4_degrade_pic_cnt = 0;
+    }
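+    /* To summarize the knobs above: i4_degrade_pics selects which pictures may be
+     * degraded (1: non-reference pictures only, 2: non-I pictures outside the
+     * non-degrade interval, 3: all non-I pictures, 4: all pictures), while
+     * i4_degrade_type is a bitmask (0x1: disable SAO, 0x2: disable deblocking,
+     * 0x4/0x8: full-pel motion compensation, applied only to non-reference
+     * pictures). */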
+
+
+    {
+        WORD32 i;
+#ifdef GPU_BUILD
+        gpu_ctxt_t *ps_gpu = &ps_codec->s_gpu_ctxt;
+        ps_gpu->i4_curr_grain_ctb_cnt = 0;
+        ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs = ps_codec->apu4_pic_vert_bs[ps_codec->u4_parsing_view];
+        ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs = ps_codec->apu4_pic_horz_bs[ps_codec->u4_parsing_view];
+        ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp = ps_codec->apu1_pic_qp[ps_codec->u4_parsing_view];
+        ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb = ps_codec->apu1_pic_qp_const_in_ctb[ps_codec->u4_parsing_view];
+
+        ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs;
+        ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs;
+        ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp;
+        ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb;
+
+        ps_codec->s_parse.s_deblk_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->apu1_pic_no_loop_filter_flag[ps_codec->u4_parsing_view];
+        ps_codec->s_parse.s_sao_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->apu1_pic_no_loop_filter_flag[ps_codec->u4_parsing_view];
+
+        ps_codec->s_parse.s_deblk_ctxt.ps_slice_hdr_base = ps_codec->s_parse.ps_slice_hdr_base;
+
+        ps_codec->s_parse.ps_pic_sao            = (sao_t *)ps_codec->aps_pic_sao[ps_codec->u4_parsing_view];
+        ps_codec->s_parse.s_sao_ctxt.ps_pic_sao = (sao_t *)ps_codec->aps_pic_sao[ps_codec->u4_parsing_view];
+#endif
+        for(i = 0; i < MAX_PROCESS_THREADS; i++)
+        {
+            ps_codec->as_process[i].pu4_pic_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx;
+            ps_codec->as_process[i].ps_pic_pu = ps_codec->s_parse.ps_pic_pu;
+            ps_codec->as_process[i].pu1_pic_pu_map = ps_codec->s_parse.pu1_pic_pu_map;
+            ps_codec->as_process[i].pu4_pic_tu_idx = ps_codec->s_parse.pu4_pic_tu_idx;
+            ps_codec->as_process[i].ps_pic_tu = ps_codec->s_parse.ps_pic_tu;
+            ps_codec->as_process[i].pu1_pic_tu_map = ps_codec->s_parse.pu1_pic_tu_map;
+            ps_codec->as_process[i].pv_pic_tu_coeff_data = ps_codec->s_parse.pv_pic_tu_coeff_data;
+            ps_codec->as_process[i].i4_cur_mv_bank_buf_id = cur_mv_bank_buf_id;
+            ps_codec->as_process[i].s_sao_ctxt.pu1_slice_idx = ps_codec->as_process[i].pu1_slice_idx;
+            ps_codec->as_process[i].s_sao_ctxt.pu1_tile_idx = ps_codec->as_process[i].pu1_tile_idx;
+
+            /* TODO: For an asynchronous API, the following picture buffer related
+             * initializations should be moved to the processing side
+             */
+            ps_codec->as_process[i].pu1_cur_pic_luma = pu1_cur_pic_luma;
+            ps_codec->as_process[i].pu1_cur_pic_chroma = pu1_cur_pic_chroma;
+            ps_codec->as_process[i].ps_cur_pic = ps_cur_pic;
+            ps_codec->as_process[i].i4_cur_pic_buf_id = cur_pic_buf_id;
+
+            ps_codec->as_process[i].ps_out_buffer = ps_codec->ps_out_buffer;
+            if(1 < ps_codec->i4_num_cores)
+            {
+                ps_codec->as_process[i].i4_check_parse_status = 1;
+                ps_codec->as_process[i].i4_check_proc_status = 1;
+            }
+            else
+            {
+                ps_codec->as_process[i].i4_check_parse_status = 0;
+                ps_codec->as_process[i].i4_check_proc_status = 0;
+            }
+            ps_codec->as_process[i].pu1_pic_intra_flag = ps_codec->s_parse.pu1_pic_intra_flag;
+            ps_codec->as_process[i].pu1_pic_no_loop_filter_flag = ps_codec->s_parse.pu1_pic_no_loop_filter_flag;
+            ps_codec->as_process[i].i4_init_done = 0;
+
+            ps_codec->as_process[i].s_bs_ctxt.pu4_pic_tu_idx = ps_codec->as_process[i].pu4_pic_tu_idx;
+            ps_codec->as_process[i].s_bs_ctxt.pu4_pic_pu_idx = ps_codec->as_process[i].pu4_pic_pu_idx;
+            ps_codec->as_process[i].s_bs_ctxt.ps_pic_pu = ps_codec->as_process[i].ps_pic_pu;
+#ifdef GPU_BUILD
+            ps_codec->as_process[i].u4_gpu_inter_flag = ps_codec->u4_gpu_enabled;
+            ps_codec->as_process[i].s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs;
+            ps_codec->as_process[i].s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs;
+            ps_codec->as_process[i].s_bs_ctxt.pu1_pic_qp = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp;
+            ps_codec->as_process[i].s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb;
+
+            ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs;
+            ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs;
+            ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp;
+            ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb;
+            ps_codec->as_process[i].pu1_proc_map = ps_codec->apu1_proc_map[ps_codec->u4_parsing_view];
+
+            ps_codec->as_process[i].pu1_slice_idx = (UWORD16 *)ps_mv_buf->pu1_pic_slice_map;
+
+#else
+#if 0
+            //TODO GPU : Later enable pu1_proc_map for the ARM-only version as well
+            ps_codec->as_process[i].pu1_proc_map = ps_codec->pu1_proc_map;
+#endif
+#endif
+            ps_codec->as_process[i].s_deblk_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->s_parse.pu1_pic_no_loop_filter_flag;
+            ps_codec->as_process[i].s_deblk_ctxt.pu1_cur_pic_luma = pu1_cur_pic_luma;
+            ps_codec->as_process[i].s_deblk_ctxt.pu1_cur_pic_chroma = pu1_cur_pic_chroma;
+#ifdef GPU_BUILD
+            //TODO GPU : Later define it for ARM only version as well
+            ps_codec->as_process[i].s_deblk_ctxt.ps_slice_hdr_base = ps_codec->s_parse.ps_slice_hdr_base;
+#endif
+            ps_codec->as_process[i].s_sao_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->s_parse.pu1_pic_no_loop_filter_flag;
+            ps_codec->as_process[i].s_sao_ctxt.pu1_cur_pic_luma = pu1_cur_pic_luma;
+            ps_codec->as_process[i].s_sao_ctxt.pu1_cur_pic_chroma = pu1_cur_pic_chroma;
+#ifdef GPU_BUILD
+            //TODO GPU : Later define it for ARM only version as well
+            ps_codec->as_process[i].s_sao_ctxt.ps_slice_hdr_base = ps_codec->s_parse.ps_slice_hdr_base;
+            ps_codec->as_process[i].ps_slice_hdr_base = ps_codec->s_parse.ps_slice_hdr_base;
+
+            ps_codec->as_process[i].s_sao_ctxt.ps_pic_sao = ps_codec->s_parse.ps_pic_sao;
+#endif
+            if(i < (ps_codec->i4_num_cores - 1))
+            {
+                ithread_create(ps_codec->apv_process_thread_handle[i], NULL,
+                               (void *)ihevcd_process_thread,
+                               (void *)&ps_codec->as_process[i]);
+                ps_codec->ai4_process_thread_created[i] = 1;
+            }
+            else
+            {
+                ps_codec->ai4_process_thread_created[i] = 0;
+            }
+
+        }
+#ifdef GPU_BUILD
+        memset(ps_codec->apu1_proc_map[ps_codec->u4_parsing_view], 0, ps_sps->i4_pic_size_in_ctb);
+#else
+#if 0
+        //TODO GPU : Later enable this for the ARM-only version as well and remove from above
+        memset(ps_codec->pu1_proc_map, 0, ps_sps->i4_pic_size_in_ctb);
+#endif
+#endif
+        ps_codec->s_parse.s_deblk_ctxt.pu1_cur_pic_luma = pu1_cur_pic_luma;
+        ps_codec->s_parse.s_deblk_ctxt.pu1_cur_pic_chroma = pu1_cur_pic_chroma;
+
+        ps_codec->s_parse.s_sao_ctxt.pu1_cur_pic_luma = pu1_cur_pic_luma;
+        ps_codec->s_parse.s_sao_ctxt.pu1_cur_pic_chroma = pu1_cur_pic_chroma;
+    }
+    /* Any input bitstream buffer that contains slice data will be sent to the output
+     * (even in case of error): the current buffer is added to the display queue and
+     * the next buffer in the display queue is returned as the display buffer.
+     * Note: If format conversion (or frame copy) is scheduled in a different thread,
+     * it has to check that processing for a given row is complete before it copies
+     * or converts that row. In low-delay cases or for B pictures, the frame currently
+     * being decoded has to be returned, which requires a status check to ensure that
+     * the row is reconstructed before it is copied.
+     */
+    /* Add current picture to display manager */
+#ifndef GPU_BUILD
+    {
+        WORD32 abs_poc;
+        slice_header_t *ps_slice_hdr;
+        ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+        abs_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+        ihevc_disp_mgr_add((disp_mgr_t *)ps_codec->pv_disp_buf_mgr,
+                           ps_codec->as_process[0].i4_cur_pic_buf_id,
+                           abs_poc,
+                           ps_codec->as_process[0].ps_cur_pic);
+    }
+#endif
+    ps_codec->ps_disp_buf = NULL;
+    /* Get the picture to be displayed if the number of pictures decoded exceeds the max allowed reorder count */
+    /* Since the current picture will also be decoded, the check is for >= instead of > */
+#ifdef GPU_BUILD
+    //TODO OPENCL delay this by one frame
+    //TODO GPU : Should it be just +1
+    if(((WORD32)(ps_codec->u4_pic_cnt - ps_codec->u4_disp_cnt) >= (ps_sps->ai1_sps_max_num_reorder_pics[ps_sps->i1_sps_max_sub_layers - 1]+2)) ||
+#else
+    if(((WORD32)(ps_codec->u4_pic_cnt - ps_codec->u4_disp_cnt) >= ps_sps->ai1_sps_max_num_reorder_pics[ps_sps->i1_sps_max_sub_layers - 1]) ||
+#endif
+       ((WORD32)(ps_codec->u4_pic_cnt - ps_codec->u4_disp_cnt) >= ps_codec->i4_init_num_reorder))
+
+    {
+        ps_codec->ps_disp_buf = (pic_buf_t *)ihevc_disp_mgr_get((disp_mgr_t *)ps_codec->pv_disp_buf_mgr, &ps_codec->i4_disp_buf_id);
+        ps_codec->u4_disp_cnt++;
+    }
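+    /* For example, with ai1_sps_max_num_reorder_pics = 2 (and i4_init_num_reorder at
+     * least as large), the first two decoded pictures produce no output in the
+     * non-GPU path; from the third picture onwards each frame pops the next buffer
+     * in output order from the display manager. */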
+
+    ps_codec->s_fmt_conv.i4_cur_row = 0;
+    /* Set number of rows to be processed at a time */
+    ps_codec->s_fmt_conv.i4_num_rows = 4;
+
+    if(ps_codec->u4_enable_fmt_conv_ahead && (ps_codec->i4_num_cores > 1))
+    {
+        process_ctxt_t *ps_proc;
+
+        /* i4_num_cores - 1 contexts are currently being used by other threads */
+        ps_proc = &ps_codec->as_process[ps_codec->i4_num_cores - 1];
+
+        /* If the frames being decoded and displayed are different, schedule format conversion jobs;
+         * this keeps the proc threads busy and lets the parse thread decode a few CTBs ahead.
+         * If the frames being decoded and displayed are the same, format conversion is scheduled later.
+         */
+        if((ps_codec->ps_disp_buf) && (ps_codec->i4_disp_buf_id != ps_proc->i4_cur_pic_buf_id) &&
+           ((0 == ps_codec->i4_share_disp_buf) || (IV_YUV_420P == ps_codec->e_chroma_fmt)))
+        {
+
+            for(i = 0; i < ps_sps->i2_pic_ht_in_ctb; i++)
+            {
+                proc_job_t s_job;
+                IHEVCD_ERROR_T ret;
+                s_job.i4_cmd = CMD_FMTCONV;
+                s_job.i2_ctb_cnt = 0;
+                s_job.i2_ctb_x = 0;
+                s_job.i2_ctb_y = i;
+                s_job.i2_slice_idx = 0;
+                s_job.i4_tu_coeff_data_ofst = 0;
+                ret = ihevcd_jobq_queue((jobq_t *)ps_codec->s_parse.pv_proc_jobq,
+                                        &s_job, sizeof(proc_job_t), 1);
+                if(ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+                    return ret;
+            }
+        }
+    }
+
+#ifdef GPU_BUILD
+    /* Pic init for Opencl device */
+    ihevcd_gpu_mc_pic_init(ps_codec);
+#endif
+
+    return ret;
+}
+
+
diff --git a/decoder/ihevcd_utils.h b/decoder/ihevcd_utils.h
new file mode 100644
index 0000000..c2cbcc4
--- /dev/null
+++ b/decoder/ihevcd_utils.h
@@ -0,0 +1,60 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_utils.h
+*
+* @brief
+*  Contains miscellaneous utility functions such as init()
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_UTILS_H_
+#define _IHEVCD_UTILS_H_
+
+WORD32 ihevcd_get_lvl_idx(WORD32 level);
+WORD32 ihevcd_get_dpb_size(WORD32 level, WORD32 pic_size);
+WORD32 ihevcd_get_pic_mv_bank_size(WORD32 num_luma_samples);
+WORD32 ihevcd_get_tu_data_size(WORD32 num_luma_samples);
+WORD32 ihevcd_nctb_cnt(codec_t *ps_codec, sps_t *ps_sps);
+WORD32 ihevcd_get_max_luma_samples(WORD32 level);
+IHEVCD_ERROR_T ihevcd_get_tile_pos(pps_t *ps_pps,
+                                   sps_t *ps_sps,
+                                   WORD32 ctb_x,
+                                   WORD32 ctb_y,
+                                   WORD32 *pi4_ctb_tile_x,
+                                   WORD32 *pi4_ctb_tile_y,
+                                   WORD32 *pi4_tile_idx);
+IHEVCD_ERROR_T ihevcd_parse_pic_init(codec_t *ps_codec);
+WORD32 ihevcd_get_total_pic_buf_size(WORD32 pic_size,
+                                     WORD32 level,
+                                     WORD32 horz_pad,
+                                     WORD32 vert_pad,
+                                     WORD32 num_ref_frames,
+                                     WORD32 num_reorder_frames);
+#endif /* _IHEVCD_UTILS_H_ */
diff --git a/decoder/ihevcd_version.c b/decoder/ihevcd_version.c
new file mode 100644
index 0000000..a47c6fc
--- /dev/null
+++ b/decoder/ihevcd_version.c
@@ -0,0 +1,131 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_version.c
+*
+* @brief
+*  Contains version info for HEVC decoder
+*
+* @author
+*  Harish
+*
+* @par List of Functions:
+* - ihevcd_get_version()
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+/**
+ * Name of the codec
+ */
+#define CODEC_NAME              "HEVCDEC"
+/**
+ * Codec release type, production or evaluation
+ */
+#define CODEC_RELEASE_TYPE      "production"
+/**
+ * Version string. The first two digits signify the major version and the last two
+ * the minor version. Increment the major version for an API change or a major
+ * feature update.
+ */
+#define CODEC_RELEASE_VER       "04.01"
+/**
+ * Vendor name
+ */
+#define CODEC_VENDOR            "ITTIAM"
+
+/**
+*******************************************************************************
+* Concatenates various strings to form a version string
+*******************************************************************************
+*/
+#define VERSION(version_string, codec_name, codec_release_type, codec_release_ver, codec_vendor)    \
+    do                                                                                              \
+    {                                                                                               \
+        strcpy(version_string, "@(#)Id:");                                                          \
+        strcat(version_string, codec_name);                                                         \
+        strcat(version_string, "_");                                                                \
+        strcat(version_string, codec_release_type);                                                 \
+        strcat(version_string, " Ver:");                                                            \
+        strcat(version_string, codec_release_ver);                                                  \
+        strcat(version_string, " Released by ");                                                    \
+        strcat(version_string, codec_vendor);                                                       \
+        strcat(version_string, " Build: ");                                                         \
+        strcat(version_string, __DATE__);                                                           \
+        strcat(version_string, " @ ");                                                              \
+        strcat(version_string, __TIME__);                                                           \
+    } while(0)
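+
+/*
+ * With the values above, the macro produces a string of the form:
+ *   "@(#)Id:HEVCDEC_production Ver:04.01 Released by ITTIAM Build: Jun  1 2014 @ 12:00:00"
+ * (date and time shown are illustrative; they come from the compiler's __DATE__
+ * and __TIME__).
+ */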
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Fills the version info in the given string
+*
+* @par Description:
+*
+*
+* @param[in] pc_version_string
+*  Pointer to hold version info
+*
+* @param[in] u4_version_buffer_size
+*  Size of the buffer passed
+*
+* @returns  Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IV_API_CALL_STATUS_T ihevcd_get_version(CHAR *pc_version_string,
+                                        UWORD32 u4_version_buffer_size)
+{
+    CHAR ac_version_tmp[512];
+    VERSION(ac_version_tmp, CODEC_NAME, CODEC_RELEASE_TYPE, CODEC_RELEASE_VER, CODEC_VENDOR);
+
+    if(u4_version_buffer_size >= (strlen(ac_version_tmp) + 1))
+    {
+        memcpy(pc_version_string, ac_version_tmp, (strlen(ac_version_tmp) + 1));
+        return IV_SUCCESS;
+    }
+    else
+    {
+        return IV_FAIL;
+    }
+
+}
+
+
diff --git a/decoder/mips/ihevcd_function_selector.c b/decoder/mips/ihevcd_function_selector.c
new file mode 100644
index 0000000..da734d7
--- /dev/null
+++ b/decoder/mips/ihevcd_function_selector.c
@@ -0,0 +1,85 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_function_selector.c
+*
+* @brief
+*  Contains functions to initialize function pointers used in hevc
+*
+* @author
+*  Naveen
+*
+* @par List of Functions:
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_mips_generic(codec_t *ps_codec);
+void ihevcd_init_function_ptr_mips_32(codec_t *ps_codec);
+
+void ihevcd_init_function_ptr(void *pv_codec)
+{
+    codec_t *ps_codec = (codec_t *)pv_codec;
+    switch(ps_codec->e_processor_arch)
+    {
+#if ENABLE_MIPS32_SIMD
+        case ARCH_MIPS_32:
+            ihevcd_init_function_ptr_mips_32(ps_codec);
+            break;
+#endif
+        case ARCH_MIPS_GENERIC:
+        default:
+            ihevcd_init_function_ptr_mips_generic(ps_codec);
+            break;
+    }
+}
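+
+/* Note: when ENABLE_MIPS32_SIMD is 0 the ARCH_MIPS_32 case above is compiled out,
+ * so an arch value of ARCH_MIPS_32 falls through to the default branch and picks
+ * up the generic C function pointers. */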
+
+void ihevcd_init_arch(void *pv_codec)
+{
+    codec_t *ps_codec = (codec_t *)pv_codec;
+    ps_codec->e_processor_arch = ARCH_MIPS_32;
+}
diff --git a/decoder/mips/ihevcd_function_selector_mips_generic.c b/decoder/mips/ihevcd_function_selector_mips_generic.c
new file mode 100644
index 0000000..88c56f4
--- /dev/null
+++ b/decoder/mips/ihevcd_function_selector_mips_generic.c
@@ -0,0 +1,160 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_function_selector_mips_generic.c
+*
+* @brief
+*  Contains functions to initialize MIPS generic function pointers used in hevc
+*
+* @author
+*  Naveen
+*
+* @par List of Functions:
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_mips_generic(codec_t *ps_codec)
+{
+    ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr                      =  &ihevc_deblk_chroma_horz;
+    ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr                      =  &ihevc_deblk_chroma_vert;
+    ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr                        =  &ihevc_deblk_luma_vert;
+    ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr                        =  &ihevc_deblk_luma_horz;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr                 =  &ihevc_inter_pred_chroma_copy;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr          =  &ihevc_inter_pred_chroma_copy_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr                 =  &ihevc_inter_pred_chroma_horz;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr          =  &ihevc_inter_pred_chroma_horz_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr                 =  &ihevc_inter_pred_chroma_vert;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr          =  &ihevc_inter_pred_chroma_vert_w16inp;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr   =  &ihevc_inter_pred_chroma_vert_w16inp_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr          =  &ihevc_inter_pred_chroma_vert_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr                   =  &ihevc_inter_pred_luma_horz;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr                   =  &ihevc_inter_pred_luma_vert;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr            =  &ihevc_inter_pred_luma_vert_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr            =  &ihevc_inter_pred_luma_vert_w16inp;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr                   =  &ihevc_inter_pred_luma_copy;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr            =  &ihevc_inter_pred_luma_copy_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr            =  &ihevc_inter_pred_luma_horz_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr     =  &ihevc_inter_pred_luma_vert_w16inp_w16out;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr     =  &ihevc_intra_pred_chroma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr       =  &ihevc_intra_pred_luma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr   =  &ihevc_intra_pred_luma_ref_subst_all_avlble;
+    ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr               =  &ihevc_intra_pred_ref_filtering;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr                   =  &ihevc_intra_pred_chroma_dc;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr                 =  &ihevc_intra_pred_chroma_horz;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr                =  &ihevc_intra_pred_chroma_mode2;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr           =  &ihevc_intra_pred_chroma_mode_18_34;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr        =  &ihevc_intra_pred_chroma_mode_27_to_33;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr          =  &ihevc_intra_pred_chroma_mode_3_to_9;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr               =  &ihevc_intra_pred_chroma_planar;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr                  =  &ihevc_intra_pred_chroma_ver;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr        =  &ihevc_intra_pred_chroma_mode_11_to_17;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr        =  &ihevc_intra_pred_chroma_mode_19_to_25;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr          =  &ihevc_intra_pred_luma_mode_11_to_17;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr          =  &ihevc_intra_pred_luma_mode_19_to_25;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr                     =  &ihevc_intra_pred_luma_dc;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr                   =  &ihevc_intra_pred_luma_horz;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr                  =  &ihevc_intra_pred_luma_mode2;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr             =  &ihevc_intra_pred_luma_mode_18_34;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr          =  &ihevc_intra_pred_luma_mode_27_to_33;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr            =  &ihevc_intra_pred_luma_mode_3_to_9;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr                 =  &ihevc_intra_pred_luma_planar;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr                    =  &ihevc_intra_pred_luma_ver;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr                      =  &ihevc_itrans_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_fptr                             =  &ihevc_itrans_4x4;
+    ps_codec->s_func_selector.ihevc_itrans_8x8_fptr                             =  &ihevc_itrans_8x8;
+    ps_codec->s_func_selector.ihevc_itrans_16x16_fptr                           =  &ihevc_itrans_16x16;
+    ps_codec->s_func_selector.ihevc_itrans_32x32_fptr                           =  &ihevc_itrans_32x32;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr                =  &ihevc_itrans_recon_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr                       =  &ihevc_itrans_recon_4x4;
+    ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr                       =  &ihevc_itrans_recon_8x8;
+    ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr                     =  &ihevc_itrans_recon_16x16;
+    ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr                     =  &ihevc_itrans_recon_32x32;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr                =  &ihevc_chroma_itrans_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr                =  &ihevc_chroma_itrans_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr              =  &ihevc_chroma_itrans_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr                       =  &ihevc_recon_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_recon_4x4_fptr                              =  &ihevc_recon_4x4;
+    ps_codec->s_func_selector.ihevc_recon_8x8_fptr                              =  &ihevc_recon_8x8;
+    ps_codec->s_func_selector.ihevc_recon_16x16_fptr                            =  &ihevc_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_32x32_fptr                            =  &ihevc_recon_32x32;
+    ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr                       =  &ihevc_chroma_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr                       =  &ihevc_chroma_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr                     =  &ihevc_chroma_recon_16x16;
+    ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr                           =  &ihevc_memcpy_mul_8;
+    ps_codec->s_func_selector.ihevc_memcpy_fptr                                 =  &ihevc_memcpy;
+    ps_codec->s_func_selector.ihevc_memset_mul_8_fptr                           =  &ihevc_memset_mul_8;
+    ps_codec->s_func_selector.ihevc_memset_fptr                                 =  &ihevc_memset;
+    ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr                     =  &ihevc_memset_16bit_mul_8;
+    ps_codec->s_func_selector.ihevc_memset_16bit_fptr                           =  &ihevc_memset_16bit;
+    ps_codec->s_func_selector.ihevc_pad_left_luma_fptr                          =  &ihevc_pad_left_luma;
+    ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr                        =  &ihevc_pad_left_chroma;
+    ps_codec->s_func_selector.ihevc_pad_right_luma_fptr                         =  &ihevc_pad_right_luma;
+    ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr                       =  &ihevc_pad_right_chroma;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr                       =  &ihevc_weighted_pred_bi;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr               =  &ihevc_weighted_pred_bi_default;
+    ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr                      =  &ihevc_weighted_pred_uni;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr                =  &ihevc_weighted_pred_chroma_bi;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr        =  &ihevc_weighted_pred_chroma_bi_default;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr               =  &ihevc_weighted_pred_chroma_uni;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr                   =  &ihevc_sao_band_offset_luma;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr                 =  &ihevc_sao_band_offset_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr                 =  &ihevc_sao_edge_offset_class0;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr          =  &ihevc_sao_edge_offset_class0_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr                 =  &ihevc_sao_edge_offset_class1;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr          =  &ihevc_sao_edge_offset_class1_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr                 =  &ihevc_sao_edge_offset_class2;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr          =  &ihevc_sao_edge_offset_class2_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr                 =  &ihevc_sao_edge_offset_class3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr          =  &ihevc_sao_edge_offset_class3_chroma;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr            =  &ihevcd_fmt_conv_420sp_to_rgba8888;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr              =  &ihevcd_fmt_conv_420sp_to_rgb565;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr               =  &ihevcd_fmt_conv_420sp_to_420sp;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr                =  &ihevcd_fmt_conv_420sp_to_420p;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr                  =  &ihevcd_itrans_recon_dc_luma;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr                =  &ihevcd_itrans_recon_dc_chroma;
+}
diff --git a/decoder/x86/ihevcd_fmt_conv_ssse3_intr.c b/decoder/x86/ihevcd_fmt_conv_ssse3_intr.c
new file mode 100644
index 0000000..f963e66
--- /dev/null
+++ b/decoder/x86/ihevcd_fmt_conv_ssse3_intr.c
@@ -0,0 +1,270 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_fmt_conv_ssse3_intr.c
+*
+* @brief
+*  Platform specific intrinsic implementation of certain functions
+*
+* @author
+*  Ittiam
+* @par List of Functions:
+*  - ihevcd_fmt_conv_420sp_to_420p_ssse3
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+#include "string.h"
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevcd_function_selector.h"
+#include <string.h>
+#include <immintrin.h>
+
+
+void ihevcd_fmt_conv_420sp_to_420p_ssse3(UWORD8 *pu1_y_src,
+                                         UWORD8 *pu1_uv_src,
+                                         UWORD8 *pu1_y_dst,
+                                         UWORD8 *pu1_u_dst,
+                                         UWORD8 *pu1_v_dst,
+                                         WORD32 wd,
+                                         WORD32 ht,
+                                         WORD32 src_y_strd,
+                                         WORD32 src_uv_strd,
+                                         WORD32 dst_y_strd,
+                                         WORD32 dst_uv_strd,
+                                         WORD32 is_u_first,
+                                         WORD32 disable_luma_copy)
+{
+    UWORD8 *pu1_src, *pu1_dst;
+    UWORD8 *pu1_u_src, *pu1_v_src;
+    WORD32 num_rows, num_cols, src_strd, dst_strd, cols, rows;
+    WORD32 i, j;
+
+    cols = 0;
+    pu1_u_src = (UWORD8 *)pu1_uv_src;
+    pu1_v_src = (UWORD8 *)pu1_uv_src + 1;
+    if(0 == disable_luma_copy)
+    {
+        /* copy luma */
+        pu1_src = (UWORD8 *)pu1_y_src;
+        pu1_dst = (UWORD8 *)pu1_y_dst;
+
+        num_rows = ht;
+        num_cols = wd;
+
+        src_strd = src_y_strd;
+        dst_strd = dst_y_strd;
+        for(i = 0; i < num_rows; i++)
+        {
+            memcpy(pu1_dst, pu1_src, num_cols);
+            pu1_dst += dst_strd;
+            pu1_src += src_strd;
+        }
+    }
+
+    /* de-interleave U and V and copy to destination */
+    if(!is_u_first)
+    {
+        UWORD8 *temp = pu1_u_dst;
+        pu1_u_dst = pu1_v_dst;
+        pu1_v_dst = temp;
+
+        pu1_u_src = (UWORD8 *)pu1_uv_src + 1;
+        pu1_v_src = (UWORD8 *)pu1_uv_src;
+    }
+
+    {
+        __m128i src_uv0_8x16b, src_uv1_8x16b, src_u_8x16b, src_v_8x16b;
+        __m128i temp0_8x16b, temp1_8x16b, alt_first_mask;
+
+        UWORD8 FIRST_ALT_SHUFFLE[16] = {
+            0x00, 0x02, 0x04, 0x06,
+            0x08, 0x0A, 0x0C, 0x0E,
+            0x01, 0x03, 0x05, 0x07,
+            0x09, 0x0B, 0x0D, 0x0F };
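+
+        /* alt_first_mask gathers the even bytes (U) into the low 64 bits and the odd
+         * bytes (V) into the high 64 bits of each 16-byte vector, i.e.
+         *   [U0 V0 U1 V1 ... U7 V7] -> [U0..U7 | V0..V7]
+         * so the unpacklo/unpackhi_epi64 pairs below produce 16 contiguous U samples
+         * and 16 contiguous V samples from two shuffled vectors. */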
+
+        PREFETCH((char const *)(pu1_uv_src + (0 * src_uv_strd)), _MM_HINT_T0)
+        PREFETCH((char const *)(pu1_uv_src + (1 * src_uv_strd)), _MM_HINT_T0)
+        PREFETCH((char const *)(pu1_uv_src + (2 * src_uv_strd)), _MM_HINT_T0)
+        PREFETCH((char const *)(pu1_uv_src + (3 * src_uv_strd)), _MM_HINT_T0)
+        PREFETCH((char const *)(pu1_uv_src + (4 * src_uv_strd)), _MM_HINT_T0)
+        PREFETCH((char const *)(pu1_uv_src + (5 * src_uv_strd)), _MM_HINT_T0)
+        PREFETCH((char const *)(pu1_uv_src + (6 * src_uv_strd)), _MM_HINT_T0)
+        PREFETCH((char const *)(pu1_uv_src + (7 * src_uv_strd)), _MM_HINT_T0)
+
+        num_rows = ht >> 1;
+        num_cols = wd >> 1;
+
+        src_strd = src_uv_strd;
+        dst_strd = dst_uv_strd;
+
+        alt_first_mask = _mm_loadu_si128((__m128i *)&FIRST_ALT_SHUFFLE[0]);
+
+        if(num_cols > 15)
+        {
+            cols = num_cols >> 4;
+
+            for(i = 0; i < (num_rows >> 2); i++)
+            {
+                UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp;
+
+                PREFETCH((char const *)(pu1_uv_src + (8 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_uv_src + (9 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_uv_src + (10 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_uv_src + (11 * src_strd)), _MM_HINT_T0)
+
+                pu1_uv_src_temp = pu1_uv_src;
+                pu1_u_dst_temp =  pu1_u_dst;
+                pu1_v_dst_temp =  pu1_v_dst;
+
+                for(j = 0; j < cols; j++)
+                {
+
+                    /**** Row 0 ***/
+                    src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp);
+                    src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16));
+
+                    temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
+                    temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
+
+                    src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
+                    src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
+
+                    _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b);
+                    _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b);
+
+                    /**** Row 1 ***/
+                    src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd)));
+                    src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd) + 16));
+
+                    temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
+                    temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
+
+                    src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
+                    src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
+
+                    _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (1 * dst_strd)), src_u_8x16b);
+                    _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (1 * dst_strd)), src_v_8x16b);
+
+                    /**** Row 2 ***/
+                    src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd)));
+                    src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd) + 16));
+
+                    temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
+                    temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
+
+                    src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
+                    src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
+
+                    _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (2 * dst_strd)), src_u_8x16b);
+                    _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (2 * dst_strd)), src_v_8x16b);
+
+                    /**** Row 3 ***/
+                    src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd)));
+                    src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd) + 16));
+
+                    temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
+                    temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
+
+                    src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
+                    src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
+
+                    _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (3 * dst_strd)), src_u_8x16b);
+                    _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (3 * dst_strd)), src_v_8x16b);
+
+                    pu1_u_dst_temp += 16;
+                    pu1_v_dst_temp += 16;
+                    pu1_uv_src_temp += 32;
+                }
+
+                pu1_u_dst += 4 * dst_strd;
+                pu1_v_dst += 4 * dst_strd;
+                pu1_uv_src += 4 * src_strd;
+            }
+            rows = num_rows & 0x3;
+            if(rows)
+            {
+                for(i = 0; i < rows; i++)
+                {
+                    UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp;
+
+                    pu1_uv_src_temp = pu1_uv_src;
+                    pu1_u_dst_temp =  pu1_u_dst;
+                    pu1_v_dst_temp =  pu1_v_dst;
+
+                    for(j = 0; j < cols; j++)
+                    {
+
+                        src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp);
+                        src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16));
+
+                        temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
+                        temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
+
+                        src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
+                        src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
+
+                        _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b);
+                        _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b);
+
+                        pu1_u_dst_temp += 16;
+                        pu1_v_dst_temp += 16;
+                        pu1_uv_src_temp += 32;
+                    }
+
+                    pu1_u_dst += dst_strd;
+                    pu1_v_dst += dst_strd;
+                    pu1_uv_src += src_strd;
+                }
+            }
+            pu1_u_dst -= (num_rows * dst_strd);
+            pu1_v_dst -= (num_rows * dst_strd);
+            num_cols &= 0x0F;
+        }
+        if(num_cols)
+        {
+            pu1_u_dst += (cols << 4);
+            pu1_v_dst += (cols << 4);
+            pu1_u_src += 2 * (cols << 4);
+            pu1_v_src += 2 * (cols << 4);
+            for(i = 0; i < num_rows; i++)
+            {
+                for(j = 0; j < num_cols; j++)
+                {
+                    pu1_u_dst[j] = pu1_u_src[j * 2];
+                    pu1_v_dst[j] = pu1_v_src[j * 2];
+                }
+
+                pu1_u_dst += dst_strd;
+                pu1_v_dst += dst_strd;
+                pu1_u_src += src_strd;
+                pu1_v_src += src_strd;
+            }
+        }
+    }
+    return;
+}
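+
+/* Row counts that are not a multiple of 4 are handled by the trailing one-row SIMD
+ * loop, and widths that are not a multiple of 16 chroma samples fall through to the
+ * scalar de-interleave loop, so the routine supports general picture sizes. */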
diff --git a/decoder/x86/ihevcd_function_selector.c b/decoder/x86/ihevcd_function_selector.c
new file mode 100644
index 0000000..b058a62
--- /dev/null
+++ b/decoder/x86/ihevcd_function_selector.c
@@ -0,0 +1,105 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_function_selector.c
+*
+* @brief
+*  Contains functions to initialize function pointers used in hevc
+*
+* @author
+*  Naveen
+*
+* @par List of Functions:
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr(void *pv_codec)
+{
+    codec_t *ps_codec = (codec_t *)pv_codec;
+    switch(ps_codec->e_processor_arch)
+    {
+        case ARCH_X86_GENERIC:
+            ihevcd_init_function_ptr_generic(pv_codec);
+            break;
+        case ARCH_X86_SSSE3:
+            ihevcd_init_function_ptr_ssse3(pv_codec);
+            break;
+        case ARCH_X86_SSE42:
+            ihevcd_init_function_ptr_sse42(pv_codec);
+            break;
+        case ARCH_X86_AVX2:
+#ifndef DISABLE_AVX2
+            ihevcd_init_function_ptr_avx2(pv_codec);
+#else
+            ihevcd_init_function_ptr_sse42(pv_codec);
+#endif
+            break;
+        default:
+            ihevcd_init_function_ptr_ssse3(pv_codec);
+            break;
+    }
+}
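+
+/* When the library is built with DISABLE_AVX2, an ARCH_X86_AVX2 request falls back
+ * to the SSE4.2 function pointers; unrecognized arch values default to SSSE3. */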
+
+void ihevcd_init_arch(void *pv_codec)
+{
+    codec_t *ps_codec = (codec_t *)pv_codec;
+
+#ifdef DEFAULT_ARCH
+#if DEFAULT_ARCH == D_ARCH_X86_GENERIC
+    ps_codec->e_processor_arch = ARCH_X86_GENERIC;
+#elif DEFAULT_ARCH == D_ARCH_X86_SSE42
+    ps_codec->e_processor_arch = ARCH_X86_SSE42;
+#elif DEFAULT_ARCH == D_ARCH_X86_AVX2
+    ps_codec->e_processor_arch = ARCH_X86_AVX2;
+#else
+    ps_codec->e_processor_arch = ARCH_X86_SSSE3;
+#endif
+#else
+    ps_codec->e_processor_arch = ARCH_X86_SSSE3;
+#endif
+}
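+
+/* DEFAULT_ARCH can be pinned at build time, e.g. via a compiler flag such as
+ * -DDEFAULT_ARCH=D_ARCH_X86_SSE42 (illustrative invocation); when it is not
+ * defined, the decoder assumes SSSE3 support. */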
diff --git a/decoder/x86/ihevcd_function_selector_generic.c b/decoder/x86/ihevcd_function_selector_generic.c
new file mode 100644
index 0000000..f8b53ad
--- /dev/null
+++ b/decoder/x86/ihevcd_function_selector_generic.c
@@ -0,0 +1,162 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_function_selector_generic.c
+*
+* @brief
+*  Contains functions to initialize the function pointers used in the HEVC decoder
+*
+* @author
+*  Naveen
+*
+* @par List of Functions: ihevcd_init_function_ptr_generic
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_generic(void *pv_codec)
+{
+    codec_t *ps_codec = (codec_t *)pv_codec;
+
+    ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr                      =  &ihevc_deblk_chroma_horz;
+    ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr                      =  &ihevc_deblk_chroma_vert;
+    ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr                        =  &ihevc_deblk_luma_vert;
+    ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr                        =  &ihevc_deblk_luma_horz;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr                 =  &ihevc_inter_pred_chroma_copy;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr          =  &ihevc_inter_pred_chroma_copy_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr                 =  &ihevc_inter_pred_chroma_horz;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr          =  &ihevc_inter_pred_chroma_horz_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr                 =  &ihevc_inter_pred_chroma_vert;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr          =  &ihevc_inter_pred_chroma_vert_w16inp;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr   =  &ihevc_inter_pred_chroma_vert_w16inp_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr          =  &ihevc_inter_pred_chroma_vert_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr                   =  &ihevc_inter_pred_luma_horz;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr                   =  &ihevc_inter_pred_luma_vert;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr            =  &ihevc_inter_pred_luma_vert_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr            =  &ihevc_inter_pred_luma_vert_w16inp;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr                   =  &ihevc_inter_pred_luma_copy;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr            =  &ihevc_inter_pred_luma_copy_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr            =  &ihevc_inter_pred_luma_horz_w16out;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr     =  &ihevc_inter_pred_luma_vert_w16inp_w16out;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr     =  &ihevc_intra_pred_chroma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr       =  &ihevc_intra_pred_luma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr   =  &ihevc_intra_pred_luma_ref_subst_all_avlble;
+    ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr               =  &ihevc_intra_pred_ref_filtering;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr                   =  &ihevc_intra_pred_chroma_dc;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr                 =  &ihevc_intra_pred_chroma_horz;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr                =  &ihevc_intra_pred_chroma_mode2;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr           =  &ihevc_intra_pred_chroma_mode_18_34;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr        =  &ihevc_intra_pred_chroma_mode_27_to_33;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr          =  &ihevc_intra_pred_chroma_mode_3_to_9;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr               =  &ihevc_intra_pred_chroma_planar;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr                  =  &ihevc_intra_pred_chroma_ver;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr        =  &ihevc_intra_pred_chroma_mode_11_to_17;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr        =  &ihevc_intra_pred_chroma_mode_19_to_25;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr          =  &ihevc_intra_pred_luma_mode_11_to_17;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr          =  &ihevc_intra_pred_luma_mode_19_to_25;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr                     =  &ihevc_intra_pred_luma_dc;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr                   =  &ihevc_intra_pred_luma_horz;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr                  =  &ihevc_intra_pred_luma_mode2;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr             =  &ihevc_intra_pred_luma_mode_18_34;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr          =  &ihevc_intra_pred_luma_mode_27_to_33;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr            =  &ihevc_intra_pred_luma_mode_3_to_9;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr                 =  &ihevc_intra_pred_luma_planar;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr                    =  &ihevc_intra_pred_luma_ver;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr                      =  &ihevc_itrans_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_fptr                             =  &ihevc_itrans_4x4;
+    ps_codec->s_func_selector.ihevc_itrans_8x8_fptr                             =  &ihevc_itrans_8x8;
+    ps_codec->s_func_selector.ihevc_itrans_16x16_fptr                           =  &ihevc_itrans_16x16;
+    ps_codec->s_func_selector.ihevc_itrans_32x32_fptr                           =  &ihevc_itrans_32x32;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr                =  &ihevc_itrans_recon_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr                       =  &ihevc_itrans_recon_4x4;
+    ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr                       =  &ihevc_itrans_recon_8x8;
+    ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr                     =  &ihevc_itrans_recon_16x16;
+    ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr                     =  &ihevc_itrans_recon_32x32;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr                =  &ihevc_chroma_itrans_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr                =  &ihevc_chroma_itrans_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr              =  &ihevc_chroma_itrans_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr                       =  &ihevc_recon_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_recon_4x4_fptr                              =  &ihevc_recon_4x4;
+    ps_codec->s_func_selector.ihevc_recon_8x8_fptr                              =  &ihevc_recon_8x8;
+    ps_codec->s_func_selector.ihevc_recon_16x16_fptr                            =  &ihevc_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_32x32_fptr                            =  &ihevc_recon_32x32;
+    ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr                       =  &ihevc_chroma_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr                       =  &ihevc_chroma_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr                     =  &ihevc_chroma_recon_16x16;
+    ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr                           =  &ihevc_memcpy_mul_8;
+    ps_codec->s_func_selector.ihevc_memcpy_fptr                                 =  &ihevc_memcpy;
+    ps_codec->s_func_selector.ihevc_memset_mul_8_fptr                           =  &ihevc_memset_mul_8;
+    ps_codec->s_func_selector.ihevc_memset_fptr                                 =  &ihevc_memset;
+    ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr                     =  &ihevc_memset_16bit_mul_8;
+    ps_codec->s_func_selector.ihevc_memset_16bit_fptr                           =  &ihevc_memset_16bit;
+    ps_codec->s_func_selector.ihevc_pad_left_luma_fptr                          =  &ihevc_pad_left_luma;
+    ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr                        =  &ihevc_pad_left_chroma;
+    ps_codec->s_func_selector.ihevc_pad_right_luma_fptr                         =  &ihevc_pad_right_luma;
+    ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr                       =  &ihevc_pad_right_chroma;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr                       =  &ihevc_weighted_pred_bi;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr               =  &ihevc_weighted_pred_bi_default;
+    ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr                      =  &ihevc_weighted_pred_uni;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr                =  &ihevc_weighted_pred_chroma_bi;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr        =  &ihevc_weighted_pred_chroma_bi_default;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr               =  &ihevc_weighted_pred_chroma_uni;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr                   =  &ihevc_sao_band_offset_luma;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr                 =  &ihevc_sao_band_offset_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr                 =  &ihevc_sao_edge_offset_class0;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr          =  &ihevc_sao_edge_offset_class0_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr                 =  &ihevc_sao_edge_offset_class1;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr          =  &ihevc_sao_edge_offset_class1_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr                 =  &ihevc_sao_edge_offset_class2;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr          =  &ihevc_sao_edge_offset_class2_chroma;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr                 =  &ihevc_sao_edge_offset_class3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr          =  &ihevc_sao_edge_offset_class3_chroma;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr            =  &ihevcd_fmt_conv_420sp_to_rgba8888;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr              =  &ihevcd_fmt_conv_420sp_to_rgb565;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr               =  &ihevcd_fmt_conv_420sp_to_420sp;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr                =  &ihevcd_fmt_conv_420sp_to_420p;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr                  =  &ihevcd_itrans_recon_dc_luma;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr                =  &ihevcd_itrans_recon_dc_chroma;
+}
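
Once a table is bound, the decoder's hot paths call through s_func_selector instead of naming an implementation, so the same call site runs either the generic C routines above or a SIMD variant from the tables below. A sketch of the call pattern, reusing the ihevcd_itrans_recon_dc_luma signature defined later in this patch; the surrounding variable names are illustrative, not taken from the source:

/* done once, after ihevcd_init_arch() has set e_processor_arch */
ihevcd_init_function_ptr(ps_codec);

/* later, a DC-only luma TU is reconstructed through the table;
 * prediction/destination pointers, strides, block size and the
 * DC coefficient come from the decode context: */
ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr(
        pu1_pred, pu1_dst, pred_strd, dst_strd,
        log2_trans_size, i2_dc_coeff);
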
diff --git a/decoder/x86/ihevcd_function_selector_sse42.c b/decoder/x86/ihevcd_function_selector_sse42.c
new file mode 100644
index 0000000..fe46cc4
--- /dev/null
+++ b/decoder/x86/ihevcd_function_selector_sse42.c
@@ -0,0 +1,162 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_function_selector_sse42.c
+*
+* @brief
+*  Contains functions to initialize the function pointers used in the HEVC decoder
+*
+* @author
+*  Naveen
+*
+* @par List of Functions: ihevcd_init_function_ptr_sse42
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_sse42(void *pv_codec)
+{
+    codec_t *ps_codec = (codec_t *)pv_codec;
+
+    ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr                      =  &ihevc_deblk_chroma_horz_ssse3;
+    ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr                      =  &ihevc_deblk_chroma_vert_ssse3;
+    ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr                        =  &ihevc_deblk_luma_vert_ssse3;
+    ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr                        =  &ihevc_deblk_luma_horz_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr                 =  &ihevc_inter_pred_chroma_copy_sse42;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr          =  &ihevc_inter_pred_chroma_copy_w16out_sse42;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr                 =  &ihevc_inter_pred_chroma_horz_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr          =  &ihevc_inter_pred_chroma_horz_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr                 =  &ihevc_inter_pred_chroma_vert_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr          =  &ihevc_inter_pred_chroma_vert_w16inp_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr   =  &ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr          =  &ihevc_inter_pred_chroma_vert_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr                   =  &ihevc_inter_pred_luma_horz_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr                   =  &ihevc_inter_pred_luma_vert_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr            =  &ihevc_inter_pred_luma_vert_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr            =  &ihevc_inter_pred_luma_vert_w16inp_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr                   =  &ihevc_inter_pred_luma_copy_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr            =  &ihevc_inter_pred_luma_copy_w16out_sse42;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr            =  &ihevc_inter_pred_luma_horz_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr     =  &ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr     =  &ihevc_intra_pred_chroma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr       =  &ihevc_intra_pred_luma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr   =  &ihevc_intra_pred_luma_ref_subst_all_avlble;
+    ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr               =  &ihevc_intra_pred_ref_filtering_sse42;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr                   =  &ihevc_intra_pred_chroma_dc_sse42;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr                 =  &ihevc_intra_pred_chroma_horz_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr                =  &ihevc_intra_pred_chroma_mode2_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr           =  &ihevc_intra_pred_chroma_mode_18_34_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr        =  &ihevc_intra_pred_chroma_mode_27_to_33_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr          =  &ihevc_intra_pred_chroma_mode_3_to_9_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr               =  &ihevc_intra_pred_chroma_planar_sse42;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr                  =  &ihevc_intra_pred_chroma_ver_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr        =  &ihevc_intra_pred_chroma_mode_11_to_17_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr        =  &ihevc_intra_pred_chroma_mode_19_to_25_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr          =  &ihevc_intra_pred_luma_mode_11_to_17_sse42;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr          =  &ihevc_intra_pred_luma_mode_19_to_25_sse42;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr                     =  &ihevc_intra_pred_luma_dc_sse42;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr                   =  &ihevc_intra_pred_luma_horz_sse42;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr                  =  &ihevc_intra_pred_luma_mode2_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr             =  &ihevc_intra_pred_luma_mode_18_34_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr          =  &ihevc_intra_pred_luma_mode_27_to_33_sse42;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr            =  &ihevc_intra_pred_luma_mode_3_to_9_sse42;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr                 =  &ihevc_intra_pred_luma_planar_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr                    =  &ihevc_intra_pred_luma_ver_sse42;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr                      =  &ihevc_itrans_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_fptr                             =  &ihevc_itrans_4x4;
+    ps_codec->s_func_selector.ihevc_itrans_8x8_fptr                             =  &ihevc_itrans_8x8;
+    ps_codec->s_func_selector.ihevc_itrans_16x16_fptr                           =  &ihevc_itrans_16x16;
+    ps_codec->s_func_selector.ihevc_itrans_32x32_fptr                           =  &ihevc_itrans_32x32;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr                =  &ihevc_itrans_recon_4x4_ttype1_sse42;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr                       =  &ihevc_itrans_recon_4x4_sse42;
+    ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr                       =  &ihevc_itrans_recon_8x8_sse42;
+    ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr                     =  &ihevc_itrans_recon_16x16_ssse3;
+    ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr                     =  &ihevc_itrans_recon_32x32_sse42;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr                =  &ihevc_chroma_itrans_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr                =  &ihevc_chroma_itrans_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr              =  &ihevc_chroma_itrans_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr                       =  &ihevc_recon_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_recon_4x4_fptr                              =  &ihevc_recon_4x4;
+    ps_codec->s_func_selector.ihevc_recon_8x8_fptr                              =  &ihevc_recon_8x8;
+    ps_codec->s_func_selector.ihevc_recon_16x16_fptr                            =  &ihevc_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_32x32_fptr                            =  &ihevc_recon_32x32;
+    ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr                       =  &ihevc_chroma_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr                       =  &ihevc_chroma_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr                     =  &ihevc_chroma_recon_16x16;
+    ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr                           =  &ihevc_memcpy_mul_8;
+    ps_codec->s_func_selector.ihevc_memcpy_fptr                                 =  &ihevc_memcpy;
+    ps_codec->s_func_selector.ihevc_memset_mul_8_fptr                           =  &ihevc_memset_mul_8;
+    ps_codec->s_func_selector.ihevc_memset_fptr                                 =  &ihevc_memset;
+    ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr                     =  &ihevc_memset_16bit_mul_8;
+    ps_codec->s_func_selector.ihevc_memset_16bit_fptr                           =  &ihevc_memset_16bit;
+    ps_codec->s_func_selector.ihevc_pad_left_luma_fptr                          =  &ihevc_pad_left_luma;
+    ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr                        =  &ihevc_pad_left_chroma;
+    ps_codec->s_func_selector.ihevc_pad_right_luma_fptr                         =  &ihevc_pad_right_luma;
+    ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr                       =  &ihevc_pad_right_chroma;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr                       =  &ihevc_weighted_pred_bi_sse42;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr               =  &ihevc_weighted_pred_bi_default_sse42;
+    ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr                      =  &ihevc_weighted_pred_uni_sse42;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr                =  &ihevc_weighted_pred_chroma_bi_sse42;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr        =  &ihevc_weighted_pred_chroma_bi_default_ssse3;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr               =  &ihevc_weighted_pred_chroma_uni_sse42;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr                   =  &ihevc_sao_band_offset_luma_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr                 =  &ihevc_sao_band_offset_chroma_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr                 =  &ihevc_sao_edge_offset_class0_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr          =  &ihevc_sao_edge_offset_class0_chroma_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr                 =  &ihevc_sao_edge_offset_class1_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr          =  &ihevc_sao_edge_offset_class1_chroma_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr                 =  &ihevc_sao_edge_offset_class2_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr          =  &ihevc_sao_edge_offset_class2_chroma_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr                 =  &ihevc_sao_edge_offset_class3_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr          =  &ihevc_sao_edge_offset_class3_chroma_ssse3;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr            =  &ihevcd_fmt_conv_420sp_to_rgba8888;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr              =  &ihevcd_fmt_conv_420sp_to_rgb565;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr               =  &ihevcd_fmt_conv_420sp_to_420sp;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr                =  &ihevcd_fmt_conv_420sp_to_420p_ssse3;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr                  =  &ihevcd_itrans_recon_dc_luma_sse42;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr                =  &ihevcd_itrans_recon_dc_chroma_sse42;
+}
diff --git a/decoder/x86/ihevcd_function_selector_ssse3.c b/decoder/x86/ihevcd_function_selector_ssse3.c
new file mode 100644
index 0000000..fdb471a
--- /dev/null
+++ b/decoder/x86/ihevcd_function_selector_ssse3.c
@@ -0,0 +1,162 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_function_selector_ssse3.c
+*
+* @brief
+*  Contains functions to initialize the function pointers used in the HEVC decoder
+*
+* @author
+*  Naveen
+*
+* @par List of Functions: ihevcd_init_function_ptr_ssse3
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_ssse3(void *pv_codec)
+{
+    codec_t *ps_codec = (codec_t *)pv_codec;
+
+    ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr                      =  &ihevc_deblk_chroma_horz_ssse3;
+    ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr                      =  &ihevc_deblk_chroma_vert_ssse3;
+    ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr                        =  &ihevc_deblk_luma_vert_ssse3;
+    ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr                        =  &ihevc_deblk_luma_horz_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr                 =  &ihevc_inter_pred_chroma_copy_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr          =  &ihevc_inter_pred_chroma_copy_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr                 =  &ihevc_inter_pred_chroma_horz_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr          =  &ihevc_inter_pred_chroma_horz_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr                 =  &ihevc_inter_pred_chroma_vert_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr          =  &ihevc_inter_pred_chroma_vert_w16inp_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr   =  &ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr          =  &ihevc_inter_pred_chroma_vert_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr                   =  &ihevc_inter_pred_luma_horz_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr                   =  &ihevc_inter_pred_luma_vert_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr            =  &ihevc_inter_pred_luma_vert_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr            =  &ihevc_inter_pred_luma_vert_w16inp_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr                   =  &ihevc_inter_pred_luma_copy_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr            =  &ihevc_inter_pred_luma_copy_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr            =  &ihevc_inter_pred_luma_horz_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr     =  &ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr     =  &ihevc_intra_pred_chroma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr       =  &ihevc_intra_pred_luma_ref_substitution;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr   =  &ihevc_intra_pred_luma_ref_subst_all_avlble;
+    ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr               =  &ihevc_intra_pred_ref_filtering_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr                   =  &ihevc_intra_pred_chroma_dc_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr                 =  &ihevc_intra_pred_chroma_horz_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr                =  &ihevc_intra_pred_chroma_mode2_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr           =  &ihevc_intra_pred_chroma_mode_18_34_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr        =  &ihevc_intra_pred_chroma_mode_27_to_33_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr          =  &ihevc_intra_pred_chroma_mode_3_to_9_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr               =  &ihevc_intra_pred_chroma_planar_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr                  =  &ihevc_intra_pred_chroma_ver_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr        =  &ihevc_intra_pred_chroma_mode_11_to_17_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr        =  &ihevc_intra_pred_chroma_mode_19_to_25_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr          =  &ihevc_intra_pred_luma_mode_11_to_17_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr          =  &ihevc_intra_pred_luma_mode_19_to_25_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr                     =  &ihevc_intra_pred_luma_dc_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr                   =  &ihevc_intra_pred_luma_horz_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr                  =  &ihevc_intra_pred_luma_mode2_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr             =  &ihevc_intra_pred_luma_mode_18_34_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr          =  &ihevc_intra_pred_luma_mode_27_to_33_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr            =  &ihevc_intra_pred_luma_mode_3_to_9_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr                 =  &ihevc_intra_pred_luma_planar_ssse3;
+    ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr                    =  &ihevc_intra_pred_luma_ver_ssse3;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr                      =  &ihevc_itrans_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_itrans_4x4_fptr                             =  &ihevc_itrans_4x4;
+    ps_codec->s_func_selector.ihevc_itrans_8x8_fptr                             =  &ihevc_itrans_8x8;
+    ps_codec->s_func_selector.ihevc_itrans_16x16_fptr                           =  &ihevc_itrans_16x16;
+    ps_codec->s_func_selector.ihevc_itrans_32x32_fptr                           =  &ihevc_itrans_32x32;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr                =  &ihevc_itrans_recon_4x4_ttype1_ssse3;
+    ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr                       =  &ihevc_itrans_recon_4x4_ssse3;
+    ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr                       =  &ihevc_itrans_recon_8x8_ssse3;
+    ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr                     =  &ihevc_itrans_recon_16x16_ssse3;
+    ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr                     =  &ihevc_itrans_recon_32x32_ssse3;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr                =  &ihevc_chroma_itrans_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr                =  &ihevc_chroma_itrans_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr              =  &ihevc_chroma_itrans_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr                       =  &ihevc_recon_4x4_ttype1;
+    ps_codec->s_func_selector.ihevc_recon_4x4_fptr                              =  &ihevc_recon_4x4;
+    ps_codec->s_func_selector.ihevc_recon_8x8_fptr                              =  &ihevc_recon_8x8;
+    ps_codec->s_func_selector.ihevc_recon_16x16_fptr                            =  &ihevc_recon_16x16;
+    ps_codec->s_func_selector.ihevc_recon_32x32_fptr                            =  &ihevc_recon_32x32;
+    ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr                       =  &ihevc_chroma_recon_4x4;
+    ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr                       =  &ihevc_chroma_recon_8x8;
+    ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr                     =  &ihevc_chroma_recon_16x16;
+    ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr                           =  &ihevc_memcpy_mul_8;
+    ps_codec->s_func_selector.ihevc_memcpy_fptr                                 =  &ihevc_memcpy;
+    ps_codec->s_func_selector.ihevc_memset_mul_8_fptr                           =  &ihevc_memset_mul_8;
+    ps_codec->s_func_selector.ihevc_memset_fptr                                 =  &ihevc_memset;
+    ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr                     =  &ihevc_memset_16bit_mul_8;
+    ps_codec->s_func_selector.ihevc_memset_16bit_fptr                           =  &ihevc_memset_16bit;
+    ps_codec->s_func_selector.ihevc_pad_left_luma_fptr                          =  &ihevc_pad_left_luma;
+    ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr                        =  &ihevc_pad_left_chroma;
+    ps_codec->s_func_selector.ihevc_pad_right_luma_fptr                         =  &ihevc_pad_right_luma;
+    ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr                       =  &ihevc_pad_right_chroma;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr                       =  &ihevc_weighted_pred_bi_ssse3;
+    ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr               =  &ihevc_weighted_pred_bi_default_ssse3;
+    ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr                      =  &ihevc_weighted_pred_uni_ssse3;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr                =  &ihevc_weighted_pred_chroma_bi_ssse3;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr        =  &ihevc_weighted_pred_chroma_bi_default_ssse3;
+    ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr               =  &ihevc_weighted_pred_chroma_uni_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr                   =  &ihevc_sao_band_offset_luma_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr                 =  &ihevc_sao_band_offset_chroma_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr                 =  &ihevc_sao_edge_offset_class0_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr          =  &ihevc_sao_edge_offset_class0_chroma_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr                 =  &ihevc_sao_edge_offset_class1_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr          =  &ihevc_sao_edge_offset_class1_chroma_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr                 =  &ihevc_sao_edge_offset_class2_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr          =  &ihevc_sao_edge_offset_class2_chroma_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr                 =  &ihevc_sao_edge_offset_class3_ssse3;
+    ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr          =  &ihevc_sao_edge_offset_class3_chroma_ssse3;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr            =  &ihevcd_fmt_conv_420sp_to_rgba8888;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr              =  &ihevcd_fmt_conv_420sp_to_rgb565;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr               =  &ihevcd_fmt_conv_420sp_to_420sp;
+    ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr                =  &ihevcd_fmt_conv_420sp_to_420p;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr                  =  &ihevcd_itrans_recon_dc_luma_ssse3;
+    ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr                =  &ihevcd_itrans_recon_dc_chroma_ssse3;
+}
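
All three tables assign functions with identical signatures, which is what lets a kernel be swapped per CPU without touching any call site; note also that the SSE4.2 table keeps many _ssse3 entries where no newer kernel exists. A minimal, self-contained model of the pattern (hypothetical names, for illustration only):

#include <stdio.h>

typedef void (*memset16_fn)(short *buf, short val, int n);

static void memset16_c(short *buf, short val, int n)
{
    int i;
    for(i = 0; i < n; i++)
        buf[i] = val;                       /* portable fallback */
}

static void memset16_simd(short *buf, short val, int n)
{
    memset16_c(buf, val, n);                /* stand-in for a vector body */
}

typedef struct
{
    memset16_fn memset16;
} func_selector_t;

int main(void)
{
    func_selector_t sel;
    short buf[8];
    int have_simd = 0;                      /* would come from a CPUID probe */

    sel.memset16 = have_simd ? &memset16_simd : &memset16_c;
    sel.memset16(buf, 42, 8);               /* the call site never changes */
    printf("%d\n", buf[0]);
    return 0;
}
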
diff --git a/decoder/x86/ihevcd_it_rec_dc_sse42_intr.c b/decoder/x86/ihevcd_it_rec_dc_sse42_intr.c
new file mode 100644
index 0000000..55fa21b
--- /dev/null
+++ b/decoder/x86/ihevcd_it_rec_dc_sse42_intr.c
@@ -0,0 +1,401 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_it_rec_dc_sse42_intr.c
+*
+* @brief
+*  Platform-specific SSE4.2 intrinsic implementations of the DC-only inverse transform and reconstruction functions
+*
+* @author
+*  Ittiam
+* @par List of Functions:
+*  - ihevcd_itrans_recon_dc_luma_sse42
+*  - ihevcd_itrans_recon_dc_chroma_sse42
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevcd_function_selector.h"
+
+#include <immintrin.h>
+
+
+void ihevcd_itrans_recon_dc_luma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
+                                       WORD32 log2_trans_size, WORD16 i2_coeff_value)
+{
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_4;
+    __m128i m_temp_reg_5;
+    __m128i m_temp_reg_6;
+    __m128i m_temp_reg_7;
+    __m128i m_temp_reg_8;
+    __m128i m_temp_reg_9;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_15;
+    __m128i m_temp_reg_20, zero_8x16b;
+    __m128i *pi4_dst = (__m128i *)pu1_dst;
+
+
+    //WORD32 row,col;
+    WORD32 add, shift;
+    WORD32 dc_value, quant_out;
+    WORD32 trans_size;
+
+
+
+
+    trans_size = (1 << log2_trans_size);
+
+    quant_out = i2_coeff_value;
+
+    shift = IT_SHIFT_STAGE_1;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+    shift = IT_SHIFT_STAGE_2;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+    /* Replicate the 16-bit DC value across the 128-bit register */
+    m_temp_reg_20 = _mm_set1_epi16(dc_value);
+    zero_8x16b = _mm_setzero_si128();
+
+    if(trans_size == 4)
+    {
+        WORD32 *pi4_dst = (WORD32 *)pu1_dst;
+
+        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
+        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+        m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_0, m_temp_reg_1);
+        m_temp_reg_5 = _mm_unpacklo_epi32(m_temp_reg_2, m_temp_reg_3);
+
+        m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
+        m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
+
+        m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+        m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+
+        m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
+
+
+        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_8);
+        m_temp_reg_1 = _mm_srli_si128(m_temp_reg_8, 4);
+        m_temp_reg_2 = _mm_srli_si128(m_temp_reg_8, 8);
+        m_temp_reg_3 = _mm_srli_si128(m_temp_reg_8, 12);
+        pu1_dst += dst_strd;
+        pi4_dst = (WORD32 *)(pu1_dst);
+
+        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+        pu1_dst += dst_strd;
+        pi4_dst = (WORD32 *)(pu1_dst);
+
+        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+        pu1_dst += dst_strd;
+        pi4_dst = (WORD32 *)(pu1_dst);
+
+        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+    }
+    else
+    {
+        WORD32 i, j;
+
+        for(i = 1; i <= trans_size; i += 4)
+        {
+            for(j = 1; j <= trans_size; j += 8)
+            {
+
+                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+
+                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_0, zero_8x16b);
+                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_1, zero_8x16b);
+                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_2, zero_8x16b);
+                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_3, zero_8x16b);
+
+                m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+                m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+                m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
+                m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
+
+                pi4_dst = (__m128i *)(pu1_dst);
+
+                m_temp_reg_12 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+
+                pi4_dst = (__m128i *)(pu1_dst + dst_strd);
+
+                m_temp_reg_13 = _mm_srli_si128(m_temp_reg_12, 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_13);
+
+                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
+
+                m_temp_reg_14 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_14);
+
+                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
+
+                m_temp_reg_15 = _mm_srli_si128(m_temp_reg_14, 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_15);
+
+                pu1_pred += 8;
+                pu1_dst += 8;
+            }
+            pu1_pred += 4 * pred_strd - trans_size;
+            pu1_dst += 4 * dst_strd - trans_size;
+        }
+    }
+
+
+}
+
+void ihevcd_itrans_recon_dc_chroma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
+                                         WORD32 log2_trans_size, WORD16 i2_coeff_value)
+{
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_4;
+    __m128i m_temp_reg_5;
+    __m128i m_temp_reg_6;
+    __m128i m_temp_reg_7;
+    __m128i m_temp_reg_8;
+    __m128i m_temp_reg_9;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_15;
+    __m128i m_temp_reg_20, zero_8x16b;
+    __m128i *pi4_dst = (__m128i *)pu1_dst;
+
+
+    //WORD32 row,col;
+    WORD32 add, shift;
+    WORD32 dc_value, quant_out;
+    WORD32 trans_size;
+
+
+    WORD32 shuffle_mask_4x4 = 0x06040200;           /* pick bytes 0,2,4,6: the chroma plane being processed */
+    WORD32 unchanged_mask_4x4 = 0x07050301;         /* pick bytes 1,3,5,7: the interleaved other plane      */
+    LWORD64 shuffle_mask = 0x0E0C0A0806040200LL;    /* even bytes of 8 interleaved UV pairs                 */
+    LWORD64 unchanged_mask = 0x0F0D0B0907050301LL;  /* odd bytes of 8 interleaved UV pairs                  */
+
+    trans_size = (1 << log2_trans_size);
+
+    quant_out = i2_coeff_value;
+
+    shift = IT_SHIFT_STAGE_1;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+    shift = IT_SHIFT_STAGE_2;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+    /* Replicate the 16-bit DC value across the 128-bit register */
+    m_temp_reg_20 = _mm_set1_epi16(dc_value);
+    zero_8x16b = _mm_setzero_si128();
+
+    if(trans_size == 4)
+    {
+        __m128i chroma_shuffle_mask_16x8b;
+        __m128i chroma_unchanged_mask_16x8b;
+        chroma_shuffle_mask_16x8b = _mm_cvtsi32_si128(shuffle_mask_4x4);
+        chroma_unchanged_mask_16x8b = _mm_cvtsi32_si128(unchanged_mask_4x4);
+
+        /*Load the prediction data*/
+        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
+        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+        m_temp_reg_10  = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
+        m_temp_reg_11  = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
+        m_temp_reg_12  = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
+        m_temp_reg_13  = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
+
+        m_temp_reg_14 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+        m_temp_reg_15 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+
+        m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_14, zero_8x16b);
+        m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_15, zero_8x16b);
+
+        m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+        m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+
+        /* Load the recon data so the other interleaved chroma plane ('v' while processing 'u', and vice versa) is written back unchanged */
+        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_dst);
+        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd));
+        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd));
+
+        m_temp_reg_0  = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
+        m_temp_reg_1  = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
+        m_temp_reg_2  = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
+        m_temp_reg_3  = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
+
+
+        m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
+        m_temp_reg_9 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_0);
+        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+        m_temp_reg_10 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_1);
+        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+        m_temp_reg_11 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_2);
+        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+        m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_3);
+
+        /*Store the result in the destination*/
+        _mm_storel_epi64(pi4_dst, m_temp_reg_9);
+        pu1_dst += dst_strd;
+        pi4_dst = (__m128i *)(pu1_dst);
+
+
+        _mm_storel_epi64(pi4_dst, m_temp_reg_10);
+        pu1_dst += dst_strd;
+        pi4_dst = (__m128i *)(pu1_dst);
+
+        _mm_storel_epi64(pi4_dst, m_temp_reg_11);
+        pu1_dst += dst_strd;
+        pi4_dst = (__m128i *)(pu1_dst);
+
+        _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+    }
+    else
+    {
+        WORD32 i, j;
+        __m128i chroma_shuffle_mask_16x8b;
+        __m128i chroma_unchanged_mask_16x8b;
+        chroma_shuffle_mask_16x8b = _mm_loadl_epi64((__m128i *)(&shuffle_mask));
+        chroma_unchanged_mask_16x8b =
+                        _mm_loadl_epi64((__m128i *)(&unchanged_mask));
+
+        for(i = 0; i < trans_size; i += 4)
+        {
+            for(j = 0; j < trans_size; j += 8)
+            {
+
+                m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_pred);
+                m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_pred + pred_strd));
+                m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_pred + 2 * pred_strd));
+                m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_pred + 3 * pred_strd));
+
+                /*Retain only one chroma component*/
+                m_temp_reg_4  = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
+                m_temp_reg_5  = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
+                m_temp_reg_6  = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
+                m_temp_reg_7  = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
+
+                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
+                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
+                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, zero_8x16b);
+                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, zero_8x16b);
+
+                m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+                m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+                m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
+                m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
+
+
+                /* Load the recon data so the other interleaved chroma plane ('v' while processing 'u', and vice versa) is written back unchanged */
+                m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_dst);
+                m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_dst + dst_strd));
+                m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_dst + 2 * dst_strd));
+                m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_dst + 3 * dst_strd));
+
+                m_temp_reg_0  = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
+                m_temp_reg_1  = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
+                m_temp_reg_2  = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
+                m_temp_reg_3  = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
+
+                m_temp_reg_4 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
+                m_temp_reg_5 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
+
+                m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_0);
+                m_temp_reg_4 = _mm_srli_si128(m_temp_reg_4, 8);
+                m_temp_reg_13 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_1);
+
+                m_temp_reg_14 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_2);
+                m_temp_reg_5 = _mm_srli_si128(m_temp_reg_5, 8);
+                m_temp_reg_15 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_3);
+
+                /*Store the result in the destination*/
+                pi4_dst = (__m128i *)(pu1_dst);
+
+                _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+                m_temp_reg_8 = _mm_srli_si128(m_temp_reg_12, 8);
+
+                pi4_dst = (__m128i *)(pu1_dst + 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_8);
+
+                pi4_dst = (__m128i *)(pu1_dst + dst_strd);
+
+                _mm_storel_epi64(pi4_dst, m_temp_reg_13);
+                m_temp_reg_9 = _mm_srli_si128(m_temp_reg_13, 8);
+
+                pi4_dst = (__m128i *)(pu1_dst + dst_strd + 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_9);
+
+                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
+
+                _mm_storel_epi64(pi4_dst, m_temp_reg_14);
+                m_temp_reg_10 = _mm_srli_si128(m_temp_reg_14, 8);
+
+                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd + 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_10);
+
+                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
+
+                _mm_storel_epi64(pi4_dst, m_temp_reg_15);
+                m_temp_reg_11 = _mm_srli_si128(m_temp_reg_15, 8);
+
+                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd + 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_11);
+
+                pu1_pred += 16;
+                pu1_dst += 16;
+            }
+
+            pu1_pred += 4 * pred_strd - 2 * trans_size;
+            pu1_dst += 4 * dst_strd - 2 * trans_size;
+        }
+    }
+
+
+}
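
Both functions above vectorize a small amount of arithmetic: for a DC-only block the two inverse-transform stages collapse to scaling one coefficient, after which the DC is added to every prediction sample. A scalar model of the luma path, assuming IT_SHIFT_STAGE_1 == 7 and IT_SHIFT_STAGE_2 == 12 (the usual 8-bit values; check ihevc_defs.h) and a CLIP_U8 byte clamp from ihevc_platform_macros.h:

static void itrans_recon_dc_luma_model(UWORD8 *pu1_pred, UWORD8 *pu1_dst,
                                       WORD32 pred_strd, WORD32 dst_strd,
                                       WORD32 log2_trans_size,
                                       WORD16 i2_coeff_value)
{
    WORD32 row, col, dc;
    WORD32 trans_size = 1 << log2_trans_size;

    /* stage 1: the row transform of a DC-only block scales by 64 */
    dc = CLIP_S16((i2_coeff_value * 64 + (1 << 6)) >> 7);
    /* stage 2: the column transform scales by 64 again */
    dc = CLIP_S16((dc * 64 + (1 << 11)) >> 12);

    for(row = 0; row < trans_size; row++)
        for(col = 0; col < trans_size; col++)
            pu1_dst[row * dst_strd + col] =
                CLIP_U8(pu1_pred[row * pred_strd + col] + dc);
}

The SIMD versions compute exactly this, eight or sixteen pixels per instruction; the chroma variant additionally shuffles with the masks above so only one of the two interleaved 420sp planes is rewritten.
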
diff --git a/decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c b/decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c
new file mode 100644
index 0000000..2857a07
--- /dev/null
+++ b/decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c
@@ -0,0 +1,401 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+*  ihevcd_it_rec_dc_ssse3_intr.c
+*
+* @brief
+*  SSSE3 intrinsic implementations of the DC-only inverse transform and reconstruction functions
+*
+* @author
+*  Ittiam
+* @par List of Functions:
+*  - ihevcd_itrans_recon_dc_luma_ssse3
+*  - ihevcd_itrans_recon_dc_chroma_ssse3
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevcd_function_selector.h"
+
+#include <immintrin.h>
+
+
+
+
+void ihevcd_itrans_recon_dc_luma_ssse3(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
+                                       WORD32 log2_trans_size, WORD16 i2_coeff_value)
+{
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_4;
+    __m128i m_temp_reg_5;
+    __m128i m_temp_reg_6;
+    __m128i m_temp_reg_7;
+    __m128i m_temp_reg_8;
+    __m128i m_temp_reg_9;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_15;
+    __m128i m_temp_reg_20, zero_8x16b;
+    __m128i *pi4_dst = (__m128i *)pu1_dst;
+
+
+
+    WORD32 add, shift;
+    WORD32 dc_value, quant_out;
+    WORD32 trans_size;
+
+
+
+    trans_size = (1 << log2_trans_size);
+
+    quant_out = i2_coeff_value;
+
+    shift = IT_SHIFT_STAGE_1;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+    shift = IT_SHIFT_STAGE_2;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+    /*Replicate the DC value within 16 bits in 128 bit register*/
+    m_temp_reg_20 = _mm_set1_epi16(dc_value);
+    zero_8x16b = _mm_setzero_si128();
+
+    if(trans_size == 4)
+    {
+        WORD32 *pi4_dst = (WORD32 *)pu1_dst;
+
+        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
+        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+        m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_0, m_temp_reg_1);
+        m_temp_reg_5 = _mm_unpacklo_epi32(m_temp_reg_2, m_temp_reg_3);
+
+        m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
+        m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
+
+        m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+        m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+
+        m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
+
+
+        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_8);
+        m_temp_reg_1 = _mm_srli_si128(m_temp_reg_8, 4);
+        m_temp_reg_2 = _mm_srli_si128(m_temp_reg_8, 8);
+        m_temp_reg_3 = _mm_srli_si128(m_temp_reg_8, 12);
+        pu1_dst += dst_strd;
+        pi4_dst = (WORD32 *)(pu1_dst);
+
+        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+        pu1_dst += dst_strd;
+        pi4_dst = (WORD32 *)(pu1_dst);
+
+        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+        pu1_dst += dst_strd;
+        pi4_dst = (WORD32 *)(pu1_dst);
+
+        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+    }
+    else
+    {
+        WORD32 i, j;
+
+        for(i = 1; i <= trans_size; i += 4)
+        {
+            for(j = 1; j <= trans_size; j += 8)
+            {
+
+                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+
+                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_0, zero_8x16b);
+                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_1, zero_8x16b);
+                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_2, zero_8x16b);
+                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_3, zero_8x16b);
+
+                m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+                m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+                m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
+                m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
+
+                pi4_dst = (__m128i *)(pu1_dst);
+
+                m_temp_reg_12 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+
+                pi4_dst = (__m128i *)(pu1_dst + dst_strd);
+
+                m_temp_reg_13 = _mm_srli_si128(m_temp_reg_12, 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_13);
+
+                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
+
+                m_temp_reg_14 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_14);
+
+                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
+
+                m_temp_reg_15 = _mm_srli_si128(m_temp_reg_14, 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_15);
+
+                pu1_pred += 8;
+                pu1_dst += 8;
+            }
+            pu1_pred += 4 * pred_strd - trans_size;
+            pu1_dst += 4 * dst_strd - trans_size;
+        }
+    }
+
+
+}
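+
+/* Reference sketch (illustrative, kept under #if 0 so it is not built): for a
+ * DC-only block the 2-D inverse transform collapses to scaling the single
+ * coefficient by 64 in each of the two stages, with rounding, after which the
+ * resulting DC offset is added to every prediction sample. The SIMD routine
+ * above computes the same thing 8 pixels across four rows per iteration.
+ * CLIP_U8 is assumed to be available from the platform macros; the function
+ * name below is hypothetical. */
+#if 0
+static void itrans_recon_dc_luma_ref(UWORD8 *pu1_pred, UWORD8 *pu1_dst,
+                                     WORD32 pred_strd, WORD32 dst_strd,
+                                     WORD32 log2_trans_size,
+                                     WORD16 i2_coeff_value)
+{
+    WORD32 r, c;
+    WORD32 trans_size = 1 << log2_trans_size;
+    WORD32 dc;
+
+    /* Stage 1 and stage 2 scaling, each with its own rounding and shift */
+    dc = CLIP_S16((i2_coeff_value * 64 + (1 << (IT_SHIFT_STAGE_1 - 1)))
+                                    >> IT_SHIFT_STAGE_1);
+    dc = CLIP_S16((dc * 64 + (1 << (IT_SHIFT_STAGE_2 - 1))) >> IT_SHIFT_STAGE_2);
+
+    /* Add the DC offset to every prediction sample and clip to 8 bits */
+    for(r = 0; r < trans_size; r++)
+        for(c = 0; c < trans_size; c++)
+            pu1_dst[r * dst_strd + c] =
+                            CLIP_U8(pu1_pred[r * pred_strd + c] + dc);
+}
+#endif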
+
+void ihevcd_itrans_recon_dc_chroma_ssse3(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
+                                         WORD32 log2_trans_size, WORD16 i2_coeff_value)
+{
+    __m128i m_temp_reg_0;
+    __m128i m_temp_reg_1;
+    __m128i m_temp_reg_2;
+    __m128i m_temp_reg_3;
+    __m128i m_temp_reg_4;
+    __m128i m_temp_reg_5;
+    __m128i m_temp_reg_6;
+    __m128i m_temp_reg_7;
+    __m128i m_temp_reg_8;
+    __m128i m_temp_reg_9;
+    __m128i m_temp_reg_10;
+    __m128i m_temp_reg_11;
+    __m128i m_temp_reg_12;
+    __m128i m_temp_reg_13;
+    __m128i m_temp_reg_14;
+    __m128i m_temp_reg_15;
+    __m128i m_temp_reg_20, zero_8x16b;
+    __m128i *pi4_dst = (__m128i *)pu1_dst;
+
+
+    WORD32 add, shift;
+    WORD32 dc_value, quant_out;
+    WORD32 trans_size;
+
+
+    WORD32 shuffle_mask_4x4 = 0x06040200;
+    WORD32 unchanged_mask_4x4 = 0x07050301;
+    LWORD64 shuffle_mask = 0x0E0C0A0806040200LL;
+    LWORD64 unchanged_mask = 0x0F0D0B0907050301LL;
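+    /* With interleaved UV, the component being reconstructed occupies every
+     * other byte relative to the passed pointer: shuffle_mask gathers those
+     * (even) bytes into the low half of a register, while unchanged_mask
+     * gathers the odd bytes belonging to the other component so they can be
+     * re-interleaved untouched when the result is stored back. */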
+
+    trans_size = (1 << log2_trans_size);
+
+    quant_out = i2_coeff_value;
+
+    shift = IT_SHIFT_STAGE_1;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+    shift = IT_SHIFT_STAGE_2;
+    add = 1 << (shift - 1);
+    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+    /*Replicate the DC value within 16 bits in 128 bit register*/
+    m_temp_reg_20 = _mm_set1_epi16(dc_value);
+    zero_8x16b = _mm_setzero_si128();
+
+    if(trans_size == 4)
+    {
+        __m128i chroma_shuffle_mask_16x8b;
+        __m128i chroma_unchanged_mask_16x8b;
+        chroma_shuffle_mask_16x8b = _mm_cvtsi32_si128(shuffle_mask_4x4);
+        chroma_unchanged_mask_16x8b = _mm_cvtsi32_si128(unchanged_mask_4x4);
+
+        /*Load the prediction data*/
+        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
+        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+        m_temp_reg_10  = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
+        m_temp_reg_11  = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
+        m_temp_reg_12  = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
+        m_temp_reg_13  = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
+
+        m_temp_reg_14 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+        m_temp_reg_15 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+
+        m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_14, zero_8x16b);
+        m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_15, zero_8x16b);
+
+        m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+        m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+
+        /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/
+        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_dst);
+        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd));
+        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd));
+
+        m_temp_reg_0  = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
+        m_temp_reg_1  = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
+        m_temp_reg_2  = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
+        m_temp_reg_3  = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
+
+
+        m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
+        m_temp_reg_9 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_0);
+        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+        m_temp_reg_10 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_1);
+        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+        m_temp_reg_11 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_2);
+        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+        m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_3);
+
+        /*Store the result in the destination*/
+        _mm_storel_epi64(pi4_dst, m_temp_reg_9);
+        pu1_dst += dst_strd;
+        pi4_dst = (__m128i *)(pu1_dst);
+
+
+        _mm_storel_epi64(pi4_dst, m_temp_reg_10);
+        pu1_dst += dst_strd;
+        pi4_dst = (__m128i *)(pu1_dst);
+
+        _mm_storel_epi64(pi4_dst, m_temp_reg_11);
+        pu1_dst += dst_strd;
+        pi4_dst = (__m128i *)(pu1_dst);
+
+        _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+    }
+    else
+    {
+        WORD32 i, j;
+        __m128i chroma_shuffle_mask_16x8b;
+        __m128i chroma_unchanged_mask_16x8b;
+        chroma_shuffle_mask_16x8b = _mm_loadl_epi64((__m128i *)(&shuffle_mask));
+        chroma_unchanged_mask_16x8b =
+                        _mm_loadl_epi64((__m128i *)(&unchanged_mask));
+
+        for(i = 0; i < trans_size; i += 4)
+        {
+            for(j = 0; j < trans_size; j += 8)
+            {
+
+                m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_pred);
+                m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_pred + pred_strd));
+                m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_pred + 2 * pred_strd));
+                m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_pred + 3 * pred_strd));
+
+                /*Retain only one chroma component*/
+                m_temp_reg_4  = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
+                m_temp_reg_5  = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
+                m_temp_reg_6  = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
+                m_temp_reg_7  = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
+
+                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
+                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
+                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, zero_8x16b);
+                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, zero_8x16b);
+
+                m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+                m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+                m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
+                m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
+
+
+                /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/
+                m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_dst);
+                m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_dst + dst_strd));
+                m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_dst + 2 * dst_strd));
+                m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_dst + 3 * dst_strd));
+
+                m_temp_reg_0  = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
+                m_temp_reg_1  = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
+                m_temp_reg_2  = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
+                m_temp_reg_3  = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
+
+                m_temp_reg_4 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
+                m_temp_reg_5 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
+
+                m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_0);
+                m_temp_reg_4 = _mm_srli_si128(m_temp_reg_4, 8);
+                m_temp_reg_13 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_1);
+
+                m_temp_reg_14 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_2);
+                m_temp_reg_5 = _mm_srli_si128(m_temp_reg_5, 8);
+                m_temp_reg_15 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_3);
+
+                /*Store the result in the destination*/
+                pi4_dst = (__m128i *)(pu1_dst);
+
+                _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+                m_temp_reg_8 = _mm_srli_si128(m_temp_reg_12, 8);
+
+                pi4_dst = (__m128i *)(pu1_dst + 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_8);
+
+                pi4_dst = (__m128i *)(pu1_dst + dst_strd);
+
+                _mm_storel_epi64(pi4_dst, m_temp_reg_13);
+                m_temp_reg_9 = _mm_srli_si128(m_temp_reg_13, 8);
+
+                pi4_dst = (__m128i *)(pu1_dst + dst_strd + 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_9);
+
+                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
+
+                _mm_storel_epi64(pi4_dst, m_temp_reg_14);
+                m_temp_reg_10 = _mm_srli_si128(m_temp_reg_14, 8);
+
+                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd + 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_10);
+
+                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
+
+                _mm_storel_epi64(pi4_dst, m_temp_reg_15);
+                m_temp_reg_11 = _mm_srli_si128(m_temp_reg_15, 8);
+
+                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd + 8);
+                _mm_storel_epi64(pi4_dst, m_temp_reg_11);
+
+                pu1_pred += 16;
+                pu1_dst += 16;
+            }
+
+            pu1_pred += 4 * pred_strd - 2 * trans_size;
+            pu1_dst += 4 * dst_strd - 2 * trans_size;
+        }
+    }
+
+
+}
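+
+/* Scalar equivalent of the chroma path above (illustrative sketch only):
+ * since U and V are interleaved, the reference loop steps through every
+ * other byte and leaves the companion component untouched,
+ *
+ *     for(r = 0; r < trans_size; r++)
+ *         for(c = 0; c < trans_size; c++)
+ *             pu1_dst[r * dst_strd + 2 * c] =
+ *                 CLIP_U8(pu1_pred[r * pred_strd + 2 * c] + dc);
+ *
+ * with dc computed exactly as in the luma reference sketch. */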
diff --git a/test/Android.mk b/test/Android.mk
new file mode 100644
index 0000000..7807003
--- /dev/null
+++ b/test/Android.mk
@@ -0,0 +1,5 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+# decoder
+include $(LOCAL_PATH)/decoder.mk
diff --git a/test/decoder.mk b/test/decoder.mk
new file mode 100644
index 0000000..ef560b3
--- /dev/null
+++ b/test/decoder.mk
@@ -0,0 +1,13 @@
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE    := hevcdec
+LOCAL_MODULE_TAGS := optional
+
+LOCAL_CFLAGS := -DPROFILE_ENABLE -DARM  -fPIC
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/../decoder $(LOCAL_PATH)/../common $(LOCAL_PATH)/
+LOCAL_SRC_FILES := decoder/main.c
+LOCAL_STATIC_LIBRARIES := libhevcdec
+
+include $(BUILD_EXECUTABLE)
diff --git a/test/decoder/main.c b/test/decoder/main.c
new file mode 100644
index 0000000..a4bf626
--- /dev/null
+++ b/test/decoder/main.c
@@ -0,0 +1,3169 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/*****************************************************************************/
+/*                                                                           */
+/*  File Name         : main.c                                               */
+/*                                                                           */
+/*  Description       : Contains an application that demonstrates use of HEVC*/
+/*                      decoder API                                          */
+/*                                                                           */
+/*  List of Functions :                                                      */
+/*                                                                           */
+/*  Issues / Problems : None                                                 */
+/*                                                                           */
+/*  Revision History  :                                                      */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   Harish          Initial Version                      */
+/*****************************************************************************/
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#ifdef X86_MINGW
+#include <signal.h>
+#endif
+
+#ifndef IOS
+#include <malloc.h>
+#endif
+#ifdef IOS_DISPLAY
+#include "cast_types.h"
+#else
+#include "ihevc_typedefs.h"
+#endif
+
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+
+#define MD5_DISABLE
+#ifdef X86_MSVC
+#include <windows.h>
+#else
+#include <sys/time.h>
+#endif
+
+#define ALIGN8(x) ((((x) + 7) >> 3) << 3)
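+/* e.g. ALIGN8(13) == 16, ALIGN8(1920) == 1920 */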
+#define NUM_DISPLAY_BUFFERS 4
+#define DEFAULT_FPS         30
+
+
+#define ENABLE_DEGRADE 0
+#define MAX_DISP_BUFFERS    64
+#define EXTRA_DISP_BUFFERS  0
+#define STRLENGTH 1000
+
+//#define TEST_FLUSH
+#define FLUSH_FRM_CNT 100
+
+
+#ifdef IOS
+#define PATHLENMAX 500
+char filename_with_path[PATHLENMAX];
+#endif
+
+#ifdef PROFILE_ENABLE
+#ifdef X86_MSVC
+typedef  LARGE_INTEGER TIMER;
+#else
+//#ifdef X86_MINGW
+typedef struct timeval TIMER;
+//#endif
+#endif
+#else
+typedef WORD32 TIMER;
+#endif
+
+#ifdef PROFILE_ENABLE
+#ifdef X86_MSVC
+#define GETTIME(timer) QueryPerformanceCounter(timer);
+#else
+//#ifdef X86_MINGW
+#define GETTIME(timer) gettimeofday(timer,NULL);
+//#endif
+#endif
+
+#ifdef X86_MSVC
+#define ELAPSEDTIME(s_start_timer,s_end_timer, s_elapsed_time, frequency) \
+                  { \
+                   TIMER s_temp_time;   \
+                   s_temp_time.LowPart = s_end_timer.LowPart - s_start_timer.LowPart ; \
+                   s_elapsed_time = (UWORD32) ( ((DOUBLE)s_temp_time.LowPart / (DOUBLE)frequency.LowPart )  * 1000000); \
+                }
+#else
+//#ifdef X86_MINGW
+#define ELAPSEDTIME(s_start_timer,s_end_timer, s_elapsed_time, frequency) \
+                   s_elapsed_time = ((s_end_timer.tv_sec - s_start_timer.tv_sec) * 1000000) + (s_end_timer.tv_usec - s_start_timer.tv_usec);
+//#endif
+#endif
+
+#else
+#define GETTIME(timer)
+#define ELAPSEDTIME(s_start_timer,s_end_timer, s_elapsed_time, frequency)
+#endif
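+
+/* Typical use of the profiling macros (sketch; variable names illustrative).
+ * On X86_MSVC the frequency must first be filled by
+ * QueryPerformanceFrequency(); on the gettimeofday() path it is ignored:
+ *
+ *     TIMER s_start_timer, s_end_timer;
+ *     UWORD32 u4_time_us;
+ *     GETTIME(&s_start_timer);
+ *     ... decode one picture ...
+ *     GETTIME(&s_end_timer);
+ *     ELAPSEDTIME(s_start_timer, s_end_timer, u4_time_us, frequency);
+ */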
+
+
+/* Function declarations */
+#ifndef MD5_DISABLE
+void calc_md5_cksum(UWORD8 *pu1_inbuf, UWORD32 u4_stride, UWORD32 u4_width, UWORD32 u4_height, UWORD8 *pu1_cksum_p);
+#else
+#define calc_md5_cksum(a, b, c, d, e)
+#endif
+#ifdef SDL_DISPLAY
+void* sdl_disp_init(UWORD32, UWORD32, WORD32, WORD32, WORD32, WORD32, WORD32, WORD32 *, WORD32 *);
+void sdl_alloc_disp_buffers(void *);
+void sdl_display(void *, WORD32);
+void sdl_set_disp_buffers(void *, WORD32, UWORD8 **, UWORD8 **, UWORD8 **);
+void sdl_disp_deinit(void *);
+void sdl_disp_usleep(UWORD32);
+IV_COLOR_FORMAT_T sdl_get_color_fmt(void);
+UWORD32 sdl_get_stride(void);
+#endif
+
+#ifdef INTEL_CE5300
+void* gdl_disp_init(UWORD32, UWORD32, WORD32, WORD32, WORD32, WORD32, WORD32, WORD32 *, WORD32 *);
+void gdl_alloc_disp_buffers(void *);
+void gdl_display(void *, WORD32);
+void gdl_set_disp_buffers(void *, WORD32, UWORD8 **, UWORD8 **, UWORD8 **);
+void gdl_disp_deinit(void *);
+void gdl_disp_usleep(UWORD32);
+IV_COLOR_FORMAT_T gdl_get_color_fmt(void);
+UWORD32 gdl_get_stride(void);
+#endif
+
+#ifdef FBDEV_DISPLAY
+void* fbd_disp_init(UWORD32, UWORD32, WORD32, WORD32, WORD32, WORD32, WORD32, WORD32 *, WORD32 *);
+void fbd_alloc_disp_buffers(void *);
+void fbd_display(void *, WORD32);
+void fbd_set_disp_buffers(void *, WORD32, UWORD8 **, UWORD8 **, UWORD8 **);
+void fbd_disp_deinit(void *);
+void fbd_disp_usleep(UWORD32);
+IV_COLOR_FORMAT_T fbd_get_color_fmt(void);
+UWORD32 fbd_get_stride(void);
+#endif
+
+#ifdef IOS_DISPLAY
+void* ios_disp_init(UWORD32, UWORD32, WORD32, WORD32, WORD32, WORD32, WORD32, WORD32 *, WORD32 *);
+void ios_alloc_disp_buffers(void *);
+void ios_display(void *, WORD32);
+void ios_set_disp_buffers(void *, WORD32, UWORD8 **, UWORD8 **, UWORD8 **);
+void ios_disp_deinit(void *);
+void ios_disp_usleep(UWORD32);
+IV_COLOR_FORMAT_T ios_get_color_fmt(void);
+UWORD32 ios_get_stride(void);
+#endif
+
+typedef struct
+{
+    UWORD32 u4_piclen_flag;
+    UWORD32 u4_file_save_flag;
+    UWORD32 u4_chksum_save_flag;
+    UWORD32 u4_max_frm_ts;
+    IV_COLOR_FORMAT_T e_output_chroma_format;
+    IVD_ARCH_T e_arch;
+    IVD_SOC_T e_soc;
+    UWORD32 dump_q_rd_idx;
+    UWORD32 dump_q_wr_idx;
+    WORD32  disp_q_wr_idx;
+    WORD32  disp_q_rd_idx;
+
+    void *cocodec_obj;
+    UWORD32 share_disp_buf;
+    UWORD32 num_disp_buf;
+    UWORD32 b_pic_present;
+    WORD32 i4_degrade_type;
+    WORD32 i4_degrade_pics;
+    UWORD32 u4_num_cores;
+#ifdef GPU_BUILD
+    UWORD32 u4_gpu_enable_diable;
+#endif
+    UWORD32 disp_delay;
+    WORD32 trace_enable;
+    CHAR ac_trace_fname[STRLENGTH];
+    CHAR ac_piclen_fname[STRLENGTH];
+    CHAR ac_ip_fname[STRLENGTH];
+    CHAR ac_op_fname[STRLENGTH];
+    CHAR ac_op_chksum_fname[STRLENGTH];
+    ivd_out_bufdesc_t s_disp_buffers[MAX_DISP_BUFFERS];
+    iv_yuv_buf_t s_disp_frm_queue[MAX_DISP_BUFFERS];
+    UWORD32 s_disp_frm_id_queue[MAX_DISP_BUFFERS];
+    UWORD32 loopback;
+    UWORD32 display;
+    UWORD32 full_screen;
+    UWORD32 fps;
+    UWORD32 max_wd;
+    UWORD32 max_ht;
+    UWORD32 max_level;
+
+    UWORD32 u4_strd;
+
+    /* For signalling to display thread */
+    UWORD32 u4_pic_wd;
+    UWORD32 u4_pic_ht;
+
+    /* For IOS display */
+    WORD32 i4_screen_wd;
+    WORD32 i4_screen_ht;
+
+    //UWORD32 u4_output_present;
+    WORD32  quit;
+    WORD32  paused;
+
+
+    void *pv_disp_ctx;
+    void *display_thread_handle;
+    WORD32 display_thread_created;
+    volatile WORD32 display_init_done;
+    volatile WORD32 display_deinit_flag;
+
+    void* (*disp_init)(UWORD32, UWORD32, WORD32, WORD32, WORD32, WORD32, WORD32, WORD32 *, WORD32 *);
+    void (*alloc_disp_buffers)(void *);
+    void (*display_buffer)(void *, WORD32);
+    void (*set_disp_buffers)(void *, WORD32, UWORD8 **, UWORD8 **, UWORD8 **);
+    void (*disp_deinit)(void *);
+    void (*disp_usleep)(UWORD32);
+    IV_COLOR_FORMAT_T(*get_color_fmt)(void);
+    UWORD32(*get_stride)(void);
+}vid_dec_ctx_t;
+
+
+
+typedef enum
+{
+    INVALID,
+    HELP,
+    VERSION,
+    INPUT_FILE,
+    OUTPUT,
+    CHKSUM,
+    SAVE_OUTPUT,
+    SAVE_CHKSUM,
+    CHROMA_FORMAT,
+    NUM_FRAMES,
+    NUM_CORES,
+
+    SHARE_DISPLAY_BUF,
+#ifdef GPU_BUILD
+    ENABLE_GPU,
+#endif
+    LOOPBACK,
+    DISPLAY,
+    FULLSCREEN,
+    FPS,
+    TRACE,
+    MAX_WD,
+    MAX_HT,
+    MAX_LEVEL,
+    CONFIG,
+
+    DEGRADE_TYPE,
+    DEGRADE_PICS,
+    ARCH,
+    SOC,
+    PICLEN,
+    PICLEN_FILE,
+}ARGUMENT_T;
+
+typedef struct
+{
+    CHAR argument_shortname[4];
+    CHAR argument_name[128];
+    ARGUMENT_T argument;
+    CHAR description[512];
+}argument_t;
+
+static const argument_t argument_mapping[] =
+{
+    { "-h",  "--help",                   HELP,
+        "Print this help\n" },
+    { "-c", "--config",      CONFIG,
+        "config file (Default: test.cfg)\n" },
+
+    { "-v",  "--version",                VERSION,
+        "Version information\n" },
+    { "-i",  "--input",                  INPUT_FILE,
+        "Input file\n" },
+    { "-o",  "--output",                 OUTPUT,
+        "Output file\n" },
+    { "--",  "--piclen",                 PICLEN,
+        "Flag to signal if the decoder has to use a file containing number of bytes in each picture to be fed in each call\n" },
+    { "--",  "--piclen_file",                 PICLEN_FILE,
+        "File containing number of bytes in each picture - each line containing one size\n" },
+    { "--",  "--chksum",          CHKSUM,
+        "Output MD5 Checksum file\n" },
+    { "-s", "--save_output",            SAVE_OUTPUT,
+        "Save Output file\n" },
+    { "--", "--save_chksum",            SAVE_CHKSUM,
+        "Save Check sum file\n" },
+    { "--",  "--chroma_format",          CHROMA_FORMAT,
+        "Output Chroma format Supported values YUV_420P, YUV_422ILE, RGB_565, YUV_420SP_UV, YUV_420SP_VU\n" },
+    { "-n", "--num_frames",             NUM_FRAMES,
+        "Number of frames to be decoded\n" },
+    { "--", "--num_cores",              NUM_CORES,
+        "Number of cores to be used\n" },
+    { "--",  "--degrade_type",  DEGRADE_TYPE,
+        "Degrade type : 0: No degrade 0th bit set : Disable SAO 1st bit set : Disable deblocking 2nd bit set : Faster inter prediction filters 3rd bit set : Fastest inter prediction filters\n" },
+    { "--",  "--degrade_pics",  DEGRADE_PICS,
+        "Degrade pics : 0 : No degrade  1 : Only on non-reference frames  2 : Do not degrade every 4th or key frames  3 : All non-key frames  4 : All frames" },
+    { "--", "--share_display_buf",      SHARE_DISPLAY_BUF,
+        "Enable shared display buffer mode\n" },
+    { "--", "--loopback",      LOOPBACK,
+        "Enable playback in a loop\n" },
+    { "--", "--display",      DISPLAY,
+        "Enable display (uses SDL)\n" },
+    { "--", "--fullscreen",      FULLSCREEN,
+        "Enable full screen (Only for GDL and SDL)\n" },
+    { "--", "--fps",      FPS,
+        "FPS to be used for display \n" },
+#ifdef GPU_BUILD
+    { "--",  "--enable_gpu",       ENABLE_GPU,
+        "Enable shared display buffer mode\n" },
+#endif
+    { "-i",  "--trace",                   TRACE,
+        "Trace file\n" },
+    { "--", "--max_wd",      MAX_WD,
+        "Maximum width (Default: 2560) \n" },
+    { "--", "--max_ht",      MAX_HT,
+        "Maximum height (Default: 1600)\n" },
+    { "--", "--max_level",      MAX_LEVEL,
+        "Maximum Decoder Level (Default: 50)\n" },
+    { "--",  "--arch", ARCH,
+        "Set Architecture. Supported values  ARM_NONEON, ARM_A9Q, ARM_A7, ARM_A5, ARM_NEONINTR, X86_GENERIC, X86_SSSE3, X86_SSE4 \n" },
+    { "--",  "--soc", SOC,
+        "Set SOC. Supported values  GENERIC, HISI_37X \n" },
+};
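+
+/* Example invocation (file names and values hypothetical):
+ *   ./hevcdec -i clip.bin -o out.yuv --chroma_format YUV_420P \
+ *             --num_cores 4 -n 300 -s 1
+ * The same options may be placed in a config file passed with -c. */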
+
+#define PEAK_WINDOW_SIZE            8
+#define MAX_FRAME_WIDTH             2560
+#define MAX_FRAME_HEIGHT            1600
+#define MAX_LEVEL_SUPPORTED         50
+#define MAX_REF_FRAMES              16
+#define MAX_REORDER_FRAMES          16
+#define DEFAULT_SHARE_DISPLAY_BUF   0
+#define STRIDE                      0
+#define DEFAULT_NUM_CORES           1
+
+#define DUMP_SINGLE_BUF 0
+#define IV_ISFATALERROR(x)         (((x) >> IVD_FATALERROR) & 0x1)
+
+#define ivd_cxa_api_function        ihevcd_cxa_api_function
+
+#ifdef IOS
+char filename_trace[PATHLENMAX];
+#endif
+
+#if ANDROID_NDK
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : raise                                                    */
+/*                                                                           */
+/*  Description   : Needed as a workaround when the application is built in  */
+/*                  Android NDK. It is invoked as the exception handler      */
+/*                  for divide by zero errors                                */
+/*                                                                           */
+/*  Inputs        : a                                                        */
+/*  Globals       :                                                          */
+/*  Processing    : None                                                     */
+/*                                                                           */
+/*  Outputs       :                                                          */
+/*  Returns       :                                                          */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+int raise(int a)
+{
+    printf("Divide by zero\n");
+    return 0;
+}
+#endif
+
+#ifdef _WIN32
+/*****************************************************************************/
+/* Function to print library calls                                           */
+/*****************************************************************************/
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : ihevca_aligned_malloc                                    */
+/*                                                                           */
+/*  Description   : Returns memory aligned to the requested alignment        */
+/*                  using _aligned_malloc                                    */
+/*                                                                           */
+/*  Inputs        : alignment                                                */
+/*                  size                                                     */
+/*  Globals       :                                                          */
+/*  Processing    :                                                          */
+/*                                                                           */
+/*  Outputs       :                                                          */
+/*  Returns       :                                                          */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+void* ihevca_aligned_malloc(WORD32 alignment, WORD32 size)
+{
+    return (void *)_aligned_malloc(size, alignment);
+}
+
+void ihevca_aligned_free(void *pv_buf)
+{
+    _aligned_free(pv_buf);
+    return;
+}
+#endif
+
+#if IOS
+void* ihevca_aligned_malloc(WORD32 alignment, WORD32 size)
+{
+    return malloc(size);
+}
+
+void ihevca_aligned_free(void *pv_buf)
+{
+    free(pv_buf);
+    return;
+}
+#endif
+
+#if (!defined(IOS)) && (!defined(_WIN32))
+void* ihevca_aligned_malloc(WORD32 alignment, WORD32 size)
+{
+    return memalign(alignment, size);
+}
+
+void ihevca_aligned_free(void *pv_buf)
+{
+    free(pv_buf);
+    return;
+}
+#endif
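+
+/* Usage sketch (illustrative): buffers handed to the codec are allocated
+ * with an explicit alignment and released with the matching free, e.g.
+ *   pv_buf = ihevca_aligned_malloc(16, u4_size);
+ *   ...
+ *   ihevca_aligned_free(pv_buf);
+ */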
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : set_degrade                                              */
+/*                                                                           */
+/*  Description   : Control call to set degrade level                        */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : codec_obj  - Codec Handle                                */
+/*                  type - degrade level value between 0 to 4                */
+/*                    0 : No degrade                                         */
+/*                    Bit 0 : Disable SAO                                    */
+/*                    Bit 1 : Disable Deblock                                */
+/*                    Bit 2 : Faster MC for non-ref                          */
+/*                    Bit 3 : Fastest MC for non-ref                         */
+/*                  pics - Pictures that are degraded                        */
+/*                    0 : No degrade                                         */
+/*                    1 : Non-ref pictures                                   */
+/*                    2 : Pictures at given interval are not degraded        */
+/*                    3 : All non-key pictures                               */
+/*                    4 : All pictures                                       */
+/*  Globals       :                                                          */
+/*  Processing    : Calls degrade control to the codec                       */
+/*                                                                           */
+/*  Outputs       :                                                          */
+/*  Returns       : Control call return status                               */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T set_degrade(void *codec_obj, UWORD32 type, WORD32 pics)
+{
+    ihevcd_cxa_ctl_degrade_ip_t s_ctl_ip;
+    ihevcd_cxa_ctl_degrade_op_t s_ctl_op;
+    void *pv_api_ip, *pv_api_op;
+    IV_API_CALL_STATUS_T e_dec_status;
+
+    s_ctl_ip.u4_size = sizeof(ihevcd_cxa_ctl_degrade_ip_t);
+    s_ctl_ip.i4_degrade_type = type;
+    s_ctl_ip.i4_nondegrade_interval = 4;
+    s_ctl_ip.i4_degrade_pics = pics;
+
+    s_ctl_op.u4_size = sizeof(ihevcd_cxa_ctl_degrade_op_t);
+
+    pv_api_ip = (void *)&s_ctl_ip;
+    pv_api_op = (void *)&s_ctl_op;
+
+    s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+    s_ctl_ip.e_sub_cmd = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_DEGRADE;
+
+    e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, pv_api_ip, pv_api_op);
+
+    if(IV_SUCCESS != e_dec_status)
+    {
+        printf("Error in setting degrade level \n");
+    }
+    return (e_dec_status);
+
+}
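+
+/* Usage sketch (values hypothetical): set bits 0 and 1 of 'type' to disable
+ * SAO and deblocking on non-reference pictures only:
+ *   set_degrade(codec_obj, 0x3, 1);
+ */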
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : enable_skipb_frames                                      */
+/*                                                                           */
+/*  Description   : Control call to enable skipping of b frames              */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : codec_obj : Codec handle                                 */
+/*  Globals       :                                                          */
+/*  Processing    : Calls enable skip B frames control                       */
+/*                                                                           */
+/*  Outputs       :                                                          */
+/*  Returns       : Control call return status                               */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T enable_skipb_frames(void *codec_obj,
+                                         vid_dec_ctx_t *ps_app_ctx)
+{
+    ivd_ctl_set_config_ip_t s_ctl_ip;
+    ivd_ctl_set_config_op_t s_ctl_op;
+    IV_API_CALL_STATUS_T e_dec_status;
+
+    s_ctl_ip.u4_disp_wd = ps_app_ctx->u4_strd;
+    s_ctl_ip.e_frm_skip_mode = IVD_SKIP_B;
+
+    s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+    s_ctl_ip.e_vid_dec_mode = IVD_DECODE_FRAME;
+    s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+    s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+    s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+    s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+    e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+                                        (void *)&s_ctl_op);
+
+    if(IV_SUCCESS != e_dec_status)
+    {
+        printf("Error in Enable SkipB frames \n");
+    }
+
+    return e_dec_status;
+}
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : disable_skipb_frames                                     */
+/*                                                                           */
+/*  Description   : Control call to disable skipping of b frames             */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : codec_obj : Codec handle                                 */
+/*  Globals       :                                                          */
+/*  Processing    : Calls disable B frame skip control                       */
+/*                                                                           */
+/*  Outputs       :                                                          */
+/*  Returns       : Control call return status                               */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T disable_skipb_frames(void *codec_obj,
+                                          vid_dec_ctx_t *ps_app_ctx)
+{
+    ivd_ctl_set_config_ip_t s_ctl_ip;
+    ivd_ctl_set_config_op_t s_ctl_op;
+    IV_API_CALL_STATUS_T e_dec_status;
+
+    s_ctl_ip.u4_disp_wd = ps_app_ctx->u4_strd;
+    s_ctl_ip.e_frm_skip_mode = IVD_SKIP_NONE;
+
+    s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+    s_ctl_ip.e_vid_dec_mode = IVD_DECODE_FRAME;
+    s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+    s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+    s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+    s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+    e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+                                        (void *)&s_ctl_op);
+
+    if(IV_SUCCESS != e_dec_status)
+    {
+        printf("Error in Disable SkipB frames\n");
+    }
+
+    return e_dec_status;
+}
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : enable_skippb_frames                                     */
+/*                                                                           */
+/*  Description   : Control call to enable skipping of P & B frames          */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : codec_obj : Codec handle                                 */
+/*  Globals       :                                                          */
+/*  Processing    : Calls enable skip P and B frames control                 */
+/*                                                                           */
+/*  Outputs       :                                                          */
+/*  Returns       : Control call return status                               */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T enable_skippb_frames(void *codec_obj,
+                                          vid_dec_ctx_t *ps_app_ctx)
+{
+    ivd_ctl_set_config_ip_t s_ctl_ip;
+    ivd_ctl_set_config_op_t s_ctl_op;
+    IV_API_CALL_STATUS_T e_dec_status;
+
+    s_ctl_ip.u4_disp_wd = ps_app_ctx->u4_strd;
+    s_ctl_ip.e_frm_skip_mode = IVD_SKIP_PB;
+
+    s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+    s_ctl_ip.e_vid_dec_mode = IVD_DECODE_FRAME;
+    s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+    s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+    s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+    s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+    e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+                                        (void *)&s_ctl_op);
+    if(IV_SUCCESS != e_dec_status)
+    {
+        printf("Error in Enable SkipPB frames\n");
+    }
+
+    return e_dec_status;
+}
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : disable_skippb_frames                                    */
+/*                                                                           */
+/*  Description   : Control call to disable skipping of P and B frames       */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : codec_obj : Codec handle                                 */
+/*  Globals       :                                                          */
+/*  Processing    : Calls disable P and B frame skip control                 */
+/*                                                                           */
+/*  Outputs       :                                                          */
+/*  Returns       : Control call return status                               */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T disable_skippb_frames(void *codec_obj,
+                                           vid_dec_ctx_t *ps_app_ctx)
+{
+    ivd_ctl_set_config_ip_t s_ctl_ip;
+    ivd_ctl_set_config_op_t s_ctl_op;
+    IV_API_CALL_STATUS_T e_dec_status;
+
+    s_ctl_ip.u4_disp_wd = ps_app_ctx->u4_strd;
+    s_ctl_ip.e_frm_skip_mode = IVD_SKIP_NONE;
+
+    s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+    s_ctl_ip.e_vid_dec_mode = IVD_DECODE_FRAME;
+    s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+    s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+    s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+    s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+    e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+                                        (void *)&s_ctl_op);
+    if(IV_SUCCESS != e_dec_status)
+    {
+        printf("Error in Disable SkipPB frames\n");
+    }
+
+    return e_dec_status;
+}
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : release_disp_frame                                       */
+/*                                                                           */
+/*  Description   : Calls release display control - Used to signal to the    */
+/*                  decoder that this particular buffer has been displayed   */
+/*                  and that the codec is now free to write to this buffer   */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : codec_obj : Codec Handle                                 */
+/*                  buf_id    : Buffer Id of the buffer to be released       */
+/*                              This id would have been returned earlier by  */
+/*                              the codec                                    */
+/*  Globals       :                                                          */
+/*  Processing    : Calls Release Display call                               */
+/*                                                                           */
+/*  Outputs       :                                                          */
+/*  Returns       : Status of release display call                           */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T release_disp_frame(void *codec_obj, UWORD32 buf_id)
+{
+    ivd_rel_display_frame_ip_t s_video_rel_disp_ip;
+    ivd_rel_display_frame_op_t s_video_rel_disp_op;
+    IV_API_CALL_STATUS_T e_dec_status;
+
+    s_video_rel_disp_ip.e_cmd = IVD_CMD_REL_DISPLAY_FRAME;
+    s_video_rel_disp_ip.u4_size = sizeof(ivd_rel_display_frame_ip_t);
+    s_video_rel_disp_op.u4_size = sizeof(ivd_rel_display_frame_op_t);
+    s_video_rel_disp_ip.u4_disp_buf_id = buf_id;
+
+    e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_video_rel_disp_ip,
+                                        (void *)&s_video_rel_disp_op);
+    if(IV_SUCCESS != e_dec_status)
+    {
+        printf("Error in Release Disp frame\n");
+    }
+
+
+    return (e_dec_status);
+}
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : get_version                                              */
+/*                                                                           */
+/*  Description   : Control call to get codec version                        */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : codec_obj : Codec handle                                 */
+/*  Globals       :                                                          */
+/*  Processing    : Calls get version control                                */
+/*                                                                           */
+/*  Outputs       :                                                          */
+/*  Returns       : Control call return status                               */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T get_version(void *codec_obj)
+{
+    ivd_ctl_getversioninfo_ip_t s_ctl_dec_ip;
+    ivd_ctl_getversioninfo_op_t s_ctl_dec_op;
+    UWORD8 au1_buf[512];
+    IV_API_CALL_STATUS_T status;
+    s_ctl_dec_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+    s_ctl_dec_ip.e_sub_cmd = IVD_CMD_CTL_GETVERSION;
+    s_ctl_dec_ip.u4_size = sizeof(ivd_ctl_getversioninfo_ip_t);
+    s_ctl_dec_op.u4_size = sizeof(ivd_ctl_getversioninfo_op_t);
+    s_ctl_dec_ip.pv_version_buffer = au1_buf;
+    s_ctl_dec_ip.u4_version_buffer_size = sizeof(au1_buf);
+
+    status = ivd_cxa_api_function((iv_obj_t *)codec_obj,
+                                  (void *)&(s_ctl_dec_ip),
+                                  (void *)&(s_ctl_dec_op));
+
+    if(status != IV_SUCCESS)
+    {
+        printf("Error in Getting Version number e_dec_status = %d u4_error_code = %x\n",
+               status, s_ctl_dec_op.u4_error_code);
+    }
+    else
+    {
+        printf("Ittiam Decoder Version number: %s\n",
+               (char *)s_ctl_dec_ip.pv_version_buffer);
+    }
+    return status;
+}
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : codec_exit                                               */
+/*                                                                           */
+/*  Description   : Handles unrecoverable errors                             */
+/*  Inputs        : Error message                                            */
+/*  Globals       : None                                                     */
+/*  Processing    : Prints error message to console and exits.               */
+/*  Outputs       : Error message to the console                             */
+/*  Returns       : None                                                     */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
+/*         07 06 2006   Sankar          Creation                             */
+/*                                                                           */
+/*****************************************************************************/
+void codec_exit(CHAR *pc_err_message)
+{
+    printf("%s\n", pc_err_message);
+    exit(-1);
+}
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : dump_output                                              */
+/*                                                                           */
+/*  Description   : Used to dump output YUV                                  */
+/*  Inputs        : App context, disp output desc, File pointer              */
+/*  Globals       : None                                                     */
+/*  Processing    : Dumps to a file                                          */
+/*  Returns       : None                                                     */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
+/*         07 06 2006   Sankar          Creation                             */
+/*                                                                           */
+/*****************************************************************************/
+void dump_output(vid_dec_ctx_t *ps_app_ctx,
+                 iv_yuv_buf_t *ps_disp_frm_buf,
+                 UWORD32 u4_disp_frm_id,
+                 FILE *ps_op_file,
+                 FILE *ps_op_chksum_file,
+                 WORD32 i4_op_frm_ts,
+                 UWORD32 file_save,
+                 UWORD32 chksum_save)
+{
+    UWORD32 i;
+    iv_yuv_buf_t s_dump_disp_frm_buf;
+    UWORD32 u4_disp_id;
+
+    memset(&s_dump_disp_frm_buf, 0, sizeof(iv_yuv_buf_t));
+
+    if(ps_app_ctx->share_disp_buf)
+    {
+        if(ps_app_ctx->dump_q_wr_idx == MAX_DISP_BUFFERS)
+            ps_app_ctx->dump_q_wr_idx = 0;
+
+        if(ps_app_ctx->dump_q_rd_idx == MAX_DISP_BUFFERS)
+            ps_app_ctx->dump_q_rd_idx = 0;
+
+        ps_app_ctx->s_disp_frm_queue[ps_app_ctx->dump_q_wr_idx] =
+                        *ps_disp_frm_buf;
+        ps_app_ctx->s_disp_frm_id_queue[ps_app_ctx->dump_q_wr_idx] =
+                        u4_disp_frm_id;
+        ps_app_ctx->dump_q_wr_idx++;
+
+        if((WORD32)i4_op_frm_ts >= (WORD32)(ps_app_ctx->disp_delay - 1))
+        {
+            s_dump_disp_frm_buf =
+                            ps_app_ctx->s_disp_frm_queue[ps_app_ctx->dump_q_rd_idx];
+            u4_disp_id =
+                            ps_app_ctx->s_disp_frm_id_queue[ps_app_ctx->dump_q_rd_idx];
+            ps_app_ctx->dump_q_rd_idx++;
+        }
+        else
+        {
+            return;
+        }
+    }
+    else
+    {
+        s_dump_disp_frm_buf = *ps_disp_frm_buf;
+        u4_disp_id = u4_disp_frm_id;
+    }
+
+    release_disp_frame(ps_app_ctx->cocodec_obj, u4_disp_id);
+
+    if(0 == file_save && 0 == chksum_save)
+        return;
+
+    if(NULL == s_dump_disp_frm_buf.pv_y_buf)
+        return;
+
+    if(ps_app_ctx->e_output_chroma_format == IV_YUV_420P)
+    {
+#if DUMP_SINGLE_BUF
+        {
+            /* Dump the whole padded buffer in one write; the fixed offsets
+               below assume the decoder's internal padding around the plane */
+            UWORD8 *buf = s_dump_disp_frm_buf.pv_y_buf - 24 - (s_dump_disp_frm_buf.u4_y_strd * 40);
+
+            UWORD32 size = s_dump_disp_frm_buf.u4_y_strd * ((s_dump_disp_frm_buf.u4_y_ht + 80) + (s_dump_disp_frm_buf.u4_u_ht + 40));
+            fwrite(buf, 1, size, ps_op_file);
+
+        }
+#else
+        if(0 != file_save)
+        {
+            UWORD8 *buf;
+
+            buf = (UWORD8 *)s_dump_disp_frm_buf.pv_y_buf;
+            for(i = 0; i < s_dump_disp_frm_buf.u4_y_ht; i++)
+            {
+                fwrite(buf, 1, s_dump_disp_frm_buf.u4_y_wd, ps_op_file);
+                buf += s_dump_disp_frm_buf.u4_y_strd;
+            }
+
+            buf = (UWORD8 *)s_dump_disp_frm_buf.pv_u_buf;
+            for(i = 0; i < s_dump_disp_frm_buf.u4_u_ht; i++)
+            {
+                fwrite(buf, 1, s_dump_disp_frm_buf.u4_u_wd, ps_op_file);
+                buf += s_dump_disp_frm_buf.u4_u_strd;
+            }
+            buf = (UWORD8 *)s_dump_disp_frm_buf.pv_v_buf;
+            for(i = 0; i < s_dump_disp_frm_buf.u4_v_ht; i++)
+            {
+                fwrite(buf, 1, s_dump_disp_frm_buf.u4_v_wd, ps_op_file);
+                buf += s_dump_disp_frm_buf.u4_v_strd;
+            }
+
+        }
+
+        if(0 != chksum_save)
+        {
+            UWORD8 au1_y_chksum[16];
+            UWORD8 au1_u_chksum[16];
+            UWORD8 au1_v_chksum[16];
+            calc_md5_cksum((UWORD8 *)s_dump_disp_frm_buf.pv_y_buf,
+                           s_dump_disp_frm_buf.u4_y_strd,
+                           s_dump_disp_frm_buf.u4_y_wd,
+                           s_dump_disp_frm_buf.u4_y_ht,
+                           au1_y_chksum);
+            calc_md5_cksum((UWORD8 *)s_dump_disp_frm_buf.pv_u_buf,
+                           s_dump_disp_frm_buf.u4_u_strd,
+                           s_dump_disp_frm_buf.u4_u_wd,
+                           s_dump_disp_frm_buf.u4_u_ht,
+                           au1_u_chksum);
+            calc_md5_cksum((UWORD8 *)s_dump_disp_frm_buf.pv_v_buf,
+                           s_dump_disp_frm_buf.u4_v_strd,
+                           s_dump_disp_frm_buf.u4_v_wd,
+                           s_dump_disp_frm_buf.u4_v_ht,
+                           au1_v_chksum);
+
+            fwrite(au1_y_chksum, sizeof(UWORD8), 16, ps_op_chksum_file);
+            fwrite(au1_u_chksum, sizeof(UWORD8), 16, ps_op_chksum_file);
+            fwrite(au1_v_chksum, sizeof(UWORD8), 16, ps_op_chksum_file);
+        }
+#endif
+    }
+    else if((ps_app_ctx->e_output_chroma_format == IV_YUV_420SP_UV)
+                    || (ps_app_ctx->e_output_chroma_format == IV_YUV_420SP_VU))
+    {
+#if DUMP_SINGLE_BUF
+        {
+            /* Same single-write dump of the padded buffer as in the 420P
+               case above */
+            UWORD8 *buf = s_dump_disp_frm_buf.pv_y_buf - 24 - (s_dump_disp_frm_buf.u4_y_strd * 40);
+
+            UWORD32 size = s_dump_disp_frm_buf.u4_y_strd * ((s_dump_disp_frm_buf.u4_y_ht + 80) + (s_dump_disp_frm_buf.u4_u_ht + 40));
+            fwrite(buf, 1, size, ps_op_file);
+        }
+#else
+        {
+            UWORD8 *buf;
+
+            buf = (UWORD8 *)s_dump_disp_frm_buf.pv_y_buf;
+            for(i = 0; i < s_dump_disp_frm_buf.u4_y_ht; i++)
+            {
+                fwrite(buf, 1, s_dump_disp_frm_buf.u4_y_wd, ps_op_file);
+                buf += s_dump_disp_frm_buf.u4_y_strd;
+            }
+
+            buf = (UWORD8 *)s_dump_disp_frm_buf.pv_u_buf;
+            for(i = 0; i < s_dump_disp_frm_buf.u4_u_ht; i++)
+            {
+                fwrite(buf, 1, s_dump_disp_frm_buf.u4_u_wd, ps_op_file);
+                buf += s_dump_disp_frm_buf.u4_u_strd;
+            }
+        }
+#endif
+    }
+    else if(ps_app_ctx->e_output_chroma_format == IV_RGBA_8888)
+    {
+        UWORD8 *buf;
+
+        buf = (UWORD8 *)s_dump_disp_frm_buf.pv_y_buf;
+        for(i = 0; i < s_dump_disp_frm_buf.u4_y_ht; i++)
+        {
+            fwrite(buf, 1, s_dump_disp_frm_buf.u4_y_wd * 4, ps_op_file);
+            buf += s_dump_disp_frm_buf.u4_y_strd * 4;
+        }
+    }
+    else
+    {
+        UWORD8 *buf;
+
+        buf = (UWORD8 *)s_dump_disp_frm_buf.pv_y_buf;
+        for(i = 0; i < s_dump_disp_frm_buf.u4_y_ht; i++)
+        {
+            fwrite(buf, 1, s_dump_disp_frm_buf.u4_y_strd * 2, ps_op_file);
+            buf += s_dump_disp_frm_buf.u4_y_strd * 2;
+        }
+    }
+
+    fflush(ps_op_file);
+    fflush(ps_op_chksum_file);
+
+}
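+
+/*
+ * Illustration only (not part of the build): a decoded plane is typically
+ * wider than the visible picture, so each row must be written separately and
+ * the pointer advanced by the stride, as dump_output() does above. A minimal
+ * sketch, assuming an 8-bit plane:
+ *
+ *     static void write_plane(const UWORD8 *buf, UWORD32 wd, UWORD32 ht,
+ *                             UWORD32 strd, FILE *fp)
+ *     {
+ *         UWORD32 row;
+ *         for(row = 0; row < ht; row++)
+ *         {
+ *             fwrite(buf, 1, wd, fp); // write only the visible width
+ *             buf += strd;            // skip padding to the next row
+ *         }
+ *     }
+ */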
+
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : print_usage                                              */
+/*                                                                           */
+/*  Description   : Prints argument format                                   */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        :                                                          */
+/*  Globals       :                                                          */
+/*  Processing    : Prints argument format                                   */
+/*                                                                           */
+/*  Outputs       :                                                          */
+/*  Returns       :                                                          */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+void print_usage(void)
+{
+    WORD32 i = 0;
+    WORD32 num_entries = sizeof(argument_mapping) / sizeof(argument_t);
+    printf("\nUsage:\n");
+    while(i < num_entries)
+    {
+        printf("%-32s\t %s", argument_mapping[i].argument_name,
+               argument_mapping[i].description);
+        i++;
+    }
+}
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : get_argument                                             */
+/*                                                                           */
+/*  Description   : Gets argument for a given string                         */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : name                                                     */
+/*  Globals       :                                                          */
+/*  Processing    : Searches the given string in the array and returns       */
+/*                  appropriate argument ID                                  */
+/*                                                                           */
+/*  Outputs       : Argument ID                                              */
+/*  Returns       : Argument ID                                              */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+ARGUMENT_T get_argument(CHAR *name)
+{
+    WORD32 i = 0;
+    WORD32 num_entries = sizeof(argument_mapping) / sizeof(argument_t);
+    while(i < num_entries)
+    {
+        if((0 == strcmp(argument_mapping[i].argument_name, name)) ||
+                        ((0 == strcmp(argument_mapping[i].argument_shortname, name)) &&
+                                        (0 != strcmp(argument_mapping[i].argument_shortname, "--"))))
+        {
+            return argument_mapping[i].argument;
+        }
+        i++;
+    }
+    return INVALID;
+}
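+
+/*
+ * For reference: get_argument() above assumes entries of roughly this shape
+ * in the global argument_mapping table (field names inferred from usage in
+ * this file; the actual definition lives elsewhere):
+ *
+ *     typedef struct
+ *     {
+ *         CHAR argument_shortname[...];  // e.g. "-i"; "--" means unused
+ *         CHAR argument_name[...];       // e.g. "--input"
+ *         ARGUMENT_T argument;           // enum ID returned to the caller
+ *         CHAR description[...];         // printed by print_usage()
+ *     } argument_t;
+ */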
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : parse_argument                                           */
+/*                                                                           */
+/*  Description   : Parses one argument-value pair                           */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : ps_app_ctx : Application context                         */
+/*                  argument   : Argument name                               */
+/*                  value      : Argument value                              */
+/*  Globals       :                                                          */
+/*  Processing    : Maps the argument string to an ID and updates the        */
+/*                  corresponding field in the application context           */
+/*                                                                           */
+/*  Outputs       : Updated application context                              */
+/*  Returns       : None                                                     */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+void parse_argument(vid_dec_ctx_t *ps_app_ctx, CHAR *argument, CHAR *value)
+{
+    ARGUMENT_T arg;
+
+    arg = get_argument(argument);
+    switch(arg)
+    {
+        case HELP:
+            print_usage();
+            exit(-1);
+        case VERSION:
+            break;
+        case INPUT_FILE:
+            sscanf(value, "%s", ps_app_ctx->ac_ip_fname);
+            //input_passed = 1;
+            break;
+
+        case OUTPUT:
+            sscanf(value, "%s", ps_app_ctx->ac_op_fname);
+            break;
+
+        case CHKSUM:
+            sscanf(value, "%s", ps_app_ctx->ac_op_chksum_fname);
+            break;
+
+        case SAVE_OUTPUT:
+            sscanf(value, "%d", &ps_app_ctx->u4_file_save_flag);
+            break;
+
+        case SAVE_CHKSUM:
+            sscanf(value, "%d", &ps_app_ctx->u4_chksum_save_flag);
+            break;
+
+        case CHROMA_FORMAT:
+            if((strcmp(value, "YUV_420P")) == 0)
+                ps_app_ctx->e_output_chroma_format = IV_YUV_420P;
+            else if((strcmp(value, "YUV_422ILE")) == 0)
+                ps_app_ctx->e_output_chroma_format = IV_YUV_422ILE;
+            else if((strcmp(value, "RGB_565")) == 0)
+                ps_app_ctx->e_output_chroma_format = IV_RGB_565;
+            else if((strcmp(value, "RGBA_8888")) == 0)
+                ps_app_ctx->e_output_chroma_format = IV_RGBA_8888;
+            else if((strcmp(value, "YUV_420SP_UV")) == 0)
+                ps_app_ctx->e_output_chroma_format = IV_YUV_420SP_UV;
+            else if((strcmp(value, "YUV_420SP_VU")) == 0)
+                ps_app_ctx->e_output_chroma_format = IV_YUV_420SP_VU;
+            else
+            {
+                printf("\nInvalid colour format setting it to IV_YUV_420P\n");
+                ps_app_ctx->e_output_chroma_format = IV_YUV_420P;
+            }
+
+            break;
+        case NUM_FRAMES:
+            sscanf(value, "%d", &ps_app_ctx->u4_max_frm_ts);
+            break;
+
+        case NUM_CORES:
+            sscanf(value, "%d", &ps_app_ctx->u4_num_cores);
+            break;
+        case DEGRADE_PICS:
+            sscanf(value, "%d", &ps_app_ctx->i4_degrade_pics);
+            break;
+        case DEGRADE_TYPE:
+            sscanf(value, "%d", &ps_app_ctx->i4_degrade_type);
+            break;
+        case SHARE_DISPLAY_BUF:
+            sscanf(value, "%d", &ps_app_ctx->share_disp_buf);
+            break;
+        case LOOPBACK:
+            sscanf(value, "%d", &ps_app_ctx->loopback);
+            break;
+        case DISPLAY:
+#if defined(SDL_DISPLAY) || defined(FBDEV_DISPLAY) || defined(INTEL_CE5300) || defined(IOS_DISPLAY)
+            sscanf(value, "%d", &ps_app_ctx->display);
+#else
+            ps_app_ctx->display = 0;
+#endif
+            break;
+        case FULLSCREEN:
+            sscanf(value, "%d", &ps_app_ctx->full_screen);
+            break;
+        case FPS:
+            sscanf(value, "%d", &ps_app_ctx->fps);
+            if(ps_app_ctx->fps <= 0)
+                ps_app_ctx->fps = DEFAULT_FPS;
+            break;
+        case MAX_WD:
+            sscanf(value, "%d", &ps_app_ctx->max_wd);
+            break;
+        case MAX_HT:
+            sscanf(value, "%d", &ps_app_ctx->max_ht);
+            break;
+        case MAX_LEVEL:
+            sscanf(value, "%d", &ps_app_ctx->max_level);
+            break;
+        case ARCH:
+            if((strcmp(value, "ARM_NONEON")) == 0)
+                ps_app_ctx->e_arch = ARCH_ARM_NONEON;
+            else if((strcmp(value, "ARM_A9Q")) == 0)
+                ps_app_ctx->e_arch = ARCH_ARM_A9Q;
+            else if((strcmp(value, "ARM_A7")) == 0)
+                ps_app_ctx->e_arch = ARCH_ARM_A7;
+            else if((strcmp(value, "ARM_A5")) == 0)
+                ps_app_ctx->e_arch = ARCH_ARM_A5;
+            else if((strcmp(value, "ARM_NEONINTR")) == 0)
+                ps_app_ctx->e_arch = ARCH_ARM_NEONINTR;
+            else if((strcmp(value, "X86_GENERIC")) == 0)
+                ps_app_ctx->e_arch = ARCH_X86_GENERIC;
+            else if((strcmp(value, "X86_SSSE3")) == 0)
+                ps_app_ctx->e_arch = ARCH_X86_SSSE3;
+            else if((strcmp(value, "X86_SSE42")) == 0)
+                ps_app_ctx->e_arch = ARCH_X86_SSE42;
+            else if((strcmp(value, "X86_AVX2")) == 0)
+                ps_app_ctx->e_arch = ARCH_X86_AVX2;
+            else if((strcmp(value, "MIPS_GENERIC")) == 0)
+                ps_app_ctx->e_arch = ARCH_MIPS_GENERIC;
+            else if((strcmp(value, "MIPS_32")) == 0)
+                ps_app_ctx->e_arch = ARCH_MIPS_32;
+            else
+            {
+                printf("\nInvalid Arch. Setting it to ARM_A9Q\n");
+                ps_app_ctx->e_arch = ARCH_ARM_A9Q;
+            }
+
+            break;
+        case SOC:
+            if((strcmp(value, "GENERIC")) == 0)
+                ps_app_ctx->e_soc = SOC_GENERIC;
+            else if((strcmp(value, "HISI_37X")) == 0)
+                ps_app_ctx->e_soc = SOC_HISI_37X;
+            else
+            {
+                ps_app_ctx->e_soc = atoi(value);
+/*
+                printf("\nInvalid SOC. Setting it to GENERIC\n");
+                ps_app_ctx->e_soc = SOC_GENERIC;
+*/
+            }
+            break;
+        case PICLEN:
+            sscanf(value, "%d", &ps_app_ctx->u4_piclen_flag);
+            break;
+
+        case PICLEN_FILE:
+            sscanf(value, "%s", ps_app_ctx->ac_piclen_fname);
+            break;
+
+#ifdef GPU_BUILD
+        case ENABLE_GPU:
+            sscanf(value, "%d", &ps_app_ctx->u4_gpu_enable_diable);
+            break;
+#endif
+        case INVALID:
+        default:
+            printf("Ignoring argument :  %s\n", argument);
+            break;
+    }
+}
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : read_cfg_file                                            */
+/*                                                                           */
+/*  Description   : Reads arguments from a configuration file                */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : ps_app_ctx  : Application context                        */
+/*                  fp_cfg_file : Configuration file handle                  */
+/*  Globals       :                                                          */
+/*  Processing    : Parses the arguments and fills in the application context*/
+/*                                                                           */
+/*  Outputs       : Arguments parsed                                         */
+/*  Returns       : None                                                     */
+/*                                                                           */
+/*  Issues        :                                                          */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+void read_cfg_file(vid_dec_ctx_t *ps_app_ctx, FILE *fp_cfg_file)
+{
+
+    CHAR line[STRLENGTH];
+    CHAR description[STRLENGTH];
+    CHAR value[STRLENGTH];
+    CHAR argument[STRLENGTH];
+    void *ret;
+    while(0 == feof(fp_cfg_file))
+    {
+        line[0] = '\0';
+        ret = fgets(line, STRLENGTH, fp_cfg_file);
+        if(NULL == ret)
+            break;
+        argument[0] = '\0';
+        /* Parse one "argument value description" line */
+        sscanf(line, "%s %s %s", argument, value, description);
+        if(argument[0] == '\0')
+            continue;
+
+        parse_argument(ps_app_ctx, argument, value);
+    }
+
+
+}
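+
+/*
+ * The parser above reads one "argument value description" triple per line
+ * (tokens are whitespace separated; only the first two are used). A
+ * hypothetical test.cfg fragment, with illustrative argument names:
+ *
+ *     --input        stream.bin   input_bitstream
+ *     --num_cores    4            number_of_cores
+ *     --save_output  1            dump_decoded_yuv
+ */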
+
+/*!
+**************************************************************************
+* \if Function name : dispq_producer_dequeue \endif
+*
+* \brief
+*    This function gets a free buffer index where display data can be written
+*    This is a blocking call and can be exited by setting quit to true in
+*    the application context
+*
+* \param[in]  ps_app_ctx  : Pointer to application context
+*
+* \return
+*    returns Next free buffer index for producer
+*
+* \author
+*  Ittiam
+*
+**************************************************************************
+*/
+WORD32 dispq_producer_dequeue(vid_dec_ctx_t *ps_app_ctx)
+{
+    WORD32 idx;
+
+    /* If there is no free buffer wait */
+
+    while(((ps_app_ctx->disp_q_wr_idx + 1) % NUM_DISPLAY_BUFFERS) == ps_app_ctx->disp_q_rd_idx)
+    {
+
+        ithread_msleep(1);
+
+        if(ps_app_ctx->quit)
+            return (-1);
+    }
+
+    idx = ps_app_ctx->disp_q_wr_idx;
+    return (idx);
+}
+
+/*!
+**************************************************************************
+* \if Function name : dispq_producer_queue \endif
+*
+* \brief
+*    This function adds buffer which can be displayed
+*
+* \param[in]  ps_app_ctx  : Pointer to application context
+*
+* \return
+*    returns 0 on success
+*
+* \author
+*  Ittiam
+*
+**************************************************************************
+*/
+WORD32 dispq_producer_queue(vid_dec_ctx_t *ps_app_ctx)
+{
+    ps_app_ctx->disp_q_wr_idx++;
+    if(ps_app_ctx->disp_q_wr_idx == NUM_DISPLAY_BUFFERS)
+        ps_app_ctx->disp_q_wr_idx = 0;
+
+    return (0);
+}
+/*!
+**************************************************************************
+* \if Function name : dispq_consumer_dequeue \endif
+*
+* \brief
+*    This function gets the index of the next buffer that holds a frame
+*    ready for display. This is a blocking call and can be exited by
+*    setting quit to true in the application context
+*
+* \param[in]  ps_app_ctx  : Pointer to application context
+*
+* \return
+*    returns next filled buffer index for the consumer
+*
+* \author
+*  Ittiam
+*
+**************************************************************************
+*/
+WORD32 dispq_consumer_dequeue(vid_dec_ctx_t *ps_app_ctx)
+{
+    WORD32 idx;
+
+    /* If there is no free buffer wait */
+
+    while(ps_app_ctx->disp_q_wr_idx == ps_app_ctx->disp_q_rd_idx)
+    {
+
+        ithread_msleep(1);
+
+        if(ps_app_ctx->quit)
+            return (-1);
+    }
+
+    idx = ps_app_ctx->disp_q_rd_idx;
+    return (idx);
+}
+
+/*!
+**************************************************************************
+* \if Function name : dispq_consumer_queue \endif
+*
+* \brief
+*    This function releases a buffer back to the queue after it has been
+*    displayed
+*
+* \param[in]  ps_app_ctx  : Pointer to application context
+*
+* \return
+*    returns 0 on success
+*
+* \author
+*  Ittiam
+*
+**************************************************************************
+*/
+WORD32 dispq_consumer_queue(vid_dec_ctx_t *ps_app_ctx)
+{
+    ps_app_ctx->disp_q_rd_idx++;
+    if(ps_app_ctx->disp_q_rd_idx == NUM_DISPLAY_BUFFERS)
+        ps_app_ctx->disp_q_rd_idx = 0;
+
+    return (0);
+}
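+
+/*
+ * Note: the four dispq_* helpers above form a single-producer/single-consumer
+ * ring queue over NUM_DISPLAY_BUFFERS slots. The queue is empty when
+ * wr_idx == rd_idx and full when (wr_idx + 1) % NUM_DISPLAY_BUFFERS == rd_idx,
+ * so one slot always stays unused. Worked example with 4 slots:
+ *
+ *     start          wr=0 rd=0   empty
+ *     produce x3     wr=3 rd=0   full: (3+1)%4 == 0, producer blocks
+ *     consume x1     wr=3 rd=1   one slot free again
+ */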
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : display_thread                                           */
+/*                                                                           */
+/*  Description   : Thread to display the frame                              */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : pv_ctx  : Application context                            */
+/*                                                                           */
+/*  Globals       :                                                          */
+/*  Processing    : Wait for a buffer to get produced by decoder and display */
+/*                  that frame                                               */
+/*                                                                           */
+/*  Outputs       :                                                          */
+/*  Returns       : None                                                     */
+/*                                                                           */
+/*  Issues        : Pause followed by quit can cause a deadlock.             */
+/*                  If the decoder lags initially and then speeds up, the    */
+/*                  display also runs faster until it reaches equilibrium    */
+/*                  with respect to the initial time                         */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 05 2013   100578          Initial Version                      */
+/*                                                                           */
+/*****************************************************************************/
+
+WORD32 display_thread(void *pv_ctx)
+{
+    vid_dec_ctx_t *ps_app_ctx = (vid_dec_ctx_t *)pv_ctx;
+
+
+    UWORD32 frm_duration; /* in us */
+    UWORD32 current_time;
+    UWORD32 expected_time;
+    TIMER   s_end_timer;
+    TIMER   s_first_frame_time;
+    UWORD32 first_frame_displayed;
+
+#ifdef X86_MINGW
+    UWORD32 frequency = 0;
+#endif
+#ifdef X86_MSVC
+    TIMER frequency;
+#endif
+
+#ifdef X86_MSVC
+    QueryPerformanceFrequency(&frequency);
+#endif
+    first_frame_displayed = 0;
+    expected_time = 0;
+    frm_duration = 1000000 / ps_app_ctx->fps;
+
+    /* Init display and allocate display buffers */
+    ps_app_ctx->pv_disp_ctx = (void *)ps_app_ctx->disp_init(ps_app_ctx->u4_pic_wd,
+                                                            ps_app_ctx->u4_pic_ht,
+                                                            ps_app_ctx->i4_screen_wd,
+                                                            ps_app_ctx->i4_screen_ht,
+                                                            ps_app_ctx->max_wd,
+                                                            ps_app_ctx->max_ht,
+                                                            ps_app_ctx->full_screen,
+                                                            &ps_app_ctx->quit,
+                                                            &ps_app_ctx->paused);
+    ps_app_ctx->alloc_disp_buffers(ps_app_ctx->pv_disp_ctx);
+
+    ps_app_ctx->display_init_done = 1;
+
+    while(1)
+    {
+        WORD32 rd_idx;
+
+        rd_idx = dispq_consumer_dequeue(ps_app_ctx);
+        if(ps_app_ctx->quit)
+            break;
+
+        ps_app_ctx->display_buffer(ps_app_ctx->pv_disp_ctx, rd_idx);
+
+        if(0 == first_frame_displayed)
+        {
+            GETTIME(&s_first_frame_time);
+            first_frame_displayed = 1;
+        }
+
+        /*********************************************************************/
+        /* Sleep based on the expected display time of the current frame     */
+        /* relative to the time the first frame was shown                    */
+        /*********************************************************************/
+
+        GETTIME(&s_end_timer);
+        ELAPSEDTIME(s_first_frame_time, s_end_timer, current_time, frequency);
+
+        /* time in micro second */
+        expected_time += frm_duration;
+
+        //printf("current_time %d expected_time %d diff %d \n", current_time, expected_time, (expected_time - current_time));
+        /* sleep for the diff. in time */
+        if(current_time < expected_time)
+            ps_app_ctx->disp_usleep((expected_time - current_time));
+        else
+            expected_time += (current_time - expected_time);
+
+        dispq_consumer_queue(ps_app_ctx);
+
+    }
+
+
+    while(0 == ps_app_ctx->display_deinit_flag)
+    {
+        ps_app_ctx->disp_usleep(1000);
+    }
+    ps_app_ctx->disp_deinit(ps_app_ctx->pv_disp_ctx);
+
+    /* destroy the display thread */
+    ithread_exit(ps_app_ctx->display_thread_handle);
+
+    return 0;
+}
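+
+/*
+ * Worked example of the pacing logic above: at fps = 25, frm_duration is
+ * 1000000 / 25 = 40000 us, so frame n is due 40000 * n us after the first
+ * displayed frame. If the decoder is early the thread sleeps for the
+ * difference; if it is late, expected_time is advanced to the current time
+ * so the schedule re-anchors instead of trying to catch up.
+ */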
+
+void flush_output(iv_obj_t *codec_obj,
+                  vid_dec_ctx_t *ps_app_ctx,
+                  ivd_out_bufdesc_t *ps_out_buf,
+                  UWORD8 *pu1_bs_buf,
+                  UWORD32 *pu4_op_frm_ts,
+                  FILE *ps_op_file,
+                  FILE *ps_op_chksum_file,
+                  UWORD32 u4_ip_frm_ts,
+                  UWORD32 u4_bytes_remaining)
+{
+    WORD32 ret;
+
+    do
+    {
+
+        ivd_ctl_flush_ip_t s_ctl_ip;
+        ivd_ctl_flush_op_t s_ctl_op;
+
+        if(*pu4_op_frm_ts >= (ps_app_ctx->u4_max_frm_ts + ps_app_ctx->disp_delay))
+            break;
+
+        s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+        s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_FLUSH;
+        s_ctl_ip.u4_size = sizeof(ivd_ctl_flush_ip_t);
+        s_ctl_op.u4_size = sizeof(ivd_ctl_flush_op_t);
+        ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+                                   (void *)&s_ctl_op);
+
+        if(ret != IV_SUCCESS)
+        {
+            printf("Error in Setting the decoder in flush mode\n");
+        }
+
+        if(IV_SUCCESS == ret)
+        {
+            ivd_video_decode_ip_t s_video_decode_ip;
+            ivd_video_decode_op_t s_video_decode_op;
+
+            s_video_decode_ip.e_cmd = IVD_CMD_VIDEO_DECODE;
+            s_video_decode_ip.u4_ts = u4_ip_frm_ts;
+            s_video_decode_ip.pv_stream_buffer = pu1_bs_buf;
+            s_video_decode_ip.u4_num_Bytes = u4_bytes_remaining;
+            s_video_decode_ip.u4_size = sizeof(ivd_video_decode_ip_t);
+            s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[0] =
+                            ps_out_buf->u4_min_out_buf_size[0];
+            s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[1] =
+                            ps_out_buf->u4_min_out_buf_size[1];
+            s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[2] =
+                            ps_out_buf->u4_min_out_buf_size[2];
+
+            s_video_decode_ip.s_out_buffer.pu1_bufs[0] =
+                            ps_out_buf->pu1_bufs[0];
+            s_video_decode_ip.s_out_buffer.pu1_bufs[1] =
+                            ps_out_buf->pu1_bufs[1];
+            s_video_decode_ip.s_out_buffer.pu1_bufs[2] =
+                            ps_out_buf->pu1_bufs[2];
+            s_video_decode_ip.s_out_buffer.u4_num_bufs =
+                            ps_out_buf->u4_num_bufs;
+
+            s_video_decode_op.u4_size = sizeof(ivd_video_decode_op_t);
+
+            /*****************************************************************************/
+            /*   API Call: Video Decode                                                  */
+            /*****************************************************************************/
+            ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_video_decode_ip,
+                                       (void *)&s_video_decode_op);
+
+            if(1 == s_video_decode_op.u4_output_present)
+            {
+                dump_output(ps_app_ctx, &(s_video_decode_op.s_disp_frm_buf),
+                            s_video_decode_op.u4_disp_buf_id, ps_op_file,
+                            ps_op_chksum_file,
+                            *pu4_op_frm_ts, ps_app_ctx->u4_file_save_flag,
+                            ps_app_ctx->u4_chksum_save_flag);
+
+                (*pu4_op_frm_ts)++;
+            }
+        }
+    }while(IV_SUCCESS == ret);
+
+}
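+
+/*
+ * Flush protocol used above: after IVD_CMD_CTL_FLUSH succeeds, each
+ * IVD_CMD_VIDEO_DECODE call drains one frame still held inside the decoder
+ * (signalled by u4_output_present) rather than consuming new bitstream. The
+ * loop ends when a call fails, i.e. no frames remain, or when the display
+ * delay budget (u4_max_frm_ts + disp_delay) is exhausted.
+ */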
+
+#ifdef X86_MINGW
+void sigsegv_handler()
+{
+    printf("Segmentation fault, Exiting.. \n");
+    exit(-1);
+}
+#endif
+
+UWORD32 default_get_stride(void)
+{
+    return 0;
+}
+
+
+IV_COLOR_FORMAT_T default_get_color_fmt(void)
+{
+    return IV_YUV_420P;
+}
+/*****************************************************************************/
+/*                                                                           */
+/*  Function Name : main                                                     */
+/*                                                                           */
+/*  Description   : Application to demonstrate codec API                     */
+/*                                                                           */
+/*                                                                           */
+/*  Inputs        : argc    - Number of arguments                            */
+/*                  argv[]  - Arguments                                      */
+/*  Globals       :                                                          */
+/*  Processing    : Shows how to use create, process, control and delete     */
+/*                                                                           */
+/*  Outputs       : Codec output in a file                                   */
+/*  Returns       :                                                          */
+/*                                                                           */
+/*  Issues        : Assumes PROFILE_ENABLE to be defined for the             */
+/*                  multithreaded decode-display path to work                */
+/*                                                                           */
+/*  Revision History:                                                        */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes                              */
+/*         07 09 2012   100189          Initial Version                      */
+/*         09 05 2013   100578          Multithread decode-display           */
+/*****************************************************************************/
+#ifdef IOS
+int hevcdec_main(char *homedir, char *documentdir, int screen_wd, int screen_ht)
+#else
+int main(WORD32 argc, CHAR *argv[])
+#endif
+{
+    CHAR ac_cfg_fname[STRLENGTH];
+    FILE *fp_cfg_file = NULL;
+    FILE *ps_piclen_file = NULL;
+    FILE *ps_ip_file = NULL;
+    FILE *ps_op_file = NULL;
+    FILE *ps_op_chksum_file = NULL;
+    WORD32 ret;
+    CHAR ac_error_str[STRLENGTH];
+    vid_dec_ctx_t s_app_ctx;
+    UWORD8 *pu1_bs_buf;
+
+    ivd_out_bufdesc_t *ps_out_buf;
+    UWORD32 u4_num_bytes_dec = 0;
+    UWORD32 file_pos = 0;
+    IV_API_CALL_STATUS_T e_dec_status;
+    UWORD32 u4_ip_frm_ts = 0, u4_op_frm_ts = 0;
+
+    WORD32 u4_bytes_remaining = 0;
+    void *pv_mem_rec_location;
+    UWORD32 u4_num_mem_recs;
+    UWORD32 i;
+    UWORD32 u4_ip_buf_len;
+    UWORD32 frm_cnt = 0;
+    WORD32 total_bytes_comsumed;
+
+#ifdef PROFILE_ENABLE
+    UWORD32 u4_tot_cycles = 0;
+    UWORD32 u4_tot_fmt_cycles = 0;
+    UWORD32 peak_window[PEAK_WINDOW_SIZE];
+    UWORD32 peak_window_idx = 0;
+    UWORD32 peak_avg_max = 0;
+#ifdef INTEL_CE5300
+    UWORD32 time_consumed = 0;
+    UWORD32 bytes_consumed = 0;
+#endif
+#endif
+
+#ifdef X86_MINGW
+    UWORD32 frequency = 0;
+#endif
+#ifdef X86_MSVC
+    TIMER frequency;
+#endif
+    WORD32 width = 0, height = 0;
+    iv_obj_t *codec_obj;
+#if defined(GPU_BUILD) && !defined(X86)
+//    int ioctl_init();
+//    ioctl_init();
+#endif
+
+#ifdef X86_MINGW
+    // Disable stdio buffering so prints appear immediately
+    setvbuf(stdout, NULL, _IONBF, 0);
+    setvbuf(stderr, NULL, _IONBF, 0);
+#endif
+#ifdef IOS
+    sprintf(filename_trace, "%s/iostrace.txt", homedir);
+    printf("\ntrace file name = %s", filename_trace);
+#endif
+
+#ifdef X86_MINGW
+    {
+        signal(SIGSEGV, sigsegv_handler);
+    }
+#endif
+
+
+#ifndef IOS
+    /* Usage */
+    if(argc < 2)
+    {
+        printf("Using test.cfg as configuration file \n");
+        strcpy(ac_cfg_fname, "test.cfg");
+    }
+    else if(argc == 2)
+    {
+        strcpy(ac_cfg_fname, argv[1]);
+    }
+
+#else
+    strcpy(ac_cfg_fname, "test.cfg");
+
+#endif
+
+
+    /***********************************************************************/
+    /*                  Initialize Application parameters                  */
+    /***********************************************************************/
+
+    strcpy(s_app_ctx.ac_ip_fname, "\0");
+    s_app_ctx.dump_q_wr_idx = 0;
+    s_app_ctx.dump_q_rd_idx = 0;
+    s_app_ctx.display_thread_created = 0;
+    s_app_ctx.disp_q_wr_idx = 0;
+    s_app_ctx.disp_q_rd_idx = 0;
+    s_app_ctx.disp_delay = 0;
+    s_app_ctx.loopback = 0;
+    s_app_ctx.display = 0;
+    s_app_ctx.full_screen = 0;
+    s_app_ctx.u4_piclen_flag = 0;
+    s_app_ctx.fps = DEFAULT_FPS;
+    file_pos = 0;
+    total_bytes_comsumed = 0;
+    u4_ip_frm_ts = 0;
+    u4_op_frm_ts = 0;
+#ifdef PROFILE_ENABLE
+    memset(peak_window, 0, sizeof(WORD32) * PEAK_WINDOW_SIZE);
+#endif
+    s_app_ctx.share_disp_buf = DEFAULT_SHARE_DISPLAY_BUF;
+    s_app_ctx.u4_num_cores = DEFAULT_NUM_CORES;
+    s_app_ctx.i4_degrade_type = 0;
+    s_app_ctx.i4_degrade_pics = 0;
+    s_app_ctx.max_wd = 0;
+    s_app_ctx.max_ht = 0;
+    s_app_ctx.max_level = 0;
+    s_app_ctx.e_arch = ARCH_ARM_A9Q;
+    s_app_ctx.e_soc = SOC_GENERIC;
+
+    s_app_ctx.u4_strd = STRIDE;
+
+    s_app_ctx.display_thread_handle = malloc(ithread_get_handle_size());
+    s_app_ctx.quit = 0;
+    s_app_ctx.paused = 0;
+    //s_app_ctx.u4_output_present = 0;
+
+#ifdef GPU_BUILD
+    s_app_ctx.u4_gpu_enable_diable = 0;
+#endif
+    s_app_ctx.get_stride = &default_get_stride;
+
+    s_app_ctx.get_color_fmt = &default_get_color_fmt;
+
+    /* Set function pointers for display */
+#ifdef SDL_DISPLAY
+    s_app_ctx.disp_init = &sdl_disp_init;
+    s_app_ctx.alloc_disp_buffers = &sdl_alloc_disp_buffers;
+    s_app_ctx.display_buffer = &sdl_display;
+    s_app_ctx.set_disp_buffers = &sdl_set_disp_buffers;
+    s_app_ctx.disp_deinit = &sdl_disp_deinit;
+    s_app_ctx.disp_usleep = &sdl_disp_usleep;
+    s_app_ctx.get_color_fmt = &sdl_get_color_fmt;
+    s_app_ctx.get_stride = &sdl_get_stride;
+#endif
+
+#ifdef FBDEV_DISPLAY
+    s_app_ctx.disp_init = &fbd_disp_init;
+    s_app_ctx.alloc_disp_buffers = &fbd_alloc_disp_buffers;
+    s_app_ctx.display_buffer = &fbd_display;
+    s_app_ctx.set_disp_buffers = &fbd_set_disp_buffers;
+    s_app_ctx.disp_deinit = &fbd_disp_deinit;
+    s_app_ctx.disp_usleep = &fbd_disp_usleep;
+    s_app_ctx.get_color_fmt = &fbd_get_color_fmt;
+    s_app_ctx.get_stride = &fbd_get_stride;
+#endif
+
+#ifdef INTEL_CE5300
+    s_app_ctx.disp_init = &gdl_disp_init;
+    s_app_ctx.alloc_disp_buffers = &gdl_alloc_disp_buffers;
+    s_app_ctx.display_buffer = &gdl_display;
+    s_app_ctx.set_disp_buffers = &gdl_set_disp_buffers;
+    s_app_ctx.disp_deinit = &gdl_disp_deinit;
+    s_app_ctx.disp_usleep = &gdl_disp_usleep;
+    s_app_ctx.get_color_fmt = &gdl_get_color_fmt;
+    s_app_ctx.get_stride = &gdl_get_stride;
+#endif
+
+#ifdef IOS_DISPLAY
+    s_app_ctx.disp_init = &ios_disp_init;
+    s_app_ctx.alloc_disp_buffers = &ios_alloc_disp_buffers;
+    s_app_ctx.display_buffer = &ios_display;
+    s_app_ctx.set_disp_buffers = &ios_set_disp_buffers;
+    s_app_ctx.disp_deinit = &ios_disp_deinit;
+    s_app_ctx.disp_usleep = &ios_disp_usleep;
+    s_app_ctx.get_color_fmt = &ios_get_color_fmt;
+    s_app_ctx.get_stride = &ios_get_stride;
+#endif
+
+    s_app_ctx.display_deinit_flag = 0;
+    s_app_ctx.e_output_chroma_format = IV_YUV_420SP_UV;
+    /*************************************************************************/
+    /* Parse arguments                                                       */
+    /*************************************************************************/
+
+#ifndef IOS
+    /* Read command line arguments */
+    if(argc > 2)
+    {
+        for(i = 1; i < (UWORD32)argc; i += 2)
+        {
+            if(CONFIG == get_argument(argv[i]))
+            {
+                strcpy(ac_cfg_fname, argv[i + 1]);
+                if((fp_cfg_file = fopen(ac_cfg_fname, "r")) == NULL)
+                {
+                    sprintf(ac_error_str, "Could not open Configuration file %s",
+                            ac_cfg_fname);
+                    codec_exit(ac_error_str);
+                }
+                read_cfg_file(&s_app_ctx, fp_cfg_file);
+                fclose(fp_cfg_file);
+            }
+            else
+            {
+                parse_argument(&s_app_ctx, argv[i], argv[i + 1]);
+            }
+        }
+    }
+    else
+    {
+        if((fp_cfg_file = fopen(ac_cfg_fname, "r")) == NULL)
+        {
+            sprintf(ac_error_str, "Could not open Configuration file %s",
+                    ac_cfg_fname);
+            codec_exit(ac_error_str);
+        }
+        read_cfg_file(&s_app_ctx, fp_cfg_file);
+        fclose(fp_cfg_file);
+    }
+#else
+    sprintf(filename_with_path, "%s/%s", homedir, ac_cfg_fname);
+    if((fp_cfg_file = fopen(filename_with_path, "r")) == NULL)
+    {
+        sprintf(ac_error_str, "Could not open Configuration file %s",
+                ac_cfg_fname);
+        codec_exit(ac_error_str);
+
+    }
+    read_cfg_file(&s_app_ctx, fp_cfg_file);
+    fclose(fp_cfg_file);
+
+#endif
+#ifdef PRINT_PICSIZE
+    /* If the binary is used for only getting number of bytes in each picture, then disable the following features */
+    s_app_ctx.u4_piclen_flag = 0;
+    s_app_ctx.u4_file_save_flag = 0;
+    s_app_ctx.u4_chksum_save_flag = 0;
+    s_app_ctx.i4_degrade_pics = 0;
+    s_app_ctx.i4_degrade_type = 0;
+    s_app_ctx.loopback = 0;
+    s_app_ctx.share_disp_buf = 0;
+    s_app_ctx.display = 0;
+#endif
+
+    /* If display is enabled, then turn off shared mode and get color format that is supported by display */
+    if(1 == s_app_ctx.display)
+    {
+        s_app_ctx.share_disp_buf = 0;
+        s_app_ctx.e_output_chroma_format = s_app_ctx.get_color_fmt();
+    }
+    if(strcmp(s_app_ctx.ac_ip_fname, "\0") == 0)
+    {
+        printf("\nNo input file given for decoding\n");
+        exit(-1);
+    }
+
+
+    /***********************************************************************/
+    /*          create the file object for input file                      */
+    /***********************************************************************/
+#ifdef IOS
+    sprintf(filename_with_path, "%s/%s", homedir, s_app_ctx.ac_ip_fname);
+    ps_ip_file = fopen(filename_with_path, "rb");
+#else
+    ps_ip_file = fopen(s_app_ctx.ac_ip_fname, "rb");
+#endif
+    if(NULL == ps_ip_file)
+    {
+        sprintf(ac_error_str, "Could not open input file %s",
+                s_app_ctx.ac_ip_fname);
+        codec_exit(ac_error_str);
+    }
+    /***********************************************************************/
+    /*          create the file object for picture length file             */
+    /***********************************************************************/
+    if(1 == s_app_ctx.u4_piclen_flag)
+    {
+#ifdef IOS
+        sprintf(filename_with_path, "%s/%s", homedir, s_app_ctx.ac_piclen_fname);
+        ps_piclen_file = fopen(filename_with_path, "rb");
+#else
+        ps_piclen_file = fopen(s_app_ctx.ac_piclen_fname, "rb");
+#endif
+        if(NULL == ps_piclen_file)
+        {
+            sprintf(ac_error_str, "Could not open piclen file %s",
+                    s_app_ctx.ac_piclen_fname);
+            codec_exit(ac_error_str);
+        }
+    }
+
+    /***********************************************************************/
+    /*          create the file object for output file                     */
+    /***********************************************************************/
+    if(1 == s_app_ctx.u4_file_save_flag)
+    {
+#ifdef IOS
+        sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctx.ac_op_fname);
+        ps_op_file = fopen(filename_with_path, "wb");
+#else
+        ps_op_file = fopen(s_app_ctx.ac_op_fname, "wb");
+#endif
+
+        if(NULL == ps_op_file)
+        {
+            sprintf(ac_error_str, "Could not open output file %s",
+                    s_app_ctx.ac_op_fname);
+            codec_exit(ac_error_str);
+        }
+    }
+
+    /***********************************************************************/
+    /*          create the file object for check sum file                  */
+    /***********************************************************************/
+    if(1 == s_app_ctx.u4_chksum_save_flag)
+    {
+#ifdef IOS
+        sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctx.ac_op_chksum_fname);
+        ps_op_chksum_file = fopen(filename_with_path, "wb");
+#else
+        ps_op_chksum_file = fopen(s_app_ctx.ac_op_chksum_fname, "wb");
+#endif
+        if(NULL == ps_op_chksum_file)
+        {
+            sprintf(ac_error_str, "Could not open check sum file %s",
+                    s_app_ctx.ac_op_chksum_fname);
+            codec_exit(ac_error_str);
+        }
+    }
+    /***********************************************************************/
+    /*                      Create decoder instance                        */
+    /***********************************************************************/
+    {
+
+        ps_out_buf = (ivd_out_bufdesc_t *)malloc(sizeof(ivd_out_bufdesc_t));
+
+        {
+            iv_num_mem_rec_ip_t s_no_of_mem_rec_query_ip;
+            iv_num_mem_rec_op_t s_no_of_mem_rec_query_op;
+
+            s_no_of_mem_rec_query_ip.u4_size = sizeof(s_no_of_mem_rec_query_ip);
+            s_no_of_mem_rec_query_op.u4_size = sizeof(s_no_of_mem_rec_query_op);
+            s_no_of_mem_rec_query_ip.e_cmd = IV_CMD_GET_NUM_MEM_REC;
+
+            /*****************************************************************************/
+            /*   API Call: Get Number of Mem Records                                     */
+            /*****************************************************************************/
+            e_dec_status = ivd_cxa_api_function(
+                            NULL, (void *)&s_no_of_mem_rec_query_ip,
+                            (void *)&s_no_of_mem_rec_query_op);
+            if(IV_SUCCESS != e_dec_status)
+            {
+                sprintf(ac_error_str, "Error in get mem records");
+                codec_exit(ac_error_str);
+            }
+
+            u4_num_mem_recs = s_no_of_mem_rec_query_op.u4_num_mem_rec;
+        }
+
+        pv_mem_rec_location = malloc(u4_num_mem_recs * sizeof(iv_mem_rec_t));
+        if(pv_mem_rec_location == NULL)
+        {
+            sprintf(ac_error_str, "Allocation failure for mem_rec_location");
+            codec_exit(ac_error_str);
+
+        }
+
+        {
+            ihevcd_cxa_fill_mem_rec_ip_t s_fill_mem_rec_ip;
+            ihevcd_cxa_fill_mem_rec_op_t s_fill_mem_rec_op;
+            iv_mem_rec_t *ps_mem_rec;
+            UWORD32 total_size;
+
+            s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.e_cmd =
+                            IV_CMD_FILL_NUM_MEM_REC;
+            s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location =
+                            (iv_mem_rec_t *)pv_mem_rec_location;
+            s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_wd =
+                            (s_app_ctx.max_wd == 0) ? MAX_FRAME_WIDTH : s_app_ctx.max_wd;
+            s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_ht =
+                            (s_app_ctx.max_ht == 0) ? MAX_FRAME_HEIGHT : s_app_ctx.max_ht;
+            s_fill_mem_rec_ip.i4_level = (s_app_ctx.max_level == 0) ? MAX_LEVEL_SUPPORTED : s_app_ctx.max_level;
+            s_fill_mem_rec_ip.u4_num_ref_frames = MAX_REF_FRAMES;
+            s_fill_mem_rec_ip.u4_num_reorder_frames = MAX_REORDER_FRAMES;
+            s_fill_mem_rec_ip.u4_share_disp_buf = s_app_ctx.share_disp_buf;
+            s_fill_mem_rec_ip.e_output_format =
+                            (IV_COLOR_FORMAT_T)s_app_ctx.e_output_chroma_format;
+            s_fill_mem_rec_ip.u4_num_extra_disp_buf = EXTRA_DISP_BUFFERS;
+
+            s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_size =
+                            sizeof(ihevcd_cxa_fill_mem_rec_ip_t);
+            s_fill_mem_rec_op.s_ivd_fill_mem_rec_op_t.u4_size =
+                            sizeof(ihevcd_cxa_fill_mem_rec_op_t);
+
+            ps_mem_rec = (iv_mem_rec_t *)pv_mem_rec_location;
+            for(i = 0; i < u4_num_mem_recs; i++)
+                ps_mem_rec[i].u4_size = sizeof(iv_mem_rec_t);
+
+            /*****************************************************************************/
+            /*   API Call: Fill Mem Records                                              */
+            /*****************************************************************************/
+
+            e_dec_status = ivd_cxa_api_function(NULL,
+                                                (void *)&s_fill_mem_rec_ip,
+                                                (void *)&s_fill_mem_rec_op);
+
+            u4_num_mem_recs =
+                            s_fill_mem_rec_op.s_ivd_fill_mem_rec_op_t.u4_num_mem_rec_filled;
+
+            if(IV_SUCCESS != e_dec_status)
+            {
+                sprintf(ac_error_str, "Error in fill mem records: %x", s_fill_mem_rec_op.s_ivd_fill_mem_rec_op_t.u4_error_code);
+                codec_exit(ac_error_str);
+            }
+
+            ps_mem_rec = (iv_mem_rec_t *)pv_mem_rec_location;
+            total_size = 0;
+            for(i = 0; i < u4_num_mem_recs; i++)
+            {
+                ps_mem_rec->pv_base = ihevca_aligned_malloc(ps_mem_rec->u4_mem_alignment,
+                                                            ps_mem_rec->u4_mem_size);
+                if(ps_mem_rec->pv_base == NULL)
+                {
+                    sprintf(ac_error_str,
+                            "\nAllocation failure for mem record id %d size %d\n",
+                            i, ps_mem_rec->u4_mem_size);
+                    codec_exit(ac_error_str);
+
+                }
+                total_size += ps_mem_rec->u4_mem_size;
+
+                ps_mem_rec++;
+            }
+            //printf("\nTotal memory for codec %d\n", total_size);
+        }
+        /*****************************************************************************/
+        /*   API Call: Initialize the Decoder                                        */
+        /*****************************************************************************/
+        {
+            ihevcd_cxa_init_ip_t s_init_ip;
+            ihevcd_cxa_init_op_t s_init_op;
+            void *fxns = &ivd_cxa_api_function;
+            iv_mem_rec_t *mem_tab;
+
+            mem_tab = (iv_mem_rec_t *)pv_mem_rec_location;
+            s_init_ip.s_ivd_init_ip_t.e_cmd = (IVD_API_COMMAND_TYPE_T)IV_CMD_INIT;
+            s_init_ip.s_ivd_init_ip_t.pv_mem_rec_location = mem_tab;
+            s_init_ip.s_ivd_init_ip_t.u4_frm_max_wd = (s_app_ctx.max_wd == 0) ? MAX_FRAME_WIDTH : s_app_ctx.max_wd;
+            s_init_ip.s_ivd_init_ip_t.u4_frm_max_ht = (s_app_ctx.max_ht == 0) ? MAX_FRAME_HEIGHT : s_app_ctx.max_ht;
+            s_init_ip.i4_level = (s_app_ctx.max_level == 0) ? MAX_LEVEL_SUPPORTED : s_app_ctx.max_level;
+            s_init_ip.u4_num_ref_frames = MAX_REF_FRAMES;
+            s_init_ip.u4_num_reorder_frames = MAX_REORDER_FRAMES;
+            s_init_ip.u4_share_disp_buf = s_app_ctx.share_disp_buf;
+            s_init_ip.u4_num_extra_disp_buf = EXTRA_DISP_BUFFERS;
+            s_init_ip.s_ivd_init_ip_t.u4_num_mem_rec = u4_num_mem_recs;
+            s_init_ip.s_ivd_init_ip_t.e_output_format =
+                            (IV_COLOR_FORMAT_T)s_app_ctx.e_output_chroma_format;
+            s_init_ip.s_ivd_init_ip_t.u4_size = sizeof(ihevcd_cxa_init_ip_t);
+            s_init_op.s_ivd_init_op_t.u4_size = sizeof(ihevcd_cxa_init_op_t);
+
+            codec_obj = (iv_obj_t *)mem_tab[0].pv_base;
+            codec_obj->pv_fxns = fxns;
+            codec_obj->u4_size = sizeof(iv_obj_t);
+
+            s_app_ctx.cocodec_obj = codec_obj;
+
+            ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_init_ip,
+                                       (void *)&s_init_op);
+            if(ret != IV_SUCCESS)
+            {
+                sprintf(ac_error_str, "Error in Init %8x\n",
+                        s_init_op.s_ivd_init_op_t.u4_error_code);
+                codec_exit(ac_error_str);
+            }
+
+            /*****************************************************************************/
+            /*  Input and output buffer allocation                                       */
+            /*****************************************************************************/
+            {
+
+                ivd_ctl_getbufinfo_ip_t s_ctl_ip;
+                ivd_ctl_getbufinfo_op_t s_ctl_op;
+
+                s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+                s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_GETBUFINFO;
+                s_ctl_ip.u4_size = sizeof(ivd_ctl_getbufinfo_ip_t);
+                s_ctl_op.u4_size = sizeof(ivd_ctl_getbufinfo_op_t);
+                ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+                                           (void *)&s_ctl_op);
+                if(ret != IV_SUCCESS)
+                {
+                    sprintf(ac_error_str, "Error in Get Buf Info %x", s_ctl_op.u4_error_code);
+                    codec_exit(ac_error_str);
+                }
+
+                /* Allocate input buffer */
+                u4_ip_buf_len = s_ctl_op.u4_min_in_buf_size[0];
+                pu1_bs_buf = (UWORD8 *)malloc(u4_ip_buf_len);
+
+                if(pu1_bs_buf == NULL)
+                {
+                    sprintf(ac_error_str,
+                            "\nAllocation failure for input buffer of size %d",
+                            u4_ip_buf_len);
+                    codec_exit(ac_error_str);
+                }
+                s_app_ctx.num_disp_buf = s_ctl_op.u4_num_disp_bufs;
+                /* Allocate output buffer only if display buffers are not shared */
+                /* Or if shared and output is 420P */
+                if((0 == s_app_ctx.share_disp_buf) || (IV_YUV_420P == s_app_ctx.e_output_chroma_format))
+                {
+                    UWORD32 outlen;
+                    ps_out_buf->u4_min_out_buf_size[0] =
+                                    s_ctl_op.u4_min_out_buf_size[0];
+                    ps_out_buf->u4_min_out_buf_size[1] =
+                                    s_ctl_op.u4_min_out_buf_size[1];
+                    ps_out_buf->u4_min_out_buf_size[2] =
+                                    s_ctl_op.u4_min_out_buf_size[2];
+
+                    outlen = s_ctl_op.u4_min_out_buf_size[0];
+                    if(s_ctl_op.u4_min_num_out_bufs > 1)
+                        outlen += s_ctl_op.u4_min_out_buf_size[1];
+
+                    if(s_ctl_op.u4_min_num_out_bufs > 2)
+                        outlen += s_ctl_op.u4_min_out_buf_size[2];
+
+                    ps_out_buf->pu1_bufs[0] = (UWORD8 *)malloc(outlen);
+                    if(ps_out_buf->pu1_bufs[0] == NULL)
+                    {
+                        sprintf(ac_error_str,
+                                "\nAllocation failure for output buffer of size %d",
+                                outlen);
+                        codec_exit(ac_error_str);
+                    }
+
+                    if(s_ctl_op.u4_min_num_out_bufs > 1)
+                        ps_out_buf->pu1_bufs[1] = ps_out_buf->pu1_bufs[0]
+                                        + (s_ctl_op.u4_min_out_buf_size[0]);
+
+                    if(s_ctl_op.u4_min_num_out_bufs > 2)
+                        ps_out_buf->pu1_bufs[2] = ps_out_buf->pu1_bufs[1]
+                                        + (s_ctl_op.u4_min_out_buf_size[1]);
+
+                    ps_out_buf->u4_num_bufs = s_ctl_op.u4_min_num_out_bufs;
+                }
+
+            }
+        }
+
+    }
+
+
+    /*************************************************************************/
+    /* set num of cores                                                      */
+    /*************************************************************************/
+    {
+
+        ihevcd_cxa_ctl_set_num_cores_ip_t s_ctl_set_cores_ip;
+        ihevcd_cxa_ctl_set_num_cores_op_t s_ctl_set_cores_op;
+
+        s_ctl_set_cores_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+        s_ctl_set_cores_ip.e_sub_cmd = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_SET_NUM_CORES;
+        s_ctl_set_cores_ip.u4_num_cores = s_app_ctx.u4_num_cores;
+        s_ctl_set_cores_ip.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_ip_t);
+        s_ctl_set_cores_op.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_op_t);
+
+        ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_set_cores_ip,
+                                   (void *)&s_ctl_set_cores_op);
+        if(ret != IV_SUCCESS)
+        {
+            sprintf(ac_error_str, "\nError in setting number of cores");
+            codec_exit(ac_error_str);
+        }
+
+    }
+    /*************************************************************************/
+    /* set processor                                                         */
+    /*************************************************************************/
+    {
+
+        ihevcd_cxa_ctl_set_processor_ip_t s_ctl_set_num_processor_ip;
+        ihevcd_cxa_ctl_set_processor_op_t s_ctl_set_num_processor_op;
+
+        s_ctl_set_num_processor_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+        s_ctl_set_num_processor_ip.e_sub_cmd = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_SET_PROCESSOR;
+        s_ctl_set_num_processor_ip.u4_arch = s_app_ctx.e_arch;
+        s_ctl_set_num_processor_ip.u4_soc = s_app_ctx.e_soc;
+        s_ctl_set_num_processor_ip.u4_size = sizeof(ihevcd_cxa_ctl_set_processor_ip_t);
+        s_ctl_set_num_processor_op.u4_size = sizeof(ihevcd_cxa_ctl_set_processor_op_t);
+
+        ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_set_num_processor_ip,
+                                   (void *)&s_ctl_set_num_processor_op);
+        if(ret != IV_SUCCESS)
+        {
+            sprintf(ac_error_str, "\nError in setting Processor type");
+            codec_exit(ac_error_str);
+        }
+
+    }
+
+#ifdef GPU_BUILD
+    /*************************************************************************/
+    /* Enable/Disable GPU                                                    */
+    /*************************************************************************/
+    {
+
+        ihevcd_cxa_ctl_gpu_enable_diable_ip_t s_ctl_gpu_cnl_ip;
+        ihevcd_cxa_ctl_gpu_enable_diable_op_t s_ctl_gpu_cnl_op;
+
+        s_ctl_gpu_cnl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+        s_ctl_gpu_cnl_ip.e_sub_cmd = IHEVCD_CXA_CMD_CTL_GPU_ENABLE_DISABLE;
+        s_ctl_gpu_cnl_ip.u4_gpu_enable_diable = s_app_ctx.u4_gpu_enable_diable;
+        s_ctl_gpu_cnl_ip.u4_size =
+                        sizeof(ihevcd_cxa_ctl_gpu_enable_diable_ip_t);
+        s_ctl_gpu_cnl_op.u4_size =
+                        sizeof(ihevcd_cxa_ctl_gpu_enable_diable_op_t);
+
+        ret = ivd_cxa_api_function(codec_obj, (void *)&s_ctl_gpu_cnl_ip,
+                                   (void *)&s_ctl_gpu_cnl_op);
+        if(ret != IV_SUCCESS)
+        {
+            sprintf(ac_error_str, "\nError enalbing/disabling GPU");
+            //codec_exit(ac_error_str);
+
+        }
+
+    }
+#endif
+
+    /*****************************************************************************/
+    /*   Decode the header to get width, height and buffer sizes                 */
+    /*****************************************************************************/
+    {
+
+        ivd_ctl_set_config_ip_t s_ctl_ip;
+        ivd_ctl_set_config_op_t s_ctl_op;
+
+        ivd_video_decode_ip_t s_video_decode_ip;
+        ivd_video_decode_op_t s_video_decode_op;
+
+        s_ctl_ip.u4_disp_wd = STRIDE;
+        if(1 == s_app_ctx.display)
+            s_ctl_ip.u4_disp_wd = s_app_ctx.get_stride();
+
+        s_ctl_ip.e_frm_skip_mode = IVD_SKIP_NONE;
+        s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+        s_ctl_ip.e_vid_dec_mode = IVD_DECODE_HEADER;
+        s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+        s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+        s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+        s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+        ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+                                   (void *)&s_ctl_op);
+        if(ret != IV_SUCCESS)
+        {
+            sprintf(ac_error_str,
+                    "\nError in setting the codec in header decode mode");
+            codec_exit(ac_error_str);
+        }
+
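+        /* Feed bitstream chunks to the decoder until the header is parsed  */
+        /* successfully; each pass advances file_pos by the number of bytes */
+        /* the codec reports as consumed.                                   */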
+        do
+        {
+            WORD32 numbytes;
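+            /* When a piclen file is given, it is expected to hold one      */
+            /* decimal frame size per line, e.g.                            */
+            /*     4567                                                     */
+            /*     1234                                                     */
+            /* so each fscanf below yields one picture's worth of bytes.    */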
+            if(0 == s_app_ctx.u4_piclen_flag)
+            {
+                fseek(ps_ip_file, file_pos, SEEK_SET);
+                numbytes = u4_ip_buf_len;
+            }
+            else
+            {
+                WORD32 entries;
+                entries = fscanf(ps_piclen_file, "%d\n", &numbytes);
+                if(1 != entries)
+                    numbytes = u4_ip_buf_len;
+            }
+
+            u4_bytes_remaining = fread(pu1_bs_buf, sizeof(UWORD8), numbytes,
+                                       ps_ip_file);
+
+            if(0 == u4_bytes_remaining)
+            {
+                sprintf(ac_error_str, "\nUnable to read from input file");
+                codec_exit(ac_error_str);
+            }
+
+            s_video_decode_ip.e_cmd = IVD_CMD_VIDEO_DECODE;
+            s_video_decode_ip.u4_ts = u4_ip_frm_ts;
+            s_video_decode_ip.pv_stream_buffer = pu1_bs_buf;
+            s_video_decode_ip.u4_num_Bytes = u4_bytes_remaining;
+            s_video_decode_ip.u4_size = sizeof(ivd_video_decode_ip_t);
+            s_video_decode_op.u4_size = sizeof(ivd_video_decode_op_t);
+
+            /*****************************************************************************/
+            /*   API Call: Header Decode                                                  */
+            /*****************************************************************************/
+            ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_video_decode_ip,
+                                       (void *)&s_video_decode_op);
+
+            if(ret != IV_SUCCESS)
+            {
+                sprintf(ac_error_str, "\nError in header decode %x",
+                        s_video_decode_op.u4_error_code);
+                // codec_exit(ac_error_str);
+            }
+
+            u4_num_bytes_dec = s_video_decode_op.u4_num_bytes_consumed;
+#ifndef PROFILE_ENABLE
+            printf("%d\n", s_video_decode_op.u4_num_bytes_consumed);
+#endif
+            file_pos += u4_num_bytes_dec;
+            total_bytes_comsumed += u4_num_bytes_dec;
+        }while(ret != IV_SUCCESS);
+
+        /* copy pic_wd and pic_ht to initialize buffers */
+        s_app_ctx.u4_pic_wd = s_video_decode_op.u4_pic_wd;
+        s_app_ctx.u4_pic_ht = s_video_decode_op.u4_pic_ht;
+
+#if IOS_DISPLAY
+        s_app_ctx.i4_screen_wd = screen_wd;
+        s_app_ctx.i4_screen_ht = screen_ht;
+#endif
+
+        /* Create display thread and wait for the display buffers to be initialized */
+        if(1 == s_app_ctx.display)
+        {
+            if(0 == s_app_ctx.display_thread_created)
+            {
+                s_app_ctx.display_init_done = 0;
+                ithread_create(s_app_ctx.display_thread_handle, NULL,
+                               (void *)&display_thread, (void *)&s_app_ctx);
+                s_app_ctx.display_thread_created = 1;
+
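+                /* Poll (sleeping 1 ms per iteration) until the display     */
+                /* thread signals that its initialization is complete.      */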
+                while(1)
+                {
+                    if(s_app_ctx.display_init_done)
+                        break;
+
+                    ithread_msleep(1);
+                }
+            }
+
+            s_app_ctx.u4_strd = s_app_ctx.get_stride();
+        }
+    }
+
+    /*************************************************************************/
+    /* Get the actual number of output buffers required, which depends      */
+    /* on stream properties such as width, height and level.                */
+    /* This is needed mainly for shared display mode.                       */
+    /*************************************************************************/
+    //if(1 == s_app_ctx.share_disp_buf)
+    {
+        ivd_ctl_getbufinfo_ip_t s_ctl_ip;
+        ivd_ctl_getbufinfo_op_t s_ctl_op;
+        WORD32 outlen = 0;
+
+        s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+        s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_GETBUFINFO;
+        s_ctl_ip.u4_size = sizeof(ivd_ctl_getbufinfo_ip_t);
+        s_ctl_op.u4_size = sizeof(ivd_ctl_getbufinfo_op_t);
+        ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+                                   (void *)&s_ctl_op);
+        if(ret != IV_SUCCESS)
+        {
+            sprintf(ac_error_str, "Error in Get Buf Info %x", s_ctl_op.u4_error_code);
+            codec_exit(ac_error_str);
+        }
+
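+        /* This second GETBUFINFO reflects the parsed stream; the counts and */
+        /* sizes queried before header decode were worst-case values for the */
+        /* configured maximum width, height and level.                       */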
+#ifdef APP_EXTRA_BUFS
+        s_app_ctx.disp_delay = EXTRA_DISP_BUFFERS;
+        s_ctl_op.u4_num_disp_bufs += EXTRA_DISP_BUFFERS;
+#endif
+
+        /*****************************************************************************/
+        /*   API Call: Allocate display buffers for display buffer shared case       */
+        /*****************************************************************************/
+
+        for(i = 0; i < s_ctl_op.u4_num_disp_bufs; i++)
+        {
+
+            s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[0] =
+                            s_ctl_op.u4_min_out_buf_size[0];
+            s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[1] =
+                            s_ctl_op.u4_min_out_buf_size[1];
+            s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[2] =
+                            s_ctl_op.u4_min_out_buf_size[2];
+
+            outlen = s_ctl_op.u4_min_out_buf_size[0];
+            if(s_ctl_op.u4_min_num_out_bufs > 1)
+                outlen += s_ctl_op.u4_min_out_buf_size[1];
+
+            if(s_ctl_op.u4_min_num_out_bufs > 2)
+                outlen += s_ctl_op.u4_min_out_buf_size[2];
+
+            s_app_ctx.s_disp_buffers[i].pu1_bufs[0] = (UWORD8 *)malloc(outlen);
+
+            if(s_app_ctx.s_disp_buffers[i].pu1_bufs[0] == NULL)
+            {
+                sprintf(ac_error_str,
+                        "\nAllocation failure for output buffer of size %d",
+                        outlen);
+                codec_exit(ac_error_str);
+            }
+
+            if(s_ctl_op.u4_min_num_out_bufs > 1)
+                s_app_ctx.s_disp_buffers[i].pu1_bufs[1] =
+                                s_app_ctx.s_disp_buffers[i].pu1_bufs[0]
+                                                + (s_ctl_op.u4_min_out_buf_size[0]);
+
+            if(s_ctl_op.u4_min_num_out_bufs > 2)
+                s_app_ctx.s_disp_buffers[i].pu1_bufs[2] =
+                                s_app_ctx.s_disp_buffers[i].pu1_bufs[1]
+                                                + (s_ctl_op.u4_min_out_buf_size[1]);
+
+            s_app_ctx.s_disp_buffers[i].u4_num_bufs =
+                            s_ctl_op.u4_min_num_out_bufs;
+        }
+        s_app_ctx.num_disp_buf = s_ctl_op.u4_num_disp_bufs;
+
+        /*****************************************************************************/
+        /*   API Call: Send the allocated display buffers to codec                   */
+        /*****************************************************************************/
+        {
+            ivd_set_display_frame_ip_t s_set_display_frame_ip;
+            ivd_set_display_frame_op_t s_set_display_frame_op;
+
+            s_set_display_frame_ip.e_cmd = IVD_CMD_SET_DISPLAY_FRAME;
+            s_set_display_frame_ip.u4_size = sizeof(ivd_set_display_frame_ip_t);
+            s_set_display_frame_op.u4_size = sizeof(ivd_set_display_frame_op_t);
+
+            s_set_display_frame_ip.num_disp_bufs = s_app_ctx.num_disp_buf;
+
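+            /* Hand the app-allocated buffer descriptors to the codec so    */
+            /* that it can decode directly into display memory.             */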
+            memcpy(&(s_set_display_frame_ip.s_disp_buffer),
+                   &(s_app_ctx.s_disp_buffers),
+                   s_ctl_op.u4_num_disp_bufs * sizeof(ivd_out_bufdesc_t));
+
+            ret = ivd_cxa_api_function((iv_obj_t *)codec_obj,
+                                       (void *)&s_set_display_frame_ip,
+                                       (void *)&s_set_display_frame_op);
+
+            if(IV_SUCCESS != ret)
+            {
+                sprintf(ac_error_str, "Error in Set display frame");
+                codec_exit(ac_error_str);
+            }
+
+        }
+
+    }
+
+    /*************************************************************************/
+    /* Get frame dimensions for display buffers such as x_offset, y_offset   */
+    /* etc. This information might be needed to set display buffer           */
+    /* offsets in case of shared display buffer mode                         */
+    /*************************************************************************/
+    {
+
+        ihevcd_cxa_ctl_get_frame_dimensions_ip_t s_ctl_get_frame_dimensions_ip;
+        ihevcd_cxa_ctl_get_frame_dimensions_op_t s_ctl_get_frame_dimensions_op;
+
+        s_ctl_get_frame_dimensions_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+        s_ctl_get_frame_dimensions_ip.e_sub_cmd =
+                        (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_GET_BUFFER_DIMENSIONS;
+        s_ctl_get_frame_dimensions_ip.u4_size =
+                        sizeof(ihevcd_cxa_ctl_get_frame_dimensions_ip_t);
+        s_ctl_get_frame_dimensions_op.u4_size =
+                        sizeof(ihevcd_cxa_ctl_get_frame_dimensions_op_t);
+
+        ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_get_frame_dimensions_ip,
+                                   (void *)&s_ctl_get_frame_dimensions_op);
+        if(IV_SUCCESS != ret)
+        {
+            sprintf(ac_error_str, "Error in Get buffer Dimensions");
+            codec_exit(ac_error_str);
+        }
+
+/*
+        printf("Frame offsets due to padding\n");
+        printf("s_ctl_get_frame_dimensions_op.x_offset[0] %d s_ctl_get_frame_dimensions_op.y_offset[0] %d\n",
+               s_ctl_get_frame_dimensions_op.u4_x_offset[0],
+               s_ctl_get_frame_dimensions_op.u4_y_offset[0]);
+*/
+    }
+
+
+    /*************************************************************************/
+    /* Get VUI parameters                                                    */
+    /*************************************************************************/
+    {
+
+        ihevcd_cxa_ctl_get_vui_params_ip_t s_ctl_get_vui_params_ip;
+        ihevcd_cxa_ctl_get_vui_params_op_t s_ctl_get_vui_params_op;
+
+        s_ctl_get_vui_params_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+        s_ctl_get_vui_params_ip.e_sub_cmd =
+                        (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_GET_VUI_PARAMS;
+        s_ctl_get_vui_params_ip.u4_size =
+                        sizeof(ihevcd_cxa_ctl_get_vui_params_ip_t);
+        s_ctl_get_vui_params_op.u4_size =
+                        sizeof(ihevcd_cxa_ctl_get_vui_params_op_t);
+
+        ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_get_vui_params_ip,
+                                   (void *)&s_ctl_get_vui_params_op);
+        if(IV_SUCCESS != ret)
+        {
+            sprintf(ac_error_str, "Error in Get VUI params");
+            //codec_exit(ac_error_str);
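+            /* Non-fatal: VUI parameters are optional in an HEVC stream. */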
+        }
+
+    }
+
+
+    /*************************************************************************/
+    /* Set the decoder in frame decode mode. It was set in header decode     */
+    /* mode earlier                                                          */
+    /*************************************************************************/
+    {
+
+        ivd_ctl_set_config_ip_t s_ctl_ip;
+        ivd_ctl_set_config_op_t s_ctl_op;
+
+        s_ctl_ip.u4_disp_wd = STRIDE;
+        if(1 == s_app_ctx.display)
+            s_ctl_ip.u4_disp_wd = s_app_ctx.get_stride();
+        s_ctl_ip.e_frm_skip_mode = IVD_SKIP_NONE;
+
+        s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+        s_ctl_ip.e_vid_dec_mode = IVD_DECODE_FRAME;
+        s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+        s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+        s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+
+        s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+        ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip, (void *)&s_ctl_op);
+
+        if(IV_SUCCESS != ret)
+        {
+            sprintf(ac_error_str, "Error in Set Parameters");
+            //codec_exit(ac_error_str);
+        }
+
+    }
+    /*************************************************************************/
+    /* If required, disable deblocking and SAO at the given level            */
+    /*************************************************************************/
+    set_degrade(codec_obj, s_app_ctx.i4_degrade_type, s_app_ctx.i4_degrade_pics);
+#ifdef X86_MSVC
+    QueryPerformanceFrequency(&frequency);
+#endif
+#ifndef PRINT_PICSIZE
+    get_version(codec_obj);
+#endif
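+    /* Main decode loop: runs until the number of frames output reaches    */
+    /* max_frm_ts plus the display delay, or breaks out early when fread() */
+    /* returns no data and loopback is disabled.                           */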
+    while(u4_op_frm_ts < (s_app_ctx.u4_max_frm_ts + s_app_ctx.disp_delay))
+    {
+
+#ifdef TEST_FLUSH
+        if(u4_ip_frm_ts == FLUSH_FRM_CNT)
+        {
+            ivd_ctl_flush_ip_t s_ctl_ip;
+            ivd_ctl_flush_op_t s_ctl_op;
+
+            s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+            s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_FLUSH;
+            s_ctl_ip.u4_size = sizeof(ivd_ctl_flush_ip_t);
+            s_ctl_op.u4_size = sizeof(ivd_ctl_flush_op_t);
+            ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+                                       (void *)&s_ctl_op);
+
+            if(ret != IV_SUCCESS)
+            {
+                printf("Error in Setting the decoder in flush mode\n");
+            }
+            file_pos = 0;
+
+            fseek(ps_ip_file, file_pos, SEEK_SET);
+
+        }
+#endif
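+        /* For the first num_disp_buf iterations, release one display buffer */
+        /* back to the codec so that it owns buffers to decode into.         */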
+        if(u4_ip_frm_ts < s_app_ctx.num_disp_buf)
+        {
+            release_disp_frame(codec_obj, u4_ip_frm_ts);
+        }
+
+
+        /*************************************************************************/
+        /* set num of cores                                                      */
+        /*************************************************************************/
+#ifdef DYNAMIC_NUMCORES
+        {
+
+            ihevcd_cxa_ctl_set_num_cores_ip_t s_ctl_set_cores_ip;
+            ihevcd_cxa_ctl_set_num_cores_op_t s_ctl_set_cores_op;
+
+            s_ctl_set_cores_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+            s_ctl_set_cores_ip.e_sub_cmd = IHEVCD_CXA_CMD_CTL_SET_NUM_CORES;
+            s_ctl_set_cores_ip.u4_num_cores =  1 + 3 * (u4_ip_frm_ts % 2);
+            s_ctl_set_cores_ip.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_ip_t);
+            s_ctl_set_cores_op.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_op_t);
+
+            ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_set_cores_ip,
+                                       (void *)&s_ctl_set_cores_op);
+            if(ret != IV_SUCCESS)
+            {
+                sprintf(ac_error_str, "\nError in setting number of cores");
+                codec_exit(ac_error_str);
+            }
+
+        }
+#endif
+        /***********************************************************************/
+        /*   Seek the file to the start of the current frame; this is the    */
+        /*   equivalent of having a parser that reports each frame's start   */
+        /***********************************************************************/
+        {
+            WORD32 numbytes;
+
+            if(0 == s_app_ctx.u4_piclen_flag)
+            {
+                fseek(ps_ip_file, file_pos, SEEK_SET);
+                numbytes = u4_ip_buf_len;
+            }
+            else
+            {
+                WORD32 entries;
+                entries = fscanf(ps_piclen_file, "%d\n", &numbytes);
+                if(1 != entries)
+                    numbytes = u4_ip_buf_len;
+            }
+
+            u4_bytes_remaining = fread(pu1_bs_buf, sizeof(UWORD8),
+                                       numbytes, ps_ip_file);
+
+            if(u4_bytes_remaining == 0)
+            {
+                if(1 == s_app_ctx.loopback)
+                {
+                    file_pos = 0;
+                    if(0 == s_app_ctx.u4_piclen_flag)
+                    {
+                        fseek(ps_ip_file, file_pos, SEEK_SET);
+                        numbytes = u4_ip_buf_len;
+                    }
+                    else
+                    {
+                        WORD32 entries;
+                        entries = fscanf(ps_piclen_file, "%d\n", &numbytes);
+                        if(1 != entries)
+                            numbytes = u4_ip_buf_len;
+                    }
+
+
+                    u4_bytes_remaining = fread(pu1_bs_buf, sizeof(UWORD8),
+                                               numbytes, ps_ip_file);
+                }
+                else
+                    break;
+            }
+        }
+
+        /*********************************************************************/
+        /* The following calls can be enabled at different times             */
+        /*********************************************************************/
+#if ENABLE_DEGRADE
+        if(u4_op_frm_ts >= 10000)
+            disable_deblocking(codec_obj, 4);
+
+        if(u4_op_frm_ts == 30000)
+            enable_deblocking(codec_obj);
+
+        if(u4_op_frm_ts == 10000)
+            enable_skippb_frames(codec_obj);
+
+        if(u4_op_frm_ts == 60000)
+            disable_skippb_frames(codec_obj);
+
+        if(u4_op_frm_ts == 30000)
+            enable_skipb_frames(codec_obj);
+
+        if(u4_op_frm_ts == 60000)
+            disable_skipb_frames(codec_obj);
+#endif
+
+
+        {
+            ivd_video_decode_ip_t s_video_decode_ip;
+            ivd_video_decode_op_t s_video_decode_op;
+#ifdef PROFILE_ENABLE
+            UWORD32 s_elapsed_time;
+            TIMER s_start_timer;
+            TIMER s_end_timer;
+#endif
+
+
+            s_video_decode_ip.e_cmd = IVD_CMD_VIDEO_DECODE;
+            s_video_decode_ip.u4_ts = u4_ip_frm_ts;
+            s_video_decode_ip.pv_stream_buffer = pu1_bs_buf;
+            s_video_decode_ip.u4_num_Bytes = u4_bytes_remaining;
+            s_video_decode_ip.u4_size = sizeof(ivd_video_decode_ip_t);
+            s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[0] =
+                            ps_out_buf->u4_min_out_buf_size[0];
+            s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[1] =
+                            ps_out_buf->u4_min_out_buf_size[1];
+            s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[2] =
+                            ps_out_buf->u4_min_out_buf_size[2];
+
+            s_video_decode_ip.s_out_buffer.pu1_bufs[0] =
+                            ps_out_buf->pu1_bufs[0];
+            s_video_decode_ip.s_out_buffer.pu1_bufs[1] =
+                            ps_out_buf->pu1_bufs[1];
+            s_video_decode_ip.s_out_buffer.pu1_bufs[2] =
+                            ps_out_buf->pu1_bufs[2];
+            s_video_decode_ip.s_out_buffer.u4_num_bufs =
+                            ps_out_buf->u4_num_bufs;
+            s_video_decode_op.u4_size = sizeof(ivd_video_decode_op_t);
+
+            /* Get display buffer pointers */
+            if(1 == s_app_ctx.display)
+            {
+                WORD32 wr_idx;
+
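+                /* dispq_producer_dequeue presumably blocks until a display */
+                /* slot is free; quit is checked in case the display thread */
+                /* exited while this thread was waiting.                    */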
+                wr_idx = dispq_producer_dequeue(&s_app_ctx);
+
+                if(s_app_ctx.quit)
+                    break;
+
+                s_app_ctx.set_disp_buffers(s_app_ctx.pv_disp_ctx, wr_idx,
+                                           &s_video_decode_ip.s_out_buffer.pu1_bufs[0],
+                                           &s_video_decode_ip.s_out_buffer.pu1_bufs[1],
+                                           &s_video_decode_ip.s_out_buffer.pu1_bufs[2]);
+            }
+
+            /*****************************************************************************/
+            /*   API Call: Video Decode                                                  */
+            /*****************************************************************************/
+
+            GETTIME(&s_start_timer);
+
+            ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_video_decode_ip,
+                                       (void *)&s_video_decode_op);
+
+
+            GETTIME(&s_end_timer);
+            ELAPSEDTIME(s_start_timer, s_end_timer, s_elapsed_time, frequency);
+#ifdef PROFILE_ENABLE
+            {
+                UWORD32 peak_avg, id;
+                u4_tot_cycles += s_elapsed_time;
+                peak_window[peak_window_idx++] = s_elapsed_time;
+                if(peak_window_idx == PEAK_WINDOW_SIZE)
+                    peak_window_idx = 0;
+                peak_avg = 0;
+                for(id = 0; id < PEAK_WINDOW_SIZE; id++)
+                {
+                    peak_avg += peak_window[id];
+                }
+                peak_avg /= PEAK_WINDOW_SIZE;
+                if(peak_avg > peak_avg_max)
+                    peak_avg_max = peak_avg;
+                frm_cnt++;
+
+                printf("FrameNum: %4d TimeTaken(microsec): %6d AvgTime: %6d PeakAvgTimeMax: %6d Output: %2d NumBytes: %6d \n",
+                       frm_cnt, s_elapsed_time, u4_tot_cycles / frm_cnt, peak_avg_max, s_video_decode_op.u4_output_present, s_video_decode_op.u4_num_bytes_consumed);
+
+            }
+#ifdef INTEL_CE5300
+            time_consumed += s_elapsed_time;
+            bytes_consumed += s_video_decode_op.u4_num_bytes_consumed;
+            if(!(frm_cnt % (s_app_ctx.fps)))
+            {
+                time_consumed = time_consumed / s_app_ctx.fps;
+                printf("Average decode time(micro sec) for the last second = %6d\n", time_consumed);
+                printf("Average bitrate(kb) for the last second = %6d\n", (bytes_consumed * 8) / 1024);
+                time_consumed = 0;
+                bytes_consumed = 0;
+
+            }
+#endif
+#else
+            printf("%d\n", s_video_decode_op.u4_num_bytes_consumed);
+#endif
+
+            if(ret != IV_SUCCESS)
+            {
+                printf("Error in video Frame decode : ret %x Error %x\n", ret,
+                       s_video_decode_op.u4_error_code);
+            }
+
+            if((IV_SUCCESS != ret) &&
+                            ((s_video_decode_op.u4_error_code & 0xFF) == IVD_RES_CHANGED))
+            {
+                ivd_ctl_reset_ip_t s_ctl_ip;
+                ivd_ctl_reset_op_t s_ctl_op;
+
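+                /* Resolution change: flush out frames decoded at the old   */
+                /* resolution, reset the codec, and then repeat the         */
+                /* num-cores and processor control calls from setup.        */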
+                flush_output(codec_obj, &s_app_ctx, ps_out_buf,
+                             pu1_bs_buf, &u4_op_frm_ts,
+                             ps_op_file, ps_op_chksum_file,
+                             u4_ip_frm_ts, u4_bytes_remaining);
+
+                s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+                s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_RESET;
+                s_ctl_ip.u4_size = sizeof(ivd_ctl_reset_ip_t);
+                s_ctl_op.u4_size = sizeof(ivd_ctl_reset_op_t);
+
+                ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+                                           (void *)&s_ctl_op);
+                if(IV_SUCCESS != ret)
+                {
+                    sprintf(ac_error_str, "Error in Reset");
+                    codec_exit(ac_error_str);
+                }
+                /*************************************************************************/
+                /* set num of cores                                                      */
+                /*************************************************************************/
+                {
+
+                    ihevcd_cxa_ctl_set_num_cores_ip_t s_ctl_set_cores_ip;
+                    ihevcd_cxa_ctl_set_num_cores_op_t s_ctl_set_cores_op;
+
+                    s_ctl_set_cores_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+                    s_ctl_set_cores_ip.e_sub_cmd = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_SET_NUM_CORES;
+                    s_ctl_set_cores_ip.u4_num_cores = s_app_ctx.u4_num_cores;
+                    s_ctl_set_cores_ip.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_ip_t);
+                    s_ctl_set_cores_op.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_op_t);
+
+                    ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_set_cores_ip,
+                                               (void *)&s_ctl_set_cores_op);
+                    if(ret != IV_SUCCESS)
+                    {
+                        sprintf(ac_error_str, "\nError in setting number of cores");
+                        codec_exit(ac_error_str);
+                    }
+
+                }
+                /*************************************************************************/
+                /* set processor                                                         */
+                /*************************************************************************/
+                {
+
+                    ihevcd_cxa_ctl_set_processor_ip_t s_ctl_set_num_processor_ip;
+                    ihevcd_cxa_ctl_set_processor_op_t s_ctl_set_num_processor_op;
+
+                    s_ctl_set_num_processor_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+                    s_ctl_set_num_processor_ip.e_sub_cmd = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_SET_PROCESSOR;
+                    s_ctl_set_num_processor_ip.u4_arch = s_app_ctx.e_arch;
+                    s_ctl_set_num_processor_ip.u4_soc = s_app_ctx.e_soc;
+                    s_ctl_set_num_processor_ip.u4_size = sizeof(ihevcd_cxa_ctl_set_processor_ip_t);
+                    s_ctl_set_num_processor_op.u4_size = sizeof(ihevcd_cxa_ctl_set_processor_op_t);
+
+                    ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_set_num_processor_ip,
+                                               (void *)&s_ctl_set_num_processor_op);
+                    if(ret != IV_SUCCESS)
+                    {
+                        sprintf(ac_error_str, "\nError in setting Processor type");
+                        codec_exit(ac_error_str);
+                    }
+
+                }
+            }
+
+
+            if((1 == s_app_ctx.display) &&
+                            (1 == s_video_decode_op.u4_output_present))
+            {
+                dispq_producer_queue(&s_app_ctx);
+            }
+
+            if(IV_B_FRAME == s_video_decode_op.e_pic_type)
+                s_app_ctx.b_pic_present |= 1;
+
+            u4_num_bytes_dec = s_video_decode_op.u4_num_bytes_consumed;
+
+            file_pos += u4_num_bytes_dec;
+            total_bytes_comsumed += u4_num_bytes_dec;
+            u4_ip_frm_ts++;
+
+
+            if(1 == s_video_decode_op.u4_output_present)
+            {
+                width = s_video_decode_op.s_disp_frm_buf.u4_y_wd;
+                height = s_video_decode_op.s_disp_frm_buf.u4_y_ht;
+                dump_output(&s_app_ctx, &(s_video_decode_op.s_disp_frm_buf),
+                            s_video_decode_op.u4_disp_buf_id, ps_op_file,
+                            ps_op_chksum_file,
+                            u4_op_frm_ts, s_app_ctx.u4_file_save_flag,
+                            s_app_ctx.u4_chksum_save_flag);
+
+                u4_op_frm_ts++;
+            }
+            else
+            {
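+                /* The IVD_FATALERROR bit marks an unrecoverable error;     */
+                /* stop the decode loop instead of feeding more input.      */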
+                if((s_video_decode_op.u4_error_code >> IVD_FATALERROR) & 1)
+                {
+                    printf("Fatal error\n");
+                    break;
+                }
+            }
+
+        }
+    }
+
+    /***********************************************************************/
+    /*   To get the last decoded frames, call decode with no fresh input    */
+    /***********************************************************************/
+    flush_output(codec_obj, &s_app_ctx, ps_out_buf,
+                 pu1_bs_buf, &u4_op_frm_ts,
+                 ps_op_file, ps_op_chksum_file,
+                 u4_ip_frm_ts, u4_bytes_remaining);
+
+    /* set disp_end flag */
+    s_app_ctx.quit = 1;
+
+
+#ifdef PROFILE_ENABLE
+    printf("Summary\n");
+    printf("Input filename                  : %s\n", s_app_ctx.ac_ip_fname);
+    printf("Output Width                    : %-4d\n", width);
+    printf("Output Height                   : %-4d\n", height);
+
+    if(frm_cnt)
+    {
+        double avg = u4_tot_cycles / frm_cnt;
+        double bytes_avg = total_bytes_comsumed / frm_cnt;
+        double bitrate = (bytes_avg * 8 * s_app_ctx.fps) / 1000000;
+        printf("Bitrate @ %2d fps(mbps)          : %-6.2f\n", s_app_ctx.fps, bitrate);
+        printf("Average decode time(micro sec)  : %-6d\n", (WORD32)avg);
+        printf("Avg Peak decode time(%2d frames) : %-6d\n", PEAK_WINDOW_SIZE, (WORD32)peak_avg_max);
+        avg = (u4_tot_cycles + u4_tot_fmt_cycles) * 1.0 / frm_cnt;
+
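+        /* FPS = 1e6 / average microseconds per frame; e.g. an average of  */
+        /* 8000 us per frame gives 1000000 / 8000 = 125 fps.               */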
+        if(0 == s_app_ctx.share_disp_buf)
+            printf("FPS achieved (with format conv) : %-3.2f\n", 1000000 / avg);
+        else
+            printf("FPS achieved                    : %-3.2f\n", 1000000 / avg);
+    }
+#endif
+    /***********************************************************************/
+    /*   Clear the decoder, close all the files, free all the memory       */
+    /***********************************************************************/
+    if(1 == s_app_ctx.display)
+    {
+        s_app_ctx.display_deinit_flag = 1;
+        /* wait for display to finish */
+        if(s_app_ctx.display_thread_created)
+        {
+            ithread_join(s_app_ctx.display_thread_handle, NULL);
+        }
+        free(s_app_ctx.display_thread_handle);
+    }
+
+    {
+        iv_retrieve_mem_rec_ip_t s_retrieve_dec_ip;
+        iv_retrieve_mem_rec_op_t s_retrieve_dec_op;
+        s_retrieve_dec_ip.pv_mem_rec_location = (iv_mem_rec_t *)pv_mem_rec_location;
+
+        s_retrieve_dec_ip.e_cmd = IV_CMD_RETRIEVE_MEMREC;
+        s_retrieve_dec_ip.u4_size = sizeof(iv_retrieve_mem_rec_ip_t);
+        s_retrieve_dec_op.u4_size = sizeof(iv_retrieve_mem_rec_op_t);
+
+        ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_retrieve_dec_ip,
+                                   (void *)&s_retrieve_dec_op);
+
+        if(IV_SUCCESS != ret)
+        {
+            sprintf(ac_error_str, "Error in Retrieve Memrec");
+            codec_exit(ac_error_str);
+        }
+
+        {
+            iv_mem_rec_t *ps_mem_rec;
+            UWORD16 u2_i;
+
+            u4_num_mem_recs = s_retrieve_dec_op.u4_num_mem_rec_filled;
+
+            ps_mem_rec = s_retrieve_dec_ip.pv_mem_rec_location;
+
+            for(u2_i = 0; u2_i < u4_num_mem_recs; u2_i++)
+            {
+                ihevca_aligned_free(ps_mem_rec->pv_base);
+                ps_mem_rec++;
+            }
+            free(s_retrieve_dec_ip.pv_mem_rec_location);
+        }
+
+    }
+    /***********************************************************************/
+    /*              Close all the files and free all the memory            */
+    /***********************************************************************/
+    {
+        fclose(ps_ip_file);
+
+        if(1 == s_app_ctx.u4_file_save_flag)
+        {
+            fclose(ps_op_file);
+        }
+        if(1 == s_app_ctx.u4_chksum_save_flag)
+        {
+            fclose(ps_op_chksum_file);
+        }
+
+    }
+
+    if(0 == s_app_ctx.share_disp_buf)
+    {
+        free(ps_out_buf->pu1_bufs[0]);
+    }
+
+    for(i = 0; i < s_app_ctx.num_disp_buf; i++)
+    {
+        free(s_app_ctx.s_disp_buffers[i].pu1_bufs[0]);
+    }
+
+    free(ps_out_buf);
+    free(pu1_bs_buf);
+
+    return (0);
+}
diff --git a/test/decoder/test.cfg b/test/decoder/test.cfg
new file mode 100644
index 0000000..036261a
--- /dev/null
+++ b/test/decoder/test.cfg
@@ -0,0 +1,32 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+--input \\is0189\ARM\work\reference\HEVC\HM-10.0\bin\crew_720p_2mbps.265
+--save_output 0
+--num_frames -1
+--output E:\hevc_decoder\out.yuv
+--chroma_format YUV_420P
+--share_display_buf 0
+--max_wd 1920
+--max_ht 1080
+--max_level 41
+--num_cores 1
+--loopback 1
+--display 1
+--fps 30
+--arch X86_GENERIC
+--soc GENERIC